;;; (planet ingest) --- injest feeds (declare (module (planet ingest)) (import scheme (chicken base) (chicken io) (chicken port) (chicken time posix) (planet feed) (planet util) http-client uri-common rss rfc3339 sxpath (prefix atom atom:))) ;;; Public API (define-public (download-and-ingest-feed url) (ingest-feed (download-feed url) url)) (define-public (ingest-feed fd #!optional url) (cond ((rss->feed fd) => (cut ingest/rss <> url)) ((atom->feed fd) => (cut ingest/atom <> url)) (else (error "Unknown feed type" fd)))) (define-public (download-feed url) (with-input-from-request url #f read-string)) ;;; RSS (define (rss->feed x) (or (and (rss:feed? x) x) (false-on-error (lambda () (with-input-from-string x rss:read))))) (define (ingest/rss fd #!optional url) (let ((chan (rss:feed-channel fd)) (items (map item/rss->item (rss:feed-items fd)))) (make-feed url ; url object -- xxx; brittle (rss:item-title chan) (rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate)) (rss:item-attributes chan) items))) (define (item/rss->item it) (assert (rss:item? it) "Not an rss:item" it) (let ((attachments (map enclosure->attachment rss:item-enclosures))) (make-item (rss:item-link it) (rss:item-title it) (rfc822->rfc3339 (rss:item-attribute it 'pubDate)) (rss:item-attributes it) (rss:item-description it) rss:item-enclosures attachments))) (define (enclosure->attachment enc) (make-attachment rss:enclosure-url rss:enclosure-length rss:enclosure-type)) (define (rfc822->rfc3339 tm) (let loop ((fmts ;; these don't capture the timezone -- afaik, a limitation of ;; strptime '("%a, %d %b %y %H:%M:%S" "%d %b %y %H:%M:%S" "%a, %e %b %y %H:%M:%S" "%e %b %y %H:%M:%S" "%a, %d %b %y %H:%M" "%d %b %y %H:%M" "%a, %e %b %y %H:%M" "%e %b %y %H:%M" "%a, %d %b %Y %H:%M:%S" "%d %b %Y %H:%M:%S" "%a, %e %b %Y %H:%M:%S" "%e %b %Y %H:%M:%S" "%a, %d %b %Y %H:%M" "%d %b %Y %H:%M" "%a, %e %b %Y %H:%M" "%e %b %Y %H:%M"))) (cond ((null? fmts) tm) ; fallback ((string->time tm (car fmts)) => (lambda (tm) (time->rfc3339 tm))) (else (loop (cdr fmts)))))) ;;; Atom (define (atom->feed fd) (or (and (atom:feed? fd) fd) (false-on-error (lambda () (call-with-input-string fd atom:read-atom-feed))))) (define (ingest/atom fd #!optional url) (let ((items (map item/atom->item (atom:feed-entries fd)))) (make-feed url (atom:feed-title fd) (string->rfc3339 (atom:feed-updated fd)) (map (lambda (pair) (cons (car pair) ((cdr pair) fd))) ;; have to build meta `((authors . ,atom:feed-authors) (categories . ,atom:feed-categories) (contributors . ,atom:feed-contributors) (generator . ,atom:feed-generator) (icon . ,atom:feed-icon) (id . ,atom:feed-id) (links . ,atom:feed-links) (logo . ,atom:feed-logo) (rights . ,atom:feed-rights) (subtitle . ,atom:feed-subtitle))) items))) (define (item/atom->item it) (assert (atom:entry? it) "Not an Atom entry" it) (make-item (item-link/atom it) (atom:entry-title it) (atom:entry-updated it) (map (lambda (pair) (cons (car pair) ((cdr pair) it))) `((authors . ,atom:entry-authors) (categories . ,atom:entry-categories) (content . ,atom:entry-content) (contributors . ,atom:entry-contributors) (id . ,atom:entry-id) (links . ,atom:entry-links) (published . ,atom:entry-published) (rights . ,atom:entry-rights) (source . ,atom:entry-source) (summary . ,atom:entry-summary))) (item-content/atom it) #f ; afaik atom doesn't do enclosures )) (define (item-link/atom it) ;; Find the of IT, or return the first (let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate"))))) it))) (if (null? alt) (car (atom:entry-links it)) (car alt)))) (define (item-content/atom it) ;; Return entry-content or entry-summary if content is external or binary (let ((c (atom:entry-content it))) (case (atom:content-kind c) ((text html textual) (atom:content-text c)) ((xhtml) (atom:content-xhtml c)) ((xml) (atom:content-xml c)) ((binary external) (let ((sum (atom:entry-summary it))) (case (atom:summary-type sum) ((xhtml) (atom:summary-xhtml sum)) (else (atom:summary-text sum))))) (else (error "Bad content type for item" it)))))