;;; (planet ingest) --- injest feeds
(declare (module (planet ingest))
(import scheme (chicken base)
(chicken io)
(chicken port)
(chicken time posix)
(planet feed)
(planet util)
http-client
uri-common
rss
rfc3339
sxpath
(prefix atom atom:)))
;;; Public API
(define-public (download-and-ingest-feed url)
(ingest-feed (download-feed url) url))
(define-public (ingest-feed fd #!optional url)
(cond
((rss->feed fd) => (cut ingest/rss <> url))
((atom->feed fd) => (cut ingest/atom <> url))
(else (error "Unknown feed type" fd))))
(define-public (download-feed url)
(with-input-from-request url #f read-string))
;;; RSS
(define (rss->feed x)
(or (and (rss:feed? x) x)
(false-on-error
(lambda () (with-input-from-string x rss:read)))))
(define (ingest/rss fd #!optional url)
(let ((chan (rss:feed-channel fd))
(items (map item/rss->item (rss:feed-items fd))))
(make-feed url ; url object -- xxx; brittle
(rss:item-title chan)
(rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate))
(rss:item-attributes chan)
items)))
(define (item/rss->item it)
(assert (rss:item? it) "Not an rss:item" it)
(let ((attachments (map enclosure->attachment rss:item-enclosures)))
(make-item (rss:item-link it)
(rss:item-title it)
(rfc822->rfc3339 (rss:item-attribute it 'pubDate))
(rss:item-attributes it)
(rss:item-description it)
rss:item-enclosures
attachments)))
(define (enclosure->attachment enc)
(make-attachment rss:enclosure-url
rss:enclosure-length
rss:enclosure-type))
(define (rfc822->rfc3339 tm)
(let loop ((fmts
;; these don't capture the timezone -- afaik, a limitation of
;; strptime
'("%a, %d %b %y %H:%M:%S"
"%d %b %y %H:%M:%S"
"%a, %e %b %y %H:%M:%S"
"%e %b %y %H:%M:%S"
"%a, %d %b %y %H:%M"
"%d %b %y %H:%M"
"%a, %e %b %y %H:%M"
"%e %b %y %H:%M"
"%a, %d %b %Y %H:%M:%S"
"%d %b %Y %H:%M:%S"
"%a, %e %b %Y %H:%M:%S"
"%e %b %Y %H:%M:%S"
"%a, %d %b %Y %H:%M"
"%d %b %Y %H:%M"
"%a, %e %b %Y %H:%M"
"%e %b %Y %H:%M")))
(cond
((null? fmts) tm) ; fallback
((string->time tm (car fmts))
=> (lambda (tm) (time->rfc3339 tm)))
(else (loop (cdr fmts))))))
;;; Atom
(define (atom->feed fd)
(or (and (atom:feed? fd) fd)
(false-on-error
(lambda () (call-with-input-string fd atom:read-atom-feed)))))
(define (ingest/atom fd #!optional url)
(let ((items (map item/atom->item (atom:feed-entries fd))))
(make-feed url
(atom:feed-title fd)
(string->rfc3339 (atom:feed-updated fd))
(map (lambda (pair) (cons (car pair) ((cdr pair) fd)))
;; have to build meta
`((authors . ,atom:feed-authors)
(categories . ,atom:feed-categories)
(contributors . ,atom:feed-contributors)
(generator . ,atom:feed-generator)
(icon . ,atom:feed-icon)
(id . ,atom:feed-id)
(links . ,atom:feed-links)
(logo . ,atom:feed-logo)
(rights . ,atom:feed-rights)
(subtitle . ,atom:feed-subtitle)))
items)))
(define (item/atom->item it)
(assert (atom:entry? it) "Not an Atom entry" it)
(make-item (item-link/atom it)
(atom:entry-title it)
(atom:entry-updated it)
(map (lambda (pair) (cons (car pair) ((cdr pair) it)))
`((authors . ,atom:entry-authors)
(categories . ,atom:entry-categories)
(content . ,atom:entry-content)
(contributors . ,atom:entry-contributors)
(id . ,atom:entry-id)
(links . ,atom:entry-links)
(published . ,atom:entry-published)
(rights . ,atom:entry-rights)
(source . ,atom:entry-source)
(summary . ,atom:entry-summary)))
(item-content/atom it)
#f ; afaik atom doesn't do enclosures
))
(define (item-link/atom it)
;; Find the of IT, or return the first
(let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate")))))
it)))
(if (null? alt)
(car (atom:entry-links it))
(car alt))))
(define (item-content/atom it)
;; Return entry-content or entry-summary if content is external or binary
(let ((c (atom:entry-content it)))
(case (atom:content-kind c)
((text html textual) (atom:content-text c))
((xhtml) (atom:content-xhtml c))
((xml) (atom:content-xml c))
((binary external)
(let ((sum (atom:entry-summary it)))
(case (atom:summary-type sum)
((xhtml) (atom:summary-xhtml sum))
(else (atom:summary-text sum)))))
(else (error "Bad content type for item" it)))))