From a5b4863d9702a13e96665d118f9b974bb59ae7d6 Mon Sep 17 00:00:00 2001 From: Case Duckworth Date: Thu, 2 May 2024 22:55:16 -0500 Subject: First commit --- ingest.scm | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 ingest.scm (limited to 'ingest.scm') diff --git a/ingest.scm b/ingest.scm new file mode 100644 index 0000000..3b65c75 --- /dev/null +++ b/ingest.scm @@ -0,0 +1,154 @@ +;;; (planet ingest) --- injest feeds + +(declare (module (planet ingest)) + (import scheme (chicken base) + (chicken io) + (chicken port) + (chicken time posix) + (planet feed) + (planet util) + http-client + uri-common + rss + rfc3339 + sxpath + (prefix atom atom:))) + +;;; Public API + +(define-public (download-and-ingest-feed url) + (ingest-feed (download-feed url) url)) + +(define-public (ingest-feed fd #!optional url) + (cond + ((rss->feed fd) => (cut ingest/rss <> url)) + ((atom->feed fd) => (cut ingest/atom <> url)) + (else (error "Unknown feed type" fd)))) + +(define-public (download-feed url) + (with-input-from-request url #f read-string)) + +;;; RSS +(define (rss->feed x) + (or (and (rss:feed? x) x) + (false-on-error + (lambda () (with-input-from-string x rss:read))))) + +(define (ingest/rss fd #!optional url) + (let ((chan (rss:feed-channel fd)) + (items (map item/rss->item (rss:feed-items fd)))) + (make-feed url ; url object -- xxx; brittle + (rss:item-title chan) + (rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate)) + (rss:item-attributes chan) + items))) + +(define (item/rss->item it) + (assert (rss:item? it) "Not an rss:item" it) + (let ((attachments (map enclosure->attachment rss:item-enclosures))) + (make-item (rss:item-link it) + (rss:item-title it) + (rfc822->rfc3339 (rss:item-attribute it 'pubDate)) + (rss:item-attributes it) + (rss:item-description it) + rss:item-enclosures + attachments))) + +(define (enclosure->attachment enc) + (make-attachment rss:enclosure-url + rss:enclosure-length + rss:enclosure-type)) + +(define (rfc822->rfc3339 tm) + (let loop ((fmts + ;; these don't capture the timezone -- afaik, a limitation of + ;; strptime + '("%a, %d %b %y %H:%M:%S" + "%d %b %y %H:%M:%S" + "%a, %e %b %y %H:%M:%S" + "%e %b %y %H:%M:%S" + "%a, %d %b %y %H:%M" + "%d %b %y %H:%M" + "%a, %e %b %y %H:%M" + "%e %b %y %H:%M" + "%a, %d %b %Y %H:%M:%S" + "%d %b %Y %H:%M:%S" + "%a, %e %b %Y %H:%M:%S" + "%e %b %Y %H:%M:%S" + "%a, %d %b %Y %H:%M" + "%d %b %Y %H:%M" + "%a, %e %b %Y %H:%M" + "%e %b %Y %H:%M"))) + (cond + ((null? fmts) tm) ; fallback + ((string->time tm (car fmts)) + => (lambda (tm) (time->rfc3339 tm))) + (else (loop (cdr fmts)))))) + +;;; Atom + +(define (atom->feed fd) + (or (and (atom:feed? fd) fd) + (false-on-error + (lambda () (call-with-input-string fd atom:read-atom-feed))))) + +(define (ingest/atom fd #!optional url) + (let ((items (map item/atom->item (atom:feed-entries fd)))) + (make-feed url + (atom:feed-title fd) + (string->rfc3339 (atom:feed-updated fd)) + (map (lambda (pair) (cons (car pair) ((cdr pair) fd))) + ;; have to build meta + `((authors . ,atom:feed-authors) + (categories . ,atom:feed-categories) + (contributors . ,atom:feed-contributors) + (generator . ,atom:feed-generator) + (icon . ,atom:feed-icon) + (id . ,atom:feed-id) + (links . ,atom:feed-links) + (logo . ,atom:feed-logo) + (rights . ,atom:feed-rights) + (subtitle . ,atom:feed-subtitle))) + items))) + +(define (item/atom->item it) + (assert (atom:entry? it) "Not an Atom entry" it) + (make-item (item-link/atom it) + (atom:entry-title it) + (atom:entry-updated it) + (map (lambda (pair) (cons (car pair) ((cdr pair) it))) + `((authors . ,atom:entry-authors) + (categories . ,atom:entry-categories) + (content . ,atom:entry-content) + (contributors . ,atom:entry-contributors) + (id . ,atom:entry-id) + (links . ,atom:entry-links) + (published . ,atom:entry-published) + (rights . ,atom:entry-rights) + (source . ,atom:entry-source) + (summary . ,atom:entry-summary))) + (item-content/atom it) + #f ; afaik atom doesn't do enclosures + )) + +(define (item-link/atom it) + ;; Find the of IT, or return the first + (let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate"))))) + it))) + (if (null? alt) + (car (atom:entry-links it)) + (car alt)))) + +(define (item-content/atom it) + ;; Return entry-content or entry-summary if content is external or binary + (let ((c (atom:entry-content it))) + (case (atom:content-kind c) + ((text html textual) (atom:content-text c)) + ((xhtml) (atom:content-xhtml c)) + ((xml) (atom:content-xml c)) + ((binary external) + (let ((sum (atom:entry-summary it))) + (case (atom:summary-type sum) + ((xhtml) (atom:summary-xhtml sum)) + (else (atom:summary-text sum))))) + (else (error "Bad content type for item" it))))) -- cgit 1.4.1-21-gabe81