From a5b4863d9702a13e96665d118f9b974bb59ae7d6 Mon Sep 17 00:00:00 2001
From: Case Duckworth
Date: Thu, 2 May 2024 22:55:16 -0500
Subject: First commit
---
ingest.scm | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 154 insertions(+)
create mode 100644 ingest.scm
(limited to 'ingest.scm')
diff --git a/ingest.scm b/ingest.scm
new file mode 100644
index 0000000..3b65c75
--- /dev/null
+++ b/ingest.scm
@@ -0,0 +1,154 @@
+;;; (planet ingest) --- injest feeds
+
+(declare (module (planet ingest))
+ (import scheme (chicken base)
+ (chicken io)
+ (chicken port)
+ (chicken time posix)
+ (planet feed)
+ (planet util)
+ http-client
+ uri-common
+ rss
+ rfc3339
+ sxpath
+ (prefix atom atom:)))
+
+;;; Public API
+
+(define-public (download-and-ingest-feed url)
+ (ingest-feed (download-feed url) url))
+
+(define-public (ingest-feed fd #!optional url)
+ (cond
+ ((rss->feed fd) => (cut ingest/rss <> url))
+ ((atom->feed fd) => (cut ingest/atom <> url))
+ (else (error "Unknown feed type" fd))))
+
+(define-public (download-feed url)
+ (with-input-from-request url #f read-string))
+
+;;; RSS
+(define (rss->feed x)
+ (or (and (rss:feed? x) x)
+ (false-on-error
+ (lambda () (with-input-from-string x rss:read)))))
+
+(define (ingest/rss fd #!optional url)
+ (let ((chan (rss:feed-channel fd))
+ (items (map item/rss->item (rss:feed-items fd))))
+ (make-feed url ; url object -- xxx; brittle
+ (rss:item-title chan)
+ (rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate))
+ (rss:item-attributes chan)
+ items)))
+
+(define (item/rss->item it)
+ (assert (rss:item? it) "Not an rss:item" it)
+ (let ((attachments (map enclosure->attachment rss:item-enclosures)))
+ (make-item (rss:item-link it)
+ (rss:item-title it)
+ (rfc822->rfc3339 (rss:item-attribute it 'pubDate))
+ (rss:item-attributes it)
+ (rss:item-description it)
+ rss:item-enclosures
+ attachments)))
+
+(define (enclosure->attachment enc)
+ (make-attachment rss:enclosure-url
+ rss:enclosure-length
+ rss:enclosure-type))
+
+(define (rfc822->rfc3339 tm)
+ (let loop ((fmts
+ ;; these don't capture the timezone -- afaik, a limitation of
+ ;; strptime
+ '("%a, %d %b %y %H:%M:%S"
+ "%d %b %y %H:%M:%S"
+ "%a, %e %b %y %H:%M:%S"
+ "%e %b %y %H:%M:%S"
+ "%a, %d %b %y %H:%M"
+ "%d %b %y %H:%M"
+ "%a, %e %b %y %H:%M"
+ "%e %b %y %H:%M"
+ "%a, %d %b %Y %H:%M:%S"
+ "%d %b %Y %H:%M:%S"
+ "%a, %e %b %Y %H:%M:%S"
+ "%e %b %Y %H:%M:%S"
+ "%a, %d %b %Y %H:%M"
+ "%d %b %Y %H:%M"
+ "%a, %e %b %Y %H:%M"
+ "%e %b %Y %H:%M")))
+ (cond
+ ((null? fmts) tm) ; fallback
+ ((string->time tm (car fmts))
+ => (lambda (tm) (time->rfc3339 tm)))
+ (else (loop (cdr fmts))))))
+
+;;; Atom
+
+(define (atom->feed fd)
+ (or (and (atom:feed? fd) fd)
+ (false-on-error
+ (lambda () (call-with-input-string fd atom:read-atom-feed)))))
+
+(define (ingest/atom fd #!optional url)
+ (let ((items (map item/atom->item (atom:feed-entries fd))))
+ (make-feed url
+ (atom:feed-title fd)
+ (string->rfc3339 (atom:feed-updated fd))
+ (map (lambda (pair) (cons (car pair) ((cdr pair) fd)))
+ ;; have to build meta
+ `((authors . ,atom:feed-authors)
+ (categories . ,atom:feed-categories)
+ (contributors . ,atom:feed-contributors)
+ (generator . ,atom:feed-generator)
+ (icon . ,atom:feed-icon)
+ (id . ,atom:feed-id)
+ (links . ,atom:feed-links)
+ (logo . ,atom:feed-logo)
+ (rights . ,atom:feed-rights)
+ (subtitle . ,atom:feed-subtitle)))
+ items)))
+
+(define (item/atom->item it)
+ (assert (atom:entry? it) "Not an Atom entry" it)
+ (make-item (item-link/atom it)
+ (atom:entry-title it)
+ (atom:entry-updated it)
+ (map (lambda (pair) (cons (car pair) ((cdr pair) it)))
+ `((authors . ,atom:entry-authors)
+ (categories . ,atom:entry-categories)
+ (content . ,atom:entry-content)
+ (contributors . ,atom:entry-contributors)
+ (id . ,atom:entry-id)
+ (links . ,atom:entry-links)
+ (published . ,atom:entry-published)
+ (rights . ,atom:entry-rights)
+ (source . ,atom:entry-source)
+ (summary . ,atom:entry-summary)))
+ (item-content/atom it)
+ #f ; afaik atom doesn't do enclosures
+ ))
+
+(define (item-link/atom it)
+ ;; Find the of IT, or return the first
+ (let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate")))))
+ it)))
+ (if (null? alt)
+ (car (atom:entry-links it))
+ (car alt))))
+
+(define (item-content/atom it)
+ ;; Return entry-content or entry-summary if content is external or binary
+ (let ((c (atom:entry-content it)))
+ (case (atom:content-kind c)
+ ((text html textual) (atom:content-text c))
+ ((xhtml) (atom:content-xhtml c))
+ ((xml) (atom:content-xml c))
+ ((binary external)
+ (let ((sum (atom:entry-summary it)))
+ (case (atom:summary-type sum)
+ ((xhtml) (atom:summary-xhtml sum))
+ (else (atom:summary-text sum)))))
+ (else (error "Bad content type for item" it)))))
--
cgit 1.4.1-21-gabe81