summary refs log tree commit diff stats
path: root/ingest.scm
diff options
context:
space:
mode:
authorCase Duckworth2024-05-02 22:55:16 -0500
committerCase Duckworth2024-05-02 22:55:16 -0500
commita5b4863d9702a13e96665d118f9b974bb59ae7d6 (patch)
tree874c396bd745e3569c6d8c49e54a95d213eac238 /ingest.scm
downloadplanet-a5b4863d9702a13e96665d118f9b974bb59ae7d6.tar.gz
planet-a5b4863d9702a13e96665d118f9b974bb59ae7d6.zip
First commit main
Diffstat (limited to 'ingest.scm')
-rw-r--r--ingest.scm154
1 files changed, 154 insertions, 0 deletions
diff --git a/ingest.scm b/ingest.scm new file mode 100644 index 0000000..3b65c75 --- /dev/null +++ b/ingest.scm
@@ -0,0 +1,154 @@
1;;; (planet ingest) --- injest feeds
2
3(declare (module (planet ingest))
4 (import scheme (chicken base)
5 (chicken io)
6 (chicken port)
7 (chicken time posix)
8 (planet feed)
9 (planet util)
10 http-client
11 uri-common
12 rss
13 rfc3339
14 sxpath
15 (prefix atom atom:)))
16
17;;; Public API
18
19(define-public (download-and-ingest-feed url)
20 (ingest-feed (download-feed url) url))
21
22(define-public (ingest-feed fd #!optional url)
23 (cond
24 ((rss->feed fd) => (cut ingest/rss <> url))
25 ((atom->feed fd) => (cut ingest/atom <> url))
26 (else (error "Unknown feed type" fd))))
27
28(define-public (download-feed url)
29 (with-input-from-request url #f read-string))
30
31;;; RSS
32(define (rss->feed x)
33 (or (and (rss:feed? x) x)
34 (false-on-error
35 (lambda () (with-input-from-string x rss:read)))))
36
37(define (ingest/rss fd #!optional url)
38 (let ((chan (rss:feed-channel fd))
39 (items (map item/rss->item (rss:feed-items fd))))
40 (make-feed url ; url object -- xxx; brittle
41 (rss:item-title chan)
42 (rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate))
43 (rss:item-attributes chan)
44 items)))
45
46(define (item/rss->item it)
47 (assert (rss:item? it) "Not an rss:item" it)
48 (let ((attachments (map enclosure->attachment rss:item-enclosures)))
49 (make-item (rss:item-link it)
50 (rss:item-title it)
51 (rfc822->rfc3339 (rss:item-attribute it 'pubDate))
52 (rss:item-attributes it)
53 (rss:item-description it)
54 rss:item-enclosures
55 attachments)))
56
57(define (enclosure->attachment enc)
58 (make-attachment rss:enclosure-url
59 rss:enclosure-length
60 rss:enclosure-type))
61
62(define (rfc822->rfc3339 tm)
63 (let loop ((fmts
64 ;; these don't capture the timezone -- afaik, a limitation of
65 ;; strptime
66 '("%a, %d %b %y %H:%M:%S"
67 "%d %b %y %H:%M:%S"
68 "%a, %e %b %y %H:%M:%S"
69 "%e %b %y %H:%M:%S"
70 "%a, %d %b %y %H:%M"
71 "%d %b %y %H:%M"
72 "%a, %e %b %y %H:%M"
73 "%e %b %y %H:%M"
74 "%a, %d %b %Y %H:%M:%S"
75 "%d %b %Y %H:%M:%S"
76 "%a, %e %b %Y %H:%M:%S"
77 "%e %b %Y %H:%M:%S"
78 "%a, %d %b %Y %H:%M"
79 "%d %b %Y %H:%M"
80 "%a, %e %b %Y %H:%M"
81 "%e %b %Y %H:%M")))
82 (cond
83 ((null? fmts) tm) ; fallback
84 ((string->time tm (car fmts))
85 => (lambda (tm) (time->rfc3339 tm)))
86 (else (loop (cdr fmts))))))
87
88;;; Atom
89
90(define (atom->feed fd)
91 (or (and (atom:feed? fd) fd)
92 (false-on-error
93 (lambda () (call-with-input-string fd atom:read-atom-feed)))))
94
95(define (ingest/atom fd #!optional url)
96 (let ((items (map item/atom->item (atom:feed-entries fd))))
97 (make-feed url
98 (atom:feed-title fd)
99 (string->rfc3339 (atom:feed-updated fd))
100 (map (lambda (pair) (cons (car pair) ((cdr pair) fd)))
101 ;; have to build meta
102 `((authors . ,atom:feed-authors)
103 (categories . ,atom:feed-categories)
104 (contributors . ,atom:feed-contributors)
105 (generator . ,atom:feed-generator)
106 (icon . ,atom:feed-icon)
107 (id . ,atom:feed-id)
108 (links . ,atom:feed-links)
109 (logo . ,atom:feed-logo)
110 (rights . ,atom:feed-rights)
111 (subtitle . ,atom:feed-subtitle)))
112 items)))
113
114(define (item/atom->item it)
115 (assert (atom:entry? it) "Not an Atom entry" it)
116 (make-item (item-link/atom it)
117 (atom:entry-title it)
118 (atom:entry-updated it)
119 (map (lambda (pair) (cons (car pair) ((cdr pair) it)))
120 `((authors . ,atom:entry-authors)
121 (categories . ,atom:entry-categories)
122 (content . ,atom:entry-content)
123 (contributors . ,atom:entry-contributors)
124 (id . ,atom:entry-id)
125 (links . ,atom:entry-links)
126 (published . ,atom:entry-published)
127 (rights . ,atom:entry-rights)
128 (source . ,atom:entry-source)
129 (summary . ,atom:entry-summary)))
130 (item-content/atom it)
131 #f ; afaik atom doesn't do enclosures
132 ))
133
134(define (item-link/atom it)
135 ;; Find the <link rel="alternate"> of IT, or return the first <link>
136 (let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate")))))
137 it)))
138 (if (null? alt)
139 (car (atom:entry-links it))
140 (car alt))))
141
142(define (item-content/atom it)
143 ;; Return entry-content or entry-summary if content is external or binary
144 (let ((c (atom:entry-content it)))
145 (case (atom:content-kind c)
146 ((text html textual) (atom:content-text c))
147 ((xhtml) (atom:content-xhtml c))
148 ((xml) (atom:content-xml c))
149 ((binary external)
150 (let ((sum (atom:entry-summary it)))
151 (case (atom:summary-type sum)
152 ((xhtml) (atom:summary-xhtml sum))
153 (else (atom:summary-text sum)))))
154 (else (error "Bad content type for item" it)))))