summary refs log tree commit diff stats
path: root/ingest.scm
blob: 3b65c753924a98938667514dc524ef3d3be1afda (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
;;; (planet ingest) --- injest feeds

(declare (module (planet ingest))
         (import scheme (chicken base)
                 (chicken io)
                 (chicken port)
                 (chicken time posix)
                 (planet feed)
                 (planet util)
                 http-client
                 uri-common
                 rss
                 rfc3339
                 sxpath
                 (prefix atom atom:)))

;;; Public API

(define-public (download-and-ingest-feed url)
  (ingest-feed (download-feed url) url))

(define-public (ingest-feed fd #!optional url)
  (cond
   ((rss->feed fd) => (cut ingest/rss <> url))
   ((atom->feed fd) => (cut ingest/atom <> url))
   (else (error "Unknown feed type" fd))))

(define-public (download-feed url)
  (with-input-from-request url #f read-string))

;;; RSS
(define (rss->feed x)
  (or (and (rss:feed? x) x)
      (false-on-error
       (lambda () (with-input-from-string x rss:read)))))

(define (ingest/rss fd #!optional url)
  (let ((chan (rss:feed-channel fd))
        (items (map item/rss->item (rss:feed-items fd))))
    (make-feed url                      ; url object -- xxx; brittle
               (rss:item-title chan)
               (rfc822->rfc3339 (rss:item-attribute chan 'lastBuildDate))
               (rss:item-attributes chan)
               items)))

(define (item/rss->item it)
  (assert (rss:item? it) "Not an rss:item" it)
  (let ((attachments (map enclosure->attachment rss:item-enclosures)))
    (make-item (rss:item-link it)
               (rss:item-title it)
               (rfc822->rfc3339 (rss:item-attribute it 'pubDate))
               (rss:item-attributes it)
               (rss:item-description it)
               rss:item-enclosures
               attachments)))

(define (enclosure->attachment enc)
  (make-attachment rss:enclosure-url
                   rss:enclosure-length
                   rss:enclosure-type))

(define (rfc822->rfc3339 tm)
  (let loop ((fmts
              ;; these don't capture the timezone -- afaik, a limitation of
              ;; strptime
              '("%a, %d %b %y %H:%M:%S"
                "%d %b %y %H:%M:%S"
                "%a, %e %b %y %H:%M:%S"
                "%e %b %y %H:%M:%S"
                "%a, %d %b %y %H:%M"
                "%d %b %y %H:%M"
                "%a, %e %b %y %H:%M"
                "%e %b %y %H:%M"
                "%a, %d %b %Y %H:%M:%S"
                "%d %b %Y %H:%M:%S"
                "%a, %e %b %Y %H:%M:%S"
                "%e %b %Y %H:%M:%S"
                "%a, %d %b %Y %H:%M"
                "%d %b %Y %H:%M"
                "%a, %e %b %Y %H:%M"
                "%e %b %Y %H:%M")))
    (cond
     ((null? fmts) tm) ; fallback
     ((string->time tm (car fmts))
      => (lambda (tm) (time->rfc3339 tm)))
     (else (loop (cdr fmts))))))

;;; Atom

(define (atom->feed fd)
  (or (and (atom:feed? fd) fd)
      (false-on-error
       (lambda () (call-with-input-string fd atom:read-atom-feed)))))

(define (ingest/atom fd #!optional url)
  (let ((items (map item/atom->item (atom:feed-entries fd))))
    (make-feed url
               (atom:feed-title fd)
               (string->rfc3339 (atom:feed-updated fd))
               (map (lambda (pair) (cons (car pair) ((cdr pair) fd)))
                    ;; have to build meta
                    `((authors . ,atom:feed-authors)
                      (categories . ,atom:feed-categories)
                      (contributors . ,atom:feed-contributors)
                      (generator . ,atom:feed-generator)
                      (icon . ,atom:feed-icon)
                      (id . ,atom:feed-id)
                      (links . ,atom:feed-links)
                      (logo . ,atom:feed-logo)
                      (rights . ,atom:feed-rights)
                      (subtitle . ,atom:feed-subtitle)))
               items)))

(define (item/atom->item it)
  (assert (atom:entry? it) "Not an Atom entry" it)
  (make-item (item-link/atom it)
             (atom:entry-title it)
             (atom:entry-updated it)
             (map (lambda (pair) (cons (car pair) ((cdr pair) it)))
                  `((authors . ,atom:entry-authors)
                    (categories . ,atom:entry-categories)
                    (content . ,atom:entry-content)
                    (contributors . ,atom:entry-contributors)
                    (id . ,atom:entry-id)
                    (links . ,atom:entry-links)
                    (published . ,atom:entry-published)
                    (rights . ,atom:entry-rights)
                    (source . ,atom:entry-source)
                    (summary . ,atom:entry-summary)))
             (item-content/atom it)
             #f                         ; afaik atom doesn't do enclosures
             ))

(define (item-link/atom it)
  ;; Find the <link rel="alternate"> of IT, or return the first <link>
  (let ((alt ((sxpath '(// (atom:link (@ rel (equal? "alternate")))))
              it)))
    (if (null? alt)
        (car (atom:entry-links it))
        (car alt))))

(define (item-content/atom it)
  ;; Return entry-content or entry-summary if content is external or binary
  (let ((c (atom:entry-content it)))
    (case (atom:content-kind c)
      ((text html textual) (atom:content-text c))
      ((xhtml) (atom:content-xhtml c))
      ((xml) (atom:content-xml c))
      ((binary external)
       (let ((sum (atom:entry-summary it)))
         (case (atom:summary-type sum)
           ((xhtml) (atom:summary-xhtml sum))
           (else (atom:summary-text sum)))))
      (else (error "Bad content type for item" it)))))