You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1713 lines
70 KiB
Scheme
1713 lines
70 KiB
Scheme
9 years ago
|
(import chicken scheme srfi-1 srfi-13 srfi-14)
|
||
|
(use rss html-parser srfi-19 http-client postgresql md5 message-digest
|
||
|
anaphora uri-common general-utils matchable posix coops coops-utils
|
||
|
args irregex sxml-transforms intarweb sxpath message-digest-bv
|
||
|
srfi-18 log5scm numbers medea
|
||
|
(prefix utf8 utf8:) (prefix utf8-srfi-13 utf8:)
|
||
|
(only sort-combinators group/key) atom rfc3339)
|
||
|
|
||
|
(client-software
|
||
|
'(("Mozilla/5.0" "44.0" "(X11; Linux x86_64; rv:44.0) Gecko/20100101")))
|
||
|
|
||
|
(define *program-config* (with-input-from-file "news.conf" read))
|
||
|
|
||
|
;; (use debug-utils box)
|
||
|
;; (breakpoint-accessor _)
|
||
|
|
||
|
(define-category debug)
|
||
|
(define-category info)
|
||
|
(define-category warn)
|
||
|
(define-category error)
|
||
|
|
||
|
(define-category debug+ (or debug info warn error))
|
||
|
(define-category info+ (or info warn error))
|
||
|
(define-category warn+ (or warn error))
|
||
|
|
||
|
(define-category fetch-feed)
|
||
|
(define-category fetch-media)
|
||
|
(define-category import)
|
||
|
(define-category rank)
|
||
|
(define-category update-article-view)
|
||
|
|
||
|
(define-output <standard-out
|
||
|
(conc "[" (current-category) "] "
|
||
|
"[" (date->string (current-date) "~D ~T") "]"
|
||
|
(if (current-context) (conc (current-context) " - ") "")
|
||
|
" - " (current-message)))
|
||
|
|
||
|
;; Making this a macro removes one spurious stack trace relative to a
|
||
|
;; function element.
|
||
|
(define-syntax log-error
|
||
|
(syntax-rules ()
|
||
|
((_ exn)
|
||
|
(log-error exn #f #f #f))
|
||
|
((_ exn str)
|
||
|
(log-error exn str #f #f))
|
||
|
((_ exn str default)
|
||
|
(log-error exn str default #f))
|
||
|
((_ exn str default call-chain)
|
||
|
(let* ((_chain (if call-chain call-chain (get-call-chain)))
|
||
|
(chain (with-output-to-string (lambda () (newline) (pp (drop-right _chain 2)))))
|
||
|
(cpa condition-property-accessor)
|
||
|
(exn-message (cpa 'exn 'message "(no message)"))
|
||
|
(exn-location (cpa 'exn 'location "*ERROR LOCATION UNKNOWN*"))
|
||
|
(exn-arguments (cpa 'exn 'arguments '()))
|
||
|
(exn? (condition-predicate 'exn)))
|
||
|
(log-for (error)
|
||
|
(with-output-to-string
|
||
|
(lambda ()
|
||
|
(print (exn-location exn))
|
||
|
(print (exn-message exn))
|
||
|
(print (exn-arguments exn))
|
||
|
(print chain)
|
||
|
(when str (print str)))))
|
||
|
default))))
|
||
|
|
||
|
;; if exception occurs ignore and return a indicator.
|
||
|
(define-syntax exn*
|
||
|
(syntax-rules ()
|
||
|
((_ exp)
|
||
|
(handle-exceptions
|
||
|
exn
|
||
|
'exception-occured
|
||
|
exp))))
|
||
|
|
||
|
(define (md5-hash s) (message-digest-string (md5-primitive) s))
|
||
|
(define skip-log-condition (make-property-condition 'skip-log))
|
||
|
(define skip-log-condition-predicate (condition-predicate 'skip-log))
|
||
|
|
||
|
(define (string-atom-date->date s)
|
||
|
(seconds->date
|
||
|
(rfc3339->seconds
|
||
|
(time->rfc3339
|
||
|
(rfc3339->local-time
|
||
|
(string->rfc3339 s))))))
|
||
|
|
||
|
;; A normalized string->date conversion to help out SRFI-19.
|
||
|
(define (string->date* s)
|
||
|
;; SRFI-19 can't handle string timezones so try to convert them to
|
||
|
;; numeric timezones.
|
||
|
(handle-exceptions
|
||
|
exn
|
||
|
(begin (log-for (warn) (conc "broken string->date* for: " s))
|
||
|
(signal skip-log-condition))
|
||
|
(let* ((d* (let ((possible-time-zone (string-take-right s 3))
|
||
|
(timezone-maps '(("GMT" . "-0800")
|
||
|
("PDT" . "-0000")
|
||
|
("PST" . "-0000")
|
||
|
(" UT" . " -0800")
|
||
|
("EDT" . "-0300")
|
||
|
("EST" . "-0300")
|
||
|
("CST" . "-0200")))
|
||
|
(timezone-offset-map '(("+0000" . "-0800")))
|
||
|
(possible-timezone-offset (string-take-right s 5))
|
||
|
(possible-non-english-day (string-take s 3))
|
||
|
;; Dom, Seg, Ter, Qua, Qui, Sex, Sab
|
||
|
(non-english-day-maps '(("Ter" . "Tue")
|
||
|
("Qui" . "Thu")))
|
||
|
(s* (string-copy s)))
|
||
|
;; MUST COME FIRST
|
||
|
(set! s*
|
||
|
(aif (assoc possible-timezone-offset timezone-offset-map equal?)
|
||
|
(++ (string-take s* (- (string-length s*) 5)) (cdr it))
|
||
|
s*))
|
||
|
(set! s*
|
||
|
(aif (assoc possible-time-zone timezone-maps equal?)
|
||
|
(++ (string-take s* (- (string-length s*) 3)) (cdr it))
|
||
|
s*))
|
||
|
(set! s*
|
||
|
(aif (assoc possible-non-english-day non-english-day-maps equal?)
|
||
|
(++ (cdr it) (string-drop s* 3))
|
||
|
s*))
|
||
|
s*))
|
||
|
(d (handle-exceptions exn
|
||
|
(string->date d* "~a, ~d ~B ~Y ~H:~M:~S ~z")
|
||
|
(string->date d* "~a, ~d ~b ~Y ~H:~M:~S ~z"))))
|
||
|
|
||
|
(date-add-duration d (make-duration hours: (if (= (date-zone-offset d) 0)
|
||
|
0
|
||
|
(fx/ (date-zone-offset d) 3600)))))))
|
||
|
|
||
|
;; Make the current date compatible with the output of the
|
||
|
;; string->date* function.
|
||
|
(define (current-date*)
|
||
|
(let ((date (current-date)))
|
||
|
(string->date (conc (date-day date) " " (date-month date) " "
|
||
|
(date-year date) " -0000")
|
||
|
"~e ~m ~Y ~z")))
|
||
|
|
||
|
(define (strip-html html)
|
||
|
(filter (cut irregex-search '(: (* whitespace) (+ (or alnum punct))) <>)
|
||
|
(flatten
|
||
|
(pre-post-order
|
||
|
(html->sxml html)
|
||
|
`((@ . ,(lambda x '()))
|
||
|
(*default* . ,(lambda (tag . body) body))
|
||
|
(*text* . ,(lambda (tag string) (or-default string? string ""))))))))
|
||
|
|
||
|
(define (strip-tags-from-sxml sxml)
|
||
|
(utf8:string-trim-both
|
||
|
(irregex-replace/all
|
||
|
"\n\t"
|
||
|
(apply
|
||
|
conc
|
||
|
(flatten
|
||
|
(pre-post-order*
|
||
|
sxml
|
||
|
`((span . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
|
||
|
(p . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
|
||
|
(em . ,(lambda (tag body) body))
|
||
|
(a . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
|
||
|
(img . ,(lambda (tag body) '()))
|
||
|
(*text* . ,(lambda (tag str) str))
|
||
|
(*default* . ,(lambda (tag str) str))
|
||
|
(*TOP* . ,(lambda (tag str) str))))))
|
||
|
" ")))
|
||
|
|
||
|
(define-class <media-item> ()
|
||
|
((media-item accessor: media-item-media-item)
|
||
|
(url)
|
||
|
(type)
|
||
|
(file-extension)))
|
||
|
|
||
|
(define-class <feed-item> ()
|
||
|
((feed-item accessor: feed-item-feed-item)
|
||
|
(title)
|
||
|
(guid)
|
||
|
(link)
|
||
|
(source accessor: feed-item-source)
|
||
|
(medias accessor: feed-item-medias)
|
||
|
(description)
|
||
|
(author)
|
||
|
(authors)
|
||
|
(date)
|
||
|
(tags accessor: feed-item-tags)))
|
||
|
|
||
|
(define-class <rss-feed-item> (<feed-item>))
|
||
|
(define-class <atom-feed-item> (<feed-item>))
|
||
|
|
||
|
(define-class <los-angeles-times-item> (<rss-feed-item>)
|
||
|
((source "Los Angeles Times")))
|
||
|
(define-class <new-york-times-item> (<rss-feed-item>)
|
||
|
((source "New York Times")))
|
||
|
(define-class <san-francisco-chronicle-item> (<rss-feed-item>)
|
||
|
((source "San Francisco Chronicle")))
|
||
|
(define-class <wall-street-journal-item> (<rss-feed-item>)
|
||
|
((source "San Francisco Chronicle")))
|
||
|
(define-class <hacker-news-item> (<rss-feed-item>)
|
||
|
((source "Hacker News")))
|
||
|
(define-class <lambda-the-ultimate-item> (<rss-feed-item>)
|
||
|
((source "Lambda the Ultimate")))
|
||
|
(define-class <planet-scheme-item> (<atom-feed-item>)
|
||
|
((source "Planet Scheme")))
|
||
|
(define-class <planet-haskell-item> (<atom-feed-item>)
|
||
|
((source "Planet Haskell")))
|
||
|
(define-class <fox-news-item> (<rss-feed-item>)
|
||
|
((source "Fox News")))
|
||
|
(define-class <xkcd-item> (<atom-feed-item>)
|
||
|
((source "XKCD")))
|
||
|
(define-class <christianity-today-item> (<rss-feed-item>)
|
||
|
((source "Christianity Today")))
|
||
|
(define-class <techcrunch-item> (<rss-feed-item>)
|
||
|
((source "TechCrunch")))
|
||
|
(define-class <engadget-item> (<rss-feed-item>)
|
||
|
((source "Engadget")))
|
||
|
(define-class <npr-item> (<rss-feed-item>)
|
||
|
((source "NPR")))
|
||
|
(define-class <eff-item> (<rss-feed-item>)
|
||
|
((source "EFF")))
|
||
|
(define-class <common-dreams-item> (<rss-feed-item>)
|
||
|
((source "Common Dreams")))
|
||
|
(define-class <raw-story-item> (<rss-feed-item>)
|
||
|
((source "Raw Story")))
|
||
|
(define-class <reddit-item> (<atom-feed-item>)
|
||
|
((source "Reddit")))
|
||
|
(define-class <think-progress-item> (<rss-feed-item>)
|
||
|
((source "Think Progress")))
|
||
|
|
||
|
|
||
|
(define-class <los-angeles-times-media-item> (<media-item>) ())
|
||
|
(define-class <new-york-times-media-item> (<media-item>) ())
|
||
|
(define-class <san-francisco-chronicle-media-item> (<media-item>) ())
|
||
|
(define-class <wall-street-journal-media-item> (<media-item>) ())
|
||
|
(define-class <hacker-news-media-item> (<media-item>) ())
|
||
|
(define-class <lambda-the-ultimate-media-item> (<media-item>) ())
|
||
|
(define-class <xkcd-media-item> (<media-item>) ())
|
||
|
(define-class <url-media-item> (<media-item>) ())
|
||
|
(define-class <png-media-item> (<media-item>) ())
|
||
|
(define-class <url-png-media-item> (<url-media-item> <png-media-item>) ())
|
||
|
|
||
|
(define-method (feed-item-title (feed-item <rss-feed-item>))
|
||
|
(rss:item-title (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-title (feed-item <atom-feed-item>))
|
||
|
(match (entry-title (feed-item-feed-item feed-item))
|
||
|
((atom:title ('@ ('type "text")) x) x)
|
||
|
((atom:title x) x)
|
||
|
((atom:title ('@ ('xml:lang "en-US")) x) x)
|
||
|
((atom:title ('@ ('xml:lang "en")) x) x)
|
||
|
(x (error (conc "title type not supported: "
|
||
|
(entry-title (feed-item-feed-item feed-item)))))))
|
||
|
(define-method (feed-item-title (feed-item <hacker-news-item>))
|
||
|
(alist-ref 'title (feed-item-feed-item feed-item)))
|
||
|
|
||
|
(define-method (feed-item-link (feed-item <rss-feed-item>))
|
||
|
(rss:item-link (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-link (feed-item <atom-feed-item>))
|
||
|
(let ((links (entry-links (feed-item-feed-item feed-item))))
|
||
|
(if (null? links)
|
||
|
(error "feed item has no link")
|
||
|
(let rec ((sxml (car links)))
|
||
|
(match sxml
|
||
|
(('atom:link x) (rec x))
|
||
|
(('@ x ...)
|
||
|
(let ((href-list (filter identity (map rec x))))
|
||
|
(if (null? href-list)
|
||
|
(error (conc "link not found: "
|
||
|
(entry-links (feed-item-feed-item feed-item))))
|
||
|
(car href-list))))
|
||
|
(('href x) x)
|
||
|
(_ #f))))))
|
||
|
(define-method (feed-item-link (feed-item <hacker-news-item>))
|
||
|
(conc "https://news.ycombinator.com/item?id="
|
||
|
(alist-ref 'id (feed-item-feed-item feed-item))))
|
||
|
|
||
|
(define-method (feed-item-guid (feed-item <rss-feed-item>))
|
||
|
(alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-guid (feed-item <atom-feed-item>))
|
||
|
(entry-id (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-guid (feed-item <los-angeles-times-item>))
|
||
|
;; TODO figure out a better GUID that won't clash with future
|
||
|
;; articles with the same title or articles with the same title from
|
||
|
;; a different source.
|
||
|
(last (utf8:string-split (feed-item-link feed-item) "/")))
|
||
|
(define-method (feed-item-guid (feed-item <new-york-times-item>))
|
||
|
(or (alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(rss:item-link (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-guid (feed-item <hacker-news-item>))
|
||
|
(conc "hacker-news-" (alist-ref 'id (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-guid (feed-item <lambda-the-ultimate-item>))
|
||
|
(rss:item-link (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-guid (feed-item <fox-news-item>))
|
||
|
(or (alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(rss:item-link (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-guid (feed-item <christianity-today-item>))
|
||
|
;; TODO figure out a better GUID that won't clash with future
|
||
|
;; articles with the same title or articles with the same title from
|
||
|
;; a different source.
|
||
|
(last (utf8:string-split (feed-item-link feed-item) "/")))
|
||
|
(define-method (feed-item-guid (feed-item <npr-item>))
|
||
|
(apply
|
||
|
conc
|
||
|
(uri-path
|
||
|
(uri-reference (rss:item-link (feed-item-feed-item feed-item))))))
|
||
|
|
||
|
(define-method (feed-item-description (feed-item <rss-feed-item>))
|
||
|
(rss:item-description (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-description (feed-item <atom-feed-item>))
|
||
|
(match (entry-content (feed-item-feed-item feed-item))
|
||
|
(('atom:content ('@ ('type "html")) x)
|
||
|
(let ((body (strip-html x)))
|
||
|
(if (null? body)
|
||
|
""
|
||
|
(car body))))
|
||
|
(('atom:content ('@ ('type "text")) x) x)
|
||
|
(_ "")))
|
||
|
(define-method (feed-item-description (feed-item <reddit-item>))
|
||
|
"")
|
||
|
(define-method (feed-item-description (feed-item <new-york-times-item>))
|
||
|
(or-default string?
|
||
|
(cadr (html->sxml (rss:item-description (feed-item-feed-item feed-item))))
|
||
|
""))
|
||
|
|
||
|
(define-method (feed-item-description (feed-item <los-angeles-times-item>))
|
||
|
(let ((d (rss:item-description (feed-item-feed-item feed-item))))
|
||
|
(if (string? d)
|
||
|
(utf8:string-trim-both (car (utf8:string-split d "\n")))
|
||
|
"")))
|
||
|
|
||
|
(define-method (feed-item-description (feed-item <san-francisco-chronicle-item>))
|
||
|
;; Can contain nested <p> elements but doesn't have to so filter
|
||
|
;; them out.
|
||
|
(let* ((d* (rss:item-description (feed-item-feed-item feed-item)))
|
||
|
(d** (or d* ""))
|
||
|
(d (strip-tags-from-sxml
|
||
|
(html->sxml d**))))
|
||
|
(cond ((and (string? d)
|
||
|
(irregex-search "\\$\\(\"." d))
|
||
|
"")
|
||
|
((string? d)
|
||
|
(utf8:string-trim-both d))
|
||
|
((and (list? d) (not (null? d)) (not (null? (cdr d))))
|
||
|
(utf8:string-trim-both (cadr d)))
|
||
|
(else ""))))
|
||
|
(define-method (feed-item-description (feed-item <hacker-news-item>))
|
||
|
"")
|
||
|
(define-method (feed-item-description (feed-item <lambda-the-ultimate-item>))
|
||
|
(or-default string?
|
||
|
(cadr (html->sxml (rss:item-description (feed-item-feed-item feed-item))))
|
||
|
""))
|
||
|
(define-method (feed-item-description (feed-item <fox-news-item>))
|
||
|
(let ((d (rss:item-description (feed-item-feed-item feed-item))))
|
||
|
(if d
|
||
|
(if (string? d)
|
||
|
(cadr (html->sxml d))
|
||
|
"")
|
||
|
"")))
|
||
|
(define-method (feed-item-description (feed-item <xkcd-item>))
|
||
|
(let ((r ((sxpath '(// (img (@ src *text*))
|
||
|
// ((@ title *text*))))
|
||
|
(html->sxml (summary-text
|
||
|
(entry-summary (feed-item-feed-item feed-item)))))))
|
||
|
(if (null? r)
|
||
|
""
|
||
|
(let ((r2 (car r)))
|
||
|
(if (string? r2)
|
||
|
r2
|
||
|
"")))))
|
||
|
(define-method (feed-item-description (feed-item <christianity-today-item>))
|
||
|
(strip-tags-from-sxml
|
||
|
((sxpath `(// (p (@ class ,(lambda (x y)
|
||
|
(if (not (null? x))
|
||
|
(let ((class (cadar x)))
|
||
|
(or (equal? "text" class)
|
||
|
(equal? "deck" class)))
|
||
|
#f))))))
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item))))))
|
||
|
(define-method (feed-item-description (feed-item <techcrunch-item>))
|
||
|
(strip-tags-from-sxml
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
|
||
|
(define-method (feed-item-description (feed-item <engadget-item>))
|
||
|
(strip-tags-from-sxml
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
|
||
|
(define-method (feed-item-description (feed-item <npr-item>))
|
||
|
(strip-tags-from-sxml
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
|
||
|
(define-method (feed-item-description (feed-item <eff-item>))
|
||
|
(apply conc (strip-html (rss:item-description (feed-item-feed-item feed-item)))))
|
||
|
(define-method (feed-item-description (feed-item <common-dreams-item>))
|
||
|
(apply
|
||
|
conc
|
||
|
((sxpath `(// (div (@ class ,(lambda (x y)
|
||
|
(if (not (null? x))
|
||
|
(let ((class (cadar x)))
|
||
|
(string-contains
|
||
|
class "field--type-text-with-summary")
|
||
|
)
|
||
|
#f))))
|
||
|
// p *text*))
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item))))))
|
||
|
(define-method (feed-item-description (feed-item <raw-story-item>))
|
||
|
(apply conc (flatten (cdr (html->sxml
|
||
|
(rss:item-description (feed-item-feed-item feed-item)))))))
|
||
|
(define-method (feed-item-description (feed-item <think-progress-item>))
|
||
|
(strip-tags-from-sxml
|
||
|
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
|
||
|
|
||
|
(define rgx:author-split (sre->irregex '(or " and " ", ") 'fast 'i))
|
||
|
(define-method (feed-item-author (feed-item <rss-feed-item>))
|
||
|
(or (alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
""))
|
||
|
(define-method (feed-item-author (feed-item <atom-feed-item>))
|
||
|
;; Kinda weird but not sure how to do it better.
|
||
|
(string-intersperse
|
||
|
(map author-name (entry-authors (feed-item-feed-item feed-item)))
|
||
|
","))
|
||
|
(define-method (feed-item-author (feed-item <reddit-item>))
|
||
|
(cadr (string-split (author-name
|
||
|
(car (entry-authors (feed-item-feed-item feed-item)))) "/")))
|
||
|
(define-method (feed-item-author (feed-item <los-angeles-times-item>))
|
||
|
(or (alist-ref 'author (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
""))
|
||
|
(define-method (feed-item-author (feed-item <new-york-times-item>))
|
||
|
(let ((author
|
||
|
(or (alist-ref 'dc:creator
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(alist-ref 'author
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item))))))
|
||
|
(if (not (string? author))
|
||
|
"New York Times"
|
||
|
author)))
|
||
|
(define-method (feed-item-author (feed-item <san-francisco-chronicle-item>))
|
||
|
(let ((a (alist-ref 'dc:creator
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item)))))
|
||
|
(if a (car (strip-html a)) "")))
|
||
|
(define-method (feed-item-author (feed-item <wall-street-journal-item>))
|
||
|
"")
|
||
|
(define-method (feed-item-author (feed-item <hacker-news-item>))
|
||
|
(alist-ref 'by (feed-item-feed-item feed-item)))
|
||
|
(define-method (feed-item-author (feed-item <lambda-the-ultimate-item>))
|
||
|
"")
|
||
|
(define-method (feed-item-author (feed-item <fox-news-item>))
|
||
|
(let ((author
|
||
|
(or (alist-ref 'dc:creator
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(alist-ref 'author
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item))))))
|
||
|
(cond ((not (string? author)) "")
|
||
|
((string=? author "foxnewsonline@foxnews.com (Fox News Online) ")
|
||
|
"")
|
||
|
(else author))))
|
||
|
(define-method (feed-item-author (feed-item <christianity-today-item>))
|
||
|
(let ((r (irregex-split
|
||
|
", guest writer"
|
||
|
(or (alist-ref 'dc:creator
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
(alist-ref 'author
|
||
|
(rss:item-attributes (feed-item-feed-item feed-item)))""))))
|
||
|
(if (null? r)
|
||
|
""
|
||
|
(car r))))
|
||
|
(define-method (feed-item-author (feed-item <eff-item>))
|
||
|
(or (alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
|
||
|
""))
|
||
|
|
||
|
(define-method (feed-item-authors (feed-item <atom-feed-item>))
|
||
|
(map author-name (entry-authors (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-authors (feed-item <reddit-item>))
|
||
|
(list
|
||
|
(cadr (string-split (author-name
|
||
|
(car (entry-authors (feed-item-feed-item feed-item)))) "/"))))
|
||
|
(define-method (feed-item-authors (feed-item <rss-feed-item>))
|
||
|
(list (feed-item-author feed-item)))
|
||
|
(define-method (feed-item-authors (feed-item <new-york-times-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(map (lambda (s)
|
||
|
(utf8:string-titlecase
|
||
|
(utf8:string-trim-both
|
||
|
(cond ((string=? (utf8:string-downcase s) "first draft")
|
||
|
"The New York Times First Draft")
|
||
|
(else s)))))
|
||
|
(flatten (map (cut irregex-split rgx:author-split <>)
|
||
|
(irregex-split "By " author))))
|
||
|
"New York Times")))
|
||
|
(define-method (feed-item-authors (feed-item <los-angeles-times-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(if (equal? author "The Times editorial board")
|
||
|
'("The Los Angeles Times editorial board")
|
||
|
(irregex-split rgx:author-split author))
|
||
|
(list "Los Angeles Times"))))
|
||
|
(define-method (feed-item-authors (feed-item <san-francisco-chronicle-item>))
|
||
|
(let ((author (utf8:string-trim-both (feed-item-author feed-item))))
|
||
|
(if author
|
||
|
(flatten (map (cut irregex-split rgx:author-split <>)
|
||
|
(irregex-split "By " author)))
|
||
|
(list ""))))
|
||
|
(define-method (feed-item-authors (feed-item <wall-street-journal-item>))
|
||
|
(list ""))
|
||
|
(define-method (feed-item-authors (feed-item <hacker-news-item>))
|
||
|
(list (feed-item-author feed-item)))
|
||
|
(define-method (feed-item-authors (feed-item <lambda-the-ultimate-item>))
|
||
|
(list ""))
|
||
|
(define-method (feed-item-authors (feed-item <fox-news-item>))
|
||
|
(list (feed-item-author feed-item)))
|
||
|
(define-method (feed-item-authors (feed-item <christianity-today-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(irregex-split rgx:author-split author)
|
||
|
(list ""))))
|
||
|
(define-method (feed-item-authors (feed-item <techcrunch-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(irregex-split '(",") author)
|
||
|
(list ""))))
|
||
|
(define-method (feed-item-authors (feed-item <engadget-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(irregex-split rgx:author-split author)
|
||
|
(list ""))))
|
||
|
(define-method (feed-item-authors (feed-item <eff-item>))
|
||
|
(let ((author (feed-item-author feed-item)))
|
||
|
(if author
|
||
|
(irregex-split rgx:author-split author)
|
||
|
(list ""))))
|
||
|
|
||
|
(define-method (feed-item-date (feed-item <rss-feed-item>))
|
||
|
(string->date*
|
||
|
(alist-ref 'pubDate (rss:item-attributes (feed-item-feed-item feed-item)))))
|
||
|
(define-method (feed-item-date (feed-item <atom-feed-item>))
|
||
|
(let ((published (entry-published (feed-item-feed-item feed-item)))
|
||
|
(updated (entry-updated (feed-item-feed-item feed-item))))
|
||
|
(string-atom-date->date
|
||
|
(if (and published (not (string=? published "")))
|
||
|
published
|
||
|
updated))))
|
||
|
(define-method (feed-item-date (feed-item <hacker-news-item>))
|
||
|
(seconds->date (alist-ref 'time (feed-item-feed-item feed-item))))
|
||
|
(define-method (feed-item-date (feed-item <christianity-today-item>))
|
||
|
(let ((d (alist-ref 'pubDate (rss:item-attributes (feed-item-feed-item feed-item)))))
|
||
|
(if d (string->date* d) (current-date*))))
|
||
|
|
||
|
(define-method (media-item-type (media-item <media-item>)) 'jpg)
|
||
|
(define-method (media-item-type (media-item <xkcd-media-item>)) 'png)
|
||
|
(define-method (media-item-type (media-item <png-media-item>)) 'png)
|
||
|
(define-method (media-item-file-extension (media-item <media-item>)) "jpg")
|
||
|
(define-method (media-item-file-extension (media-item <xkcd-media-item>)) "png")
|
||
|
(define-method (media-item-file-extension (media-item <png-media-item>)) "png")
|
||
|
(define-method (media-item-relative-path (media-item <media-item>))
|
||
|
(++ (message-digest-string (md5-primitive) (media-item-url media-item))
|
||
|
"." (media-item-file-extension media-item)))
|
||
|
(define-method (media-item-filesystem-path (media-item <media-item>))
|
||
|
(++ (filesystem-path-img-original) "/" (media-item-relative-path media-item)))
|
||
|
|
||
|
(define-method (media-item-url (media-item <media-item>))
|
||
|
(rss:media-url (media-item-media-item media-item)))
|
||
|
(define-method (media-item-url (media-item <xkcd-media-item>))
|
||
|
(let ((r ((sxpath '(// (img (@ src *text*))
|
||
|
// ((@ src *text*))))
|
||
|
(html->sxml (summary-text (media-item-media-item media-item))))))
|
||
|
(if (null? r)
|
||
|
""
|
||
|
(let ((r2 (car r)))
|
||
|
(if (string? r2)
|
||
|
r2
|
||
|
"")))))
|
||
|
(define-method (media-item-url (media-item <url-media-item>))
|
||
|
(media-item-media-item media-item))
|
||
|
|
||
|
(define-method (print-object (item <feed-item>) #!optional (out (current-output-port)))
|
||
|
(print "title: " (exn* (feed-item-title item)))
|
||
|
(print "guid: " (exn* (feed-item-guid item)))
|
||
|
(print "link: " (exn* (feed-item-link item)))
|
||
|
(print "source: " (exn* (feed-item-source item)))
|
||
|
(print "description: " (exn* (feed-item-description item)))
|
||
|
(print "author: " (exn* (feed-item-author item)))
|
||
|
(print "date: " (exn* (feed-item-date item)))
|
||
|
(print "tags: " (exn* (feed-item-tags item))))
|
||
|
|
||
|
(define (rss-item-description-image-links rss-item)
|
||
|
(let* ((links
|
||
|
((sxpath `(// (img (@ src *text*))
|
||
|
// ((@ src *text*))))
|
||
|
(html->sxml (rss:item-description rss-item))))
|
||
|
(useful-links (filter
|
||
|
(lambda (link)
|
||
|
(or (irregex-search ".jpg" link)
|
||
|
(irregex-search ".png" link)))
|
||
|
links)))
|
||
|
useful-links))
|
||
|
|
||
|
(define (make-feed-item feed-item source tags type)
|
||
|
(match source
|
||
|
("New York Times"
|
||
|
(make <new-york-times-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (i) (make <new-york-times-media-item>
|
||
|
'media-item i))
|
||
|
(rss:item-medias feed-item))))
|
||
|
("Los Angeles Times"
|
||
|
(make <los-angeles-times-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (i) (make <los-angeles-times-media-item>
|
||
|
'media-item i))
|
||
|
(rss:item-medias feed-item))))
|
||
|
("San Francisco Chronicle"
|
||
|
(make <san-francisco-chronicle-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (i) (make <san-francisco-chronicle-media-item>
|
||
|
'media-item i))
|
||
|
(rss:item-medias feed-item))))
|
||
|
("Wall Street Journal"
|
||
|
(make <wall-street-journal-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (i) (make <wall-street-journal-media-item>
|
||
|
'media-item i))
|
||
|
(rss:item-medias feed-item))))
|
||
|
("Hacker News"
|
||
|
(make <hacker-news-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Lambda the Ultimate"
|
||
|
(make <lambda-the-ultimate-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Planet Scheme"
|
||
|
(make <planet-scheme-item>
|
||
|
'feed-item feed-item 'source source 'tags tags 'medias '()))
|
||
|
("Planet Haskell"
|
||
|
(make <planet-haskell-item>
|
||
|
'feed-item feed-item 'source source 'tags tags 'medias '()))
|
||
|
("Fox News"
|
||
|
(make <fox-news-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (i) (make <media-item>
|
||
|
'media-item i))
|
||
|
(filter
|
||
|
(lambda (i) (rss:media? i))
|
||
|
(rss:item-medias feed-item)))))
|
||
|
("XKCD"
|
||
|
(make <xkcd-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias `(,(make <xkcd-media-item> 'media-item (entry-summary feed-item)))))
|
||
|
("Christianity Today"
|
||
|
(make <christianity-today-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (link)
|
||
|
(make <url-media-item> 'media-item link))
|
||
|
(rss-item-description-image-links feed-item))))
|
||
|
("TechCrunch"
|
||
|
(make <techcrunch-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (link)
|
||
|
(make <url-media-item> 'media-item link))
|
||
|
(rss-item-description-image-links feed-item))))
|
||
|
("Engadget"
|
||
|
(make <engadget-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias (map (lambda (link)
|
||
|
(make <url-png-media-item> 'media-item link))
|
||
|
(rss-item-description-image-links feed-item))))
|
||
|
("NPR"
|
||
|
(make <npr-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("EFF"
|
||
|
(make <eff-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Common Dreams"
|
||
|
(make <common-dreams-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Raw Story"
|
||
|
(make <raw-story-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Reddit"
|
||
|
(make <reddit-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
("Think Progress"
|
||
|
(make <think-progress-item>
|
||
|
'feed-item feed-item 'source source 'tags tags
|
||
|
'medias '()))
|
||
|
(_
|
||
|
(case type
|
||
|
('rss
|
||
|
(make <rss-feed-item>
|
||
|
'feed-item feed-item 'source source 'tags tags 'medias '()))
|
||
|
('atom
|
||
|
(make <atom-feed-item>
|
||
|
'feed-item feed-item 'source source 'tags tags 'medias '()))
|
||
|
(else (error "feed item type unknown"))))))
|
||
|
|
||
|
(include "global.scm")
|
||
|
|
||
|
(include "db.scm")
|
||
|
(import (prefix news-db db:))
|
||
|
(define with-db db:with-db)
|
||
|
(define with-db/transaction db:with-db/transaction)
|
||
|
(db:connection-spec (alist-ref 'db-connection-spec *program-config*))
|
||
|
|
||
|
(define *hacker-news-base-uri* "https://hacker-news.firebaseio.com/v0/")
|
||
|
(define *hacker-news-top-stories-uri* "topstories")
|
||
|
(define *hacker-news-item-uri* "item/")
|
||
|
(define *hacker-news-uri-suffix* ".json")
|
||
|
|
||
|
(define (guid->guid-hash guid)
|
||
|
(message-digest-string (md5-primitive) guid))
|
||
|
|
||
|
(define (article-id-by-guid conn guid)
|
||
|
(let ((r (db:get-article-id-by-guid conn (guid->guid-hash guid))))
|
||
|
(if (> (row-count r) 0)
|
||
|
(value-at r)
|
||
|
#f)))
|
||
|
|
||
|
(define *sources* (with-input-from-file "feed-list.scm" read))
|
||
|
|
||
|
(define-class <feed-resource> ()
|
||
|
((content-type reader: feed-resource-content-type)
|
||
|
(tags reader: feed-resource-tags)))
|
||
|
(define-class <uri-feed-resource> (<feed-resource>)
|
||
|
((uri reader: feed-resource-uri)))
|
||
|
(define-class <xml-feed-resource> (<feed-resource>)
|
||
|
((content-type initform: "xml" reader: feed-resource-content-type)))
|
||
|
(define-class <hacker-news-feed-resource> (<feed-resource>)
|
||
|
((content-type initform: "json" reader: feed-resource-content-type)
|
||
|
(tags initform: '(hacker-news) reader: feed-resource-tags)))
|
||
|
(define-class <xml-uri-feed-resource> (<uri-feed-resource> <xml-feed-resource>))
|
||
|
|
||
|
(define-class <feed> ()
|
||
|
((name reader: feed-name)
|
||
|
(resources reader: feed-resources)))
|
||
|
(define-class <atom-feed> (<feed>))
|
||
|
(define-class <rss-feed> (<feed>))
|
||
|
(define-class <json-feed> (<feed>))
|
||
|
(define-class <new-york-times-feed> (<rss-feed>))
|
||
|
(define-class <los-angeles-times-feed> (<rss-feed>))
|
||
|
(define-class <san-francisco-chronicle-feed> (<rss-feed>))
|
||
|
(define-class <wall-street-journal-feed> (<rss-feed>))
|
||
|
(define-class <hacker-news-feed> (<json-feed>))
|
||
|
(define-class <fox-news-feed> (<rss-feed>))
|
||
|
(define-class <christianity-today-feed> (<rss-feed>))
|
||
|
(define-class <techcrunch-feed> (<rss-feed>))
|
||
|
(define-class <engadget-feed> (<rss-feed>))
|
||
|
(define-class <npr-feed> (<rss-feed>))
|
||
|
(define-class <common-dreams-feed> (<rss-feed>))
|
||
|
(define-class <raw-story-feed> (<rss-feed>))
|
||
|
(define-class <reddit-feed> (<atom-feed>))
|
||
|
|
||
|
(define-class <chronological-order-ranking-feed> (<feed>))
|
||
|
(define-class <rss-chronological-order-ranking-feed>
|
||
|
(<rss-feed> <chronological-order-ranking-feed>))
|
||
|
(define-class <atom-chronological-order-ranking-feed>
|
||
|
(<atom-feed> <chronological-order-ranking-feed>))
|
||
|
|
||
|
(define-method (feed-resource-root-filepath
|
||
|
(feed <feed>) (feed-resource <feed-resource>))
|
||
|
(conc "feed-" (feed-resource-content-type feed-resource) "/"
|
||
|
(string-intersperse (string-split (feed-name feed) "/") "-") "/"))
|
||
|
|
||
|
(define-method (feed-resource-filepath (feed <feed>) (resource <uri-feed-resource>))
|
||
|
(conc (feed-resource-root-filepath feed resource)
|
||
|
(md5-hash (feed-resource-uri resource)) "."
|
||
|
(feed-resource-content-type resource)))
|
||
|
|
||
|
(define-method (fetch-feed-resource (resource <uri-feed-resource>))
|
||
|
(with-input-from-request (feed-resource-uri resource) #f read-string))
|
||
|
|
||
|
(define-method (save-feed-resource-content (feed <feed>) (resource <uri-feed-resource>))
|
||
|
(create-directory (feed-resource-root-filepath feed resource))
|
||
|
(with-output-to-file (feed-resource-filepath feed resource)
|
||
|
(lambda ()
|
||
|
(write (fetch-feed-resource resource)))))
|
||
|
|
||
|
(define-method (fetch-feed-and-write-to-file (feed <feed>))
|
||
|
(for-each
|
||
|
(lambda (resource)
|
||
|
(log-for (info) (conc "fetching: " (feed-name feed) " "
|
||
|
(feed-resource-tags resource)))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(save-feed-resource-content feed resource)))
|
||
|
(feed-resources feed)))
|
||
|
|
||
|
(define-method (fetch-feed-and-write-to-file (feed <hacker-news-feed>))
|
||
|
(let ((ids (fetch-hacker-news-top-stories-ids)))
|
||
|
(write-hacker-news-items-to-file ids "top-stories-ids.json")
|
||
|
(write-hacker-news-items-to-file (fetch-hacker-news-items ids) "top-stories.json")))
|
||
|
|
||
|
(define *mutex* (make-mutex 'foo))
|
||
|
(define-method (read-feed-resource-from-file
|
||
|
(feed <feed>) (resource <uri-feed-resource>))
|
||
|
(with-input-from-file (feed-resource-filepath feed resource) read))
|
||
|
|
||
|
(define (feed-xml->rss-items xml)
|
||
|
(with-input-from-string xml
|
||
|
(lambda ()
|
||
|
(rss:feed-items (rss:read (current-input-port))))))
|
||
|
|
||
|
(define (feed-xml->atom-items xml)
|
||
|
(with-input-from-string xml
|
||
|
(lambda ()
|
||
|
(feed-entries (read-atom-feed (current-input-port))))))
|
||
|
|
||
|
(define-method (feed-resource-items (feed <atom-feed>) (resource <feed-resource>))
|
||
|
(feed-xml->atom-items (read-feed-resource-from-file feed resource)))
|
||
|
|
||
|
(define-method (feed-resource-items (feed <rss-feed>) (resource <feed-resource>))
|
||
|
(feed-xml->rss-items (read-feed-resource-from-file feed resource)))
|
||
|
|
||
|
(define-method (feed-resource-items
|
||
|
(feed <hacker-news-feed>) (resource <hacker-news-feed-resource>))
|
||
|
(hacker-news-json-items->feed-items "top-stories.json"))
|
||
|
|
||
|
(define (handle-possible-ssax-parser-error exn _chain tags source)
|
||
|
(let ((chain (drop-right _chain 1)))
|
||
|
(if (and ((condition-predicate 'ssax) exn)
|
||
|
((condition-predicate 'parser-error) exn))
|
||
|
(begin
|
||
|
(log-for (warn) (conc "ssax parser error: "
|
||
|
((condition-property-accessor 'exn 'message) exn)
|
||
|
" for: " source " - " tags))
|
||
|
'())
|
||
|
(log-error exn (with-output-to-string
|
||
|
(lambda ()
|
||
|
(newline) (newline)
|
||
|
(pp (condition->list exn))
|
||
|
(newline) (newline)))
|
||
|
'() chain))))
|
||
|
|
||
|
(define-method (feed-resource-feed-items (feed <rss-feed>) (resource <feed-resource>))
|
||
|
(map
|
||
|
(lambda (i) (make-feed-item i (feed-name feed) (feed-resource-tags resource) 'rss))
|
||
|
(handle-exceptions
|
||
|
exn
|
||
|
(handle-possible-ssax-parser-error
|
||
|
exn (get-call-chain) (feed-resource-tags resource) (feed-name feed))
|
||
|
(dynamic-wind
|
||
|
(lambda () (mutex-lock! *mutex*))
|
||
|
(lambda () (feed-xml->rss-items (read-feed-resource-from-file feed resource)))
|
||
|
(lambda () (mutex-unlock! *mutex*))))))
|
||
|
|
||
|
(define-method (feed-resource-feed-items (feed <atom-feed>) (resource <feed-resource>))
|
||
|
(map
|
||
|
(lambda (i) (make-feed-item i (feed-name feed) (feed-resource-tags resource) 'atom))
|
||
|
(handle-exceptions
|
||
|
exn
|
||
|
(handle-possible-ssax-parser-error
|
||
|
exn (get-call-chain) (feed-resource-tags resource) (feed-name feed))
|
||
|
(dynamic-wind
|
||
|
(lambda () (mutex-lock! *mutex*))
|
||
|
(lambda () (feed-xml->atom-items (read-feed-resource-from-file feed resource)))
|
||
|
(lambda () (mutex-unlock! *mutex*))))))
|
||
|
|
||
|
(define-method (feed-resource-feed-items
|
||
|
(feed <hacker-news-feed>) (resource <hacker-news-feed-resource>))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(feed-resource-items feed resource)))
|
||
|
|
||
|
(define (fetch-and-save-image uri filepath)
|
||
|
(if (uri-reference uri)
|
||
|
(condition-case
|
||
|
(call-with-input-request*
|
||
|
uri
|
||
|
#f
|
||
|
(lambda (p response)
|
||
|
(with-output-to-file filepath
|
||
|
(lambda ()
|
||
|
;; In CHICKEN strings are the same as binary and the
|
||
|
;; most efficient way to do things.
|
||
|
(let ((len (header-value 'content-length
|
||
|
(response-headers response))))
|
||
|
(write-string (read-string len p) len))))))
|
||
|
(exn (exn client-error)
|
||
|
(log-error exn ((condition-property-accessor 'exn 'message "nill") exn)))
|
||
|
(exn () (log-error exn)))
|
||
|
(log-for (warn) (conc "unable to get a URI reference from " uri))))
|
||
|
|
||
|
(define (save-feed-images-to-files feed-items)
|
||
|
(for-each
|
||
|
(lambda (item)
|
||
|
(for-each
|
||
|
(lambda (media-item)
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(fetch-and-save-image (media-item-url media-item)
|
||
|
(media-item-filesystem-path media-item))))
|
||
|
(filter (lambda (i) (media-item-url i)) (feed-item-medias item))))
|
||
|
feed-items))
|
||
|
|
||
|
(define-method (save-feed-resource-images (feed <feed>) (resource <feed-resource>))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(save-feed-images-to-files
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(log-for (info) (conc "saving images from: " (feed-name feed) " "
|
||
|
(feed-resource-tags resource)))
|
||
|
(filter (lambda (item)
|
||
|
(not (article-id-by-guid conn (feed-item-guid item))))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn '())
|
||
|
(feed-resource-feed-items feed resource))))))))
|
||
|
|
||
|
(define-method (save-feed-images (feed <feed>))
|
||
|
(for-each
|
||
|
(lambda (resource)
|
||
|
(save-feed-resource-images feed resource))
|
||
|
(feed-resources feed)))
|
||
|
|
||
|
;; hacker news
|
||
|
|
||
|
(define array-as-list-parser
|
||
|
(cons 'array (lambda (x) x)))
|
||
|
|
||
|
(json-parsers (cons array-as-list-parser (json-parsers)))
|
||
|
|
||
|
(define (fetch-from-hacker-news uri)
|
||
|
(with-input-from-request uri #f read-json))
|
||
|
|
||
|
(define (fetch-hacker-news-top-stories-ids #!key (limit 30))
|
||
|
(log-for (info) "fetching: Hacker News top stories")
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn '())
|
||
|
(let ((r (fetch-from-hacker-news
|
||
|
(conc *hacker-news-base-uri* *hacker-news-top-stories-uri*
|
||
|
*hacker-news-uri-suffix*))))
|
||
|
(if limit (take r limit) r))))
|
||
|
|
||
|
(define (fetch-hacker-news-item id)
|
||
|
(fetch-from-hacker-news
|
||
|
(conc *hacker-news-base-uri* *hacker-news-item-uri* id *hacker-news-uri-suffix*)))
|
||
|
|
||
|
(define (hacker-news-root-filepath)
|
||
|
"feed-json/hacker-news/")
|
||
|
|
||
|
(define (fetch-hacker-news-items ids)
|
||
|
(let* ((mutex (make-mutex))
|
||
|
(items '())
|
||
|
(threads (map (lambda (id)
|
||
|
(make-thread
|
||
|
(lambda ()
|
||
|
(log-for (info) (conc "fetching: Hacker News item " id))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(let ((r (fetch-hacker-news-item id)))
|
||
|
(mutex-lock! mutex)
|
||
|
(set! items (cons r items))
|
||
|
(mutex-unlock! mutex))))))
|
||
|
ids)))
|
||
|
(for-each (lambda (thread) (thread-start! thread)) threads)
|
||
|
(for-each (lambda (thread) (thread-join! thread)) threads)
|
||
|
items))
|
||
|
|
||
|
(define (write-hacker-news-items-to-file items path)
|
||
|
(create-directory (hacker-news-root-filepath) #t)
|
||
|
(with-output-to-file (conc (hacker-news-root-filepath) path)
|
||
|
(lambda () (write items))))
|
||
|
|
||
|
(define (read-hacker-news-items-from-file path)
|
||
|
(with-input-from-file (conc (hacker-news-root-filepath) path) read))
|
||
|
|
||
|
(define (hacker-news-json-items->feed-items path)
|
||
|
(map (lambda (item) (make-feed-item item "Hacker News" '(hacker-news) 'hacker-news))
|
||
|
(read-hacker-news-items-from-file path)))
|
||
|
|
||
|
|
||
|
(define (make-feed src)
|
||
|
(define (make-feed* feed-type resource-type src)
|
||
|
(make feed-type
|
||
|
'resources (map
|
||
|
(lambda (tags/uri)
|
||
|
(make resource-type
|
||
|
'tags (car tags/uri)
|
||
|
'uri (cdr tags/uri)))
|
||
|
(source-uris src))
|
||
|
'name (source-name src)))
|
||
|
(cond ((equal? (source-name src) "New York Times")
|
||
|
(make-feed* <new-york-times-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Los Angeles Times")
|
||
|
(make-feed* <los-angeles-times-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Wall Street Journal")
|
||
|
(make-feed* <wall-street-journal-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "San Francisco Chronicle")
|
||
|
(make-feed* <san-francisco-chronicle-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Hacker News")
|
||
|
(make <hacker-news-feed> 'resources
|
||
|
`(,(make <hacker-news-feed-resource>))
|
||
|
'name "Hacker News"))
|
||
|
((equal? (source-name src) "Fox News")
|
||
|
(make-feed* <fox-news-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Christianity Today")
|
||
|
(make-feed* <christianity-today-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "TechCrunch")
|
||
|
(make-feed* <techcrunch-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Engadget")
|
||
|
(make-feed* <engadget-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "NPR")
|
||
|
(make-feed* <npr-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Common Dreams")
|
||
|
(make-feed* <common-dreams-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Raw Story")
|
||
|
(make-feed* <raw-story-feed> <xml-uri-feed-resource> src))
|
||
|
((equal? (source-name src) "Reddit")
|
||
|
(make-feed* <reddit-feed> <xml-uri-feed-resource> src))
|
||
|
((eq? (source-type src) 'rss-chronological)
|
||
|
(make-feed*
|
||
|
<rss-chronological-order-ranking-feed> <xml-uri-feed-resource> src))
|
||
|
((eq? (source-type src) 'atom-chronological)
|
||
|
(make-feed*
|
||
|
<atom-chronological-order-ranking-feed> <xml-uri-feed-resource> src))
|
||
|
((eq? (source-type src) 'rss)
|
||
|
(make-feed* <rss-feed> <xml-uri-feed-resource> src))
|
||
|
((eq? (source-type src) 'atom)
|
||
|
(make-feed* <atom-feed> <xml-uri-feed-resource> src))
|
||
|
(else (error "source type not supported"))))
|
||
|
|
||
|
;;; RANK ;;;
|
||
|
(define-method (rank-feed (feed <new-york-times-feed>))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(let ((name (feed-name feed)))
|
||
|
;; demote current elevated ranks
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
;; order matters. primary outranks secondary.
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "ny-times-rank-secondary" 2 40)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "ny-times-rank-primary" 1 3)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <los-angeles-times-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(let ((sxml
|
||
|
(cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request "http://latimes.com" #f read-string)
|
||
|
html->sxml)))
|
||
|
(uri-root "http://www.latimes.com"))
|
||
|
;; secondary articles
|
||
|
(let ((secondary-uri-list
|
||
|
((sxpath '(// (section (@ class
|
||
|
(equal?
|
||
|
"trb_outfit_group trb_outfit_section")))
|
||
|
// ((@ data-content-url *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? secondary-uri-list)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map (lambda (u) (last (utf8:string-split u "/"))) secondary-uri-list)
|
||
|
2)))))
|
||
|
;; featured article
|
||
|
(let ((featured-uri-list
|
||
|
((sxpath '(// (section (@ data-outfit-type (equal? "centerpiece")))
|
||
|
(section
|
||
|
(@ class
|
||
|
(equal? "trb_outfit_primaryItem trb_outfit_section")))
|
||
|
((@ data-content-url *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? featured-uri-list)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
|
||
|
1))))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <wall-street-journal-feed>))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(let ((name (feed-name feed)))
|
||
|
;; demote current elevated ranks
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
;; order matters. primary outranks secondary.
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "wsj-rank-secondary" 2 40)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "wsj-rank-primary" 1 3)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <hacker-news-feed>))
|
||
|
(define (hn-guid id) (guid->guid-hash (conc "hacker-news-" id)))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(let ((items (read-hacker-news-items-from-file "top-stories-ids.json")))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-guids
|
||
|
conn 1 (map-in-order hn-guid (take items 3)))
|
||
|
(db:set-rank-by-guids
|
||
|
conn 2 (map-in-order hn-guid (take (drop items 3) 3))))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <san-francisco-chronicle-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(let* ((doc (cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request "http://sfgate.com" #f read-string)
|
||
|
html->sxml)))
|
||
|
(path-proc
|
||
|
(lambda (base)
|
||
|
(lambda (name)
|
||
|
((sxpath `(// (div (@ (equal? (class ,(string-append base " " name)))))
|
||
|
// a
|
||
|
((@ href *text*)))) doc)))))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
((sxpath '(// (div (@ class (equal? "ctpl-centerpiece")))
|
||
|
// a
|
||
|
((@ href *text*))))
|
||
|
doc)
|
||
|
1)))
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map
|
||
|
car
|
||
|
(group/key
|
||
|
identity
|
||
|
(flatten
|
||
|
(append
|
||
|
(map (path-proc "package")
|
||
|
'("news" "sports" "business" "food" "realestate" "entertainment"))
|
||
|
(map (path-proc "headline-list")
|
||
|
'("news" "business" "national" "living"))))))
|
||
|
2))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <chronological-order-ranking-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-date-difference
|
||
|
conn 2 (make-duration hours: 8) name 3 3)
|
||
|
(db:set-rank-by-date-difference
|
||
|
conn 1 (make-duration hours: 4) name 3 0)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <fox-news-feed>))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(let ((name (feed-name feed)))
|
||
|
;; demote current elevated ranks
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "fox-news-rank-primary" 1 3)
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "fox-news-rank-primary" 2 40 offset: 3)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <christianity-today-feed>))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(let ((name (feed-name feed)))
|
||
|
;; demote current elevated ranks
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "christianity-today-rank-primary" 1 3)
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "christianity-today-rank-primary" 2 40 offset: 3)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <techcrunch-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(let ((sxml
|
||
|
(cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request "http://techcrunch.com/" #f read-string)
|
||
|
html->sxml))))
|
||
|
;; secondary articles
|
||
|
(let ((secondary-uri-list
|
||
|
((sxpath '(// (div (@ class
|
||
|
(equal?
|
||
|
"island-main")))
|
||
|
// a
|
||
|
// ((@ href *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? secondary-uri-list)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map (lambda (u) (last (utf8:string-split u "/"))) secondary-uri-list)
|
||
|
2)))))
|
||
|
;; featured article
|
||
|
(let ((featured-uri-list
|
||
|
((sxpath '(// (div (@ class
|
||
|
(equal?
|
||
|
"island-secondary")))
|
||
|
// a
|
||
|
// ((@ href *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? featured-uri-list)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
|
||
|
1))))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <engadget-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
|
||
|
(let ((sxml
|
||
|
(cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request "http://www.engadget.com/" #f read-string)
|
||
|
html->sxml))))
|
||
|
;; featured articles
|
||
|
(let ((featured-uri-list
|
||
|
((sxpath '(// (li (@ class
|
||
|
(equal?
|
||
|
"lede-item post")))
|
||
|
// a
|
||
|
// ((@ href *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? featured-uri-list)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-partial-uri
|
||
|
conn name
|
||
|
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
|
||
|
1))))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <npr-feed>))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(let ((name (feed-name feed)))
|
||
|
;; demote current elevated ranks
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "npr-rank-primary" 1 3)
|
||
|
(db:set-rank-by-tag-association
|
||
|
conn name "npr-rank-primary" 2 40 offset: 3)))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <common-dreams-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
|
||
|
|
||
|
(let ((sxml
|
||
|
(cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request
|
||
|
"http://www.commondreams.org/" #f read-string)
|
||
|
html->sxml))))
|
||
|
(for-each
|
||
|
(match-lambda
|
||
|
((rank uris)
|
||
|
(for-each
|
||
|
(lambda (uri)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-uri
|
||
|
conn name
|
||
|
(string-append "http://www.commondreams.org" uri)
|
||
|
rank))))
|
||
|
uris)))
|
||
|
(map (match-lambda
|
||
|
((rank article-class)
|
||
|
`(,rank
|
||
|
,((sxpath `(// (div (@ class
|
||
|
,(lambda (x y)
|
||
|
(if (not (null? x))
|
||
|
(let ((class (cadar x)))
|
||
|
(string-contains class article-class))
|
||
|
#f))))
|
||
|
// (div (@ class
|
||
|
,(lambda (x y)
|
||
|
(if (not (null? x))
|
||
|
(let ((class (cadar x)))
|
||
|
(string-contains class "hp-title"))
|
||
|
#f))))
|
||
|
// (div (@ class (equal? "field-content")))
|
||
|
// a
|
||
|
// ((@ href *text*))))
|
||
|
sxml))))
|
||
|
'((1 "primary-news-article") (1 "secondary-news-article")
|
||
|
(2 "tertiary-news-article") (2 "remaining-news-article")))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <raw-story-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
|
||
|
(let ((sxml
|
||
|
(cdr
|
||
|
(with-input-from-string
|
||
|
(with-input-from-request "http://www.rawstory.com/" #f read-string)
|
||
|
html->sxml))))
|
||
|
|
||
|
(let ((uri-list
|
||
|
((sxpath `(// (div (@ class
|
||
|
,(lambda (x y)
|
||
|
(if (not (null? x))
|
||
|
(let ((class (cadar x)))
|
||
|
(string-contains class "blog-item-class"))
|
||
|
#f))))
|
||
|
// a
|
||
|
// ((@ href *text*))))
|
||
|
sxml)))
|
||
|
(unless (null? uri-list)
|
||
|
(for-each
|
||
|
(match-lambda
|
||
|
((rank uris)
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(for-each
|
||
|
(lambda (uri)
|
||
|
(db:set-rank-by-uri conn name uri rank))
|
||
|
uris)))))
|
||
|
`((1 ,(take uri-list 3))
|
||
|
(2 ,(drop uri-list 3)))))))))))
|
||
|
|
||
|
(define-method (rank-feed (feed <reddit-feed>))
|
||
|
(let ((name (feed-name feed)))
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
|
||
|
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
|
||
|
|
||
|
|
||
|
(let ((items (feed-resource-feed-items feed (first (feed-resources feed)))))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:set-rank-by-guids
|
||
|
conn 1 (map-in-order (compose guid->guid-hash feed-item-guid)
|
||
|
(take items 3)))
|
||
|
(db:set-rank-by-guids
|
||
|
conn 2 (map-in-order (compose guid->guid-hash feed-item-guid)
|
||
|
(take (drop items 3) 3))))))))))
|
||
|
|
||
|
|
||
|
;; TEXT TAGS ;;
|
||
|
(define (set-text-tags)
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(for-each
|
||
|
(lambda (tag/texts)
|
||
|
(with-transaction
|
||
|
conn
|
||
|
(lambda ()
|
||
|
(db:set-tags-by-text conn (cdr tag/texts) (car tag/texts)))))
|
||
|
'(("religion" "religion" "religious" "theology" "doctrine")
|
||
|
("bernie-sanders" "bernie sanders" "sanders bernie")
|
||
|
("hillary-clinton" "hillary clinton" "clinton hillary")
|
||
|
("jeb-bush" "jeb bush" "bush jeb")
|
||
|
("ted-cruz" "ted cruz" "cruz ted")
|
||
|
("rand-paul" "rand paul" "paul rand")
|
||
|
("donald-trump" "donald trump" "trump donald" "donald j trump")
|
||
|
("ben-carson" "ben carson" "carson ben")
|
||
|
("chris-christie" "christie chris")
|
||
|
("carly-fiorina" "carly fiorina" "fiorina carly")
|
||
|
("mike-huckabee" "mike huckabee" "huckabee mike")
|
||
|
("marco-rubio" "marco rubio" "rubio marco")
|
||
|
("president-2016"
|
||
|
"president 2016"
|
||
|
"2016 president"
|
||
|
"presidential 2016"
|
||
|
"2016 presidential"
|
||
|
"presidential campaign 2016"
|
||
|
"Presidential campaign 2016"
|
||
|
"presidential election 2016"
|
||
|
"Presidential election 2016"
|
||
|
"2016 presidential poll"
|
||
|
"presidential poll 2016"
|
||
|
"presidential candidate"
|
||
|
"Presidential candidate"
|
||
|
"bernie sanders"
|
||
|
"sanders bernie"
|
||
|
"hillary clinton"
|
||
|
"clinton hillary"
|
||
|
"jeb bush"
|
||
|
"bush jeb"
|
||
|
"ted cruz"
|
||
|
"cruz ted"
|
||
|
"rand paul"
|
||
|
"paul rand"
|
||
|
"donald trump"
|
||
|
"trump donald"
|
||
|
"donald j trump"
|
||
|
"ben carson"
|
||
|
"carson ben"
|
||
|
"christie chris"
|
||
|
"carly fiorina"
|
||
|
"fiorina carly"
|
||
|
"mike huckabee"
|
||
|
"huckabee mike"
|
||
|
"marco rubio"
|
||
|
"rubio marco")
|
||
|
("terrorism" "terrorism" "terrorist")
|
||
|
("isis" "isis" "isil" "islamic state" "islamic-state")
|
||
|
("climate-change" "climate change" "global warming" "greenhouse gas"
|
||
|
"fossile fuel" "fossile fuels" "clean energy" "cap and trade" "cap-and-trade"
|
||
|
"offshore drilling" "carbon tax" "renewables"))))))
|
||
|
|
||
|
;; IMPORT ;;
|
||
|
(define (get-image-dimensions path)
|
||
|
(map
|
||
|
string->number
|
||
|
(string-split
|
||
|
(let ((s (with-input-from-pipe
|
||
|
(string-append "identify -format \"%w,%h\" " path) read-string)))
|
||
|
(if (string-null? s)
|
||
|
(signal (make-property-condition 'invalid-image-format))
|
||
|
s))
|
||
|
",")))
|
||
|
|
||
|
(define (date-in-future? date)
|
||
|
;; There has to be a better way to do this...
|
||
|
;; just using the srfi-19 date comparison procedures doesn't work
|
||
|
;; since it can't seem to handle dates that have different timezones.
|
||
|
;; (> (time->seconds (date->time date)) (time->seconds (date->time (current-date))))
|
||
|
(> (time-compare (date->time date) (date->time (current-date))) -1))
|
||
|
|
||
|
(define (save-feed-items-to-database source tag items)
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(for-each
|
||
|
(lambda (a)
|
||
|
(handle-exceptions
|
||
|
exn
|
||
|
(let* ((_chain (get-call-chain))
|
||
|
(chain (drop-right _chain 2)))
|
||
|
(cond ((and ((condition-property-accessor 'query 'error-code "nill") exn)
|
||
|
(string=?
|
||
|
((condition-property-accessor 'query 'error-code "nill") exn)
|
||
|
"23505")) ; unique violation, it should be ok...
|
||
|
(log-for (warn) (with-output-to-string
|
||
|
(lambda ()
|
||
|
(print "unique violation")
|
||
|
(print-object a)))))
|
||
|
((skip-log-condition-predicate exn) (void))
|
||
|
(else
|
||
|
(log-error exn (with-output-to-string
|
||
|
(lambda () (print-object a))) #f chain))))
|
||
|
(let* ((article-id (article-id-by-guid conn (feed-item-guid a)))
|
||
|
(images (if article-id
|
||
|
'()
|
||
|
(let ((first-item #t))
|
||
|
(filter
|
||
|
identity
|
||
|
(map (lambda (media-item)
|
||
|
(handle-exceptions exn
|
||
|
(let* ((_chain (get-call-chain))
|
||
|
(chain (drop-right _chain 1)))
|
||
|
(if ((condition-predicate
|
||
|
'invalid-image-format) exn)
|
||
|
(begin
|
||
|
(log-for (warn)
|
||
|
"invalid image format")
|
||
|
#f)
|
||
|
(log-error
|
||
|
exn
|
||
|
(with-output-to-string
|
||
|
(lambda ()
|
||
|
(print-object a)))
|
||
|
#f)))
|
||
|
(let* ((url (media-item-url media-item))
|
||
|
(path
|
||
|
(media-item-relative-path media-item))
|
||
|
(full-path
|
||
|
(media-item-filesystem-path media-item))
|
||
|
(file-exists (file-exists? full-path))
|
||
|
(dimen (if file-exists
|
||
|
(get-image-dimensions full-path)
|
||
|
#f)))
|
||
|
(if file-exists
|
||
|
`((cover
|
||
|
. ,(if first-item
|
||
|
(begin (set! first-item #f) #t)
|
||
|
#f))
|
||
|
(original-path . ,path)
|
||
|
(original-width . ,(car dimen))
|
||
|
(original-height . ,(cadr dimen)))
|
||
|
(begin (log-for
|
||
|
(warn)
|
||
|
(conc "image does not exist: "
|
||
|
full-path))
|
||
|
#f)))))
|
||
|
(filter (lambda (i) (media-item-url i))
|
||
|
(feed-item-medias a))))))))
|
||
|
(if (date-in-future? (feed-item-date a))
|
||
|
(log-for (warn) (with-output-to-string
|
||
|
(lambda ()
|
||
|
(print "date in the future: "
|
||
|
(exn* (feed-item-date a)))
|
||
|
(print "tags: " (exn* (feed-item-tags a)))
|
||
|
(print "link: " (exn* (feed-item-link a))))))
|
||
|
(with-transaction conn
|
||
|
(lambda ()
|
||
|
(db:upsert-article!
|
||
|
conn
|
||
|
title: (feed-item-title a)
|
||
|
author: (feed-item-author a)
|
||
|
authors: (feed-item-authors a)
|
||
|
;; TODO handle errors
|
||
|
date: (feed-item-date a)
|
||
|
;; TODO based on feed
|
||
|
tags: (map ->string (feed-item-tags a))
|
||
|
;; TODO error handle
|
||
|
description: (feed-item-description a)
|
||
|
content: ""
|
||
|
guid-hash: (guid->guid-hash (feed-item-guid a))
|
||
|
article-id: article-id
|
||
|
source: (feed-item-source a)
|
||
|
uri: (feed-item-link a)
|
||
|
images: images
|
||
|
insert-tag-if-not-exists: #t)))))))
|
||
|
(filter (lambda (item)
|
||
|
(unless (feed-item-title item)
|
||
|
(log-for (info) (conc "missing item title: " source tag)))
|
||
|
(feed-item-title item)) items)))))
|
||
|
|
||
|
(define-method (import-feed-resource (feed <feed>) (resource <feed-resource>))
|
||
|
(handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(save-feed-items-to-database
|
||
|
(feed-name feed) (feed-resource-tags resource)
|
||
|
(feed-resource-feed-items feed resource))))
|
||
|
|
||
|
(define-method (import-feed (feed <feed>))
|
||
|
(for-each
|
||
|
(lambda (resource)
|
||
|
(log-for (info) (conc "importing feed: " (feed-name feed) " - "
|
||
|
(feed-resource-tags resource)))
|
||
|
(import-feed-resource feed resource))
|
||
|
(feed-resources feed)))
|
||
|
|
||
|
(define opts
|
||
|
(list (args:make-option (f fetch-feeds) #:none "fetch and save feeds")
|
||
|
(args:make-option (m fetch-media) #:none "fetch and save media objects")
|
||
|
(args:make-option (i import) #:none "import into database")
|
||
|
(args:make-option (u update-article-view) #:none "update article view")
|
||
|
(args:make-option (r rank) #:none "rank articles")
|
||
|
(args:make-option (t set-text-tags) #:none "set text tags")
|
||
|
(args:make-option (d debug-level) (optional: "LEVEL")
|
||
|
"debug level [default: 1]")
|
||
|
(args:make-option (h help) #:none "display usage" (usage))))
|
||
|
|
||
|
(define (usage)
|
||
|
(with-output-to-port (current-error-port)
|
||
|
(lambda ()
|
||
|
(print "Usage: " (car (argv)) " [options...]")
|
||
|
(newline)
|
||
|
(print (args:usage opts))))
|
||
|
(exit 1))
|
||
|
|
||
|
(define (source-name source) (car source))
|
||
|
(define (source-type source) (cadr source))
|
||
|
(define (source-uris source) (cddr source))
|
||
|
|
||
|
(receive
|
||
|
(options operands) (args:parse (command-line-arguments) opts)
|
||
|
|
||
|
(let ((debug-level (or (and (alist-ref 'debug-level options)
|
||
|
(string->number (alist-ref 'debug-level options)))
|
||
|
1)))
|
||
|
(when (> debug-level 0)
|
||
|
(start-sender error-sender
|
||
|
(port-sender (current-error-port))
|
||
|
(output (<standard-out message))
|
||
|
(category error))
|
||
|
(when (>= debug-level 2)
|
||
|
(start-sender warn-sender
|
||
|
(port-sender (current-output-port))
|
||
|
(output (<standard-out message))
|
||
|
(category warn)))
|
||
|
(when (>= debug-level 3)
|
||
|
(start-sender info-sender
|
||
|
(port-sender (current-output-port))
|
||
|
(output (<standard-out message))
|
||
|
(category info)))
|
||
|
(when (= debug-level 4)
|
||
|
(start-sender debug-sender
|
||
|
(port-sender (current-output-port))
|
||
|
(output (<standard-out message))
|
||
|
(category debug)))))
|
||
|
|
||
|
(let* ((force (null? (filter (lambda (a)
|
||
|
(and (not (equal? (car a) 'd))
|
||
|
(not (equal? (car a) 'debug-level))))
|
||
|
options)))
|
||
|
(tasks (map (lambda (src) `(,(source-name src) . ,(make-queue))) *sources*))
|
||
|
(task-add!
|
||
|
(lambda (src proc)
|
||
|
(let ((src-queue (alist-ref (source-name src) tasks)))
|
||
|
(when src-queue
|
||
|
(queue-add! src-queue proc))))))
|
||
|
|
||
|
(print "options:")
|
||
|
(when (or force (alist-ref 'f options))
|
||
|
(print " fetch feeds")
|
||
|
(for-each
|
||
|
(lambda (src)
|
||
|
(task-add! src
|
||
|
(lambda ()
|
||
|
(fetch-feed-and-write-to-file (make-feed src)))))
|
||
|
*sources*))
|
||
|
|
||
|
(when (or force (alist-ref 'm options))
|
||
|
(print " fetch media")
|
||
|
(for-each
|
||
|
(lambda (src)
|
||
|
(task-add!
|
||
|
src
|
||
|
(lambda ()
|
||
|
(save-feed-images (make-feed src)))))
|
||
|
*sources*))
|
||
|
|
||
|
(when (or force (alist-ref 'i options))
|
||
|
(print " import articles")
|
||
|
(for-each
|
||
|
(lambda (src)
|
||
|
(task-add!
|
||
|
src
|
||
|
(lambda ()
|
||
|
(import-feed (make-feed src)))))
|
||
|
*sources*))
|
||
|
|
||
|
(when (or force (alist-ref 'r options))
|
||
|
(print " rank articles")
|
||
|
(for-each
|
||
|
(lambda (src)
|
||
|
(task-add! src (lambda () (handle-exceptions exn
|
||
|
(log-error exn)
|
||
|
(let ((feed (make-feed src)))
|
||
|
(log-for (info) (conc "ranking " (feed-name feed)))
|
||
|
(rank-feed feed))))))
|
||
|
*sources*))
|
||
|
|
||
|
(when (or force (alist-ref 't options))
|
||
|
(print " set text tags"))
|
||
|
|
||
|
(when (or force (alist-ref 'u options))
|
||
|
(print " refresh article view"))
|
||
|
(newline)
|
||
|
|
||
|
(log-for (info) "running tasks")
|
||
|
(let ((threads (map
|
||
|
(lambda (task)
|
||
|
(make-thread
|
||
|
(lambda ()
|
||
|
(for-each
|
||
|
(lambda (e)
|
||
|
(e))
|
||
|
(queue->list (cdr task))))))
|
||
|
tasks)))
|
||
|
(for-each (cut thread-start! <>) threads)
|
||
|
(for-each (cut thread-join! <>) threads))
|
||
|
(log-for (info) "completed tasks")
|
||
|
|
||
|
(when (or force (alist-ref 't options))
|
||
|
(log-for (info) "setting text tags")
|
||
|
(set-text-tags))
|
||
|
|
||
|
(when (or force (alist-ref 'u options))
|
||
|
(log-for (info) "refreshing views...")
|
||
|
(with-db
|
||
|
(lambda (conn)
|
||
|
(db:refresh-article-view! conn)
|
||
|
(db:refresh-tags-and-sources-view! conn))))
|
||
|
|
||
|
(log-for (info) "done")))
|