You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1713 lines
70 KiB
Scheme

(import chicken scheme srfi-1 srfi-13 srfi-14)
(use rss html-parser srfi-19 http-client postgresql md5 message-digest
anaphora uri-common general-utils matchable posix coops coops-utils
args irregex sxml-transforms intarweb sxpath message-digest-bv
srfi-18 log5scm numbers medea
(prefix utf8 utf8:) (prefix utf8-srfi-13 utf8:)
(only sort-combinators group/key) atom rfc3339)
(client-software
'(("Mozilla/5.0" "44.0" "(X11; Linux x86_64; rv:44.0) Gecko/20100101")))
(define *program-config* (with-input-from-file "news.conf" read))
;; (use debug-utils box)
;; (breakpoint-accessor _)
(define-category debug)
(define-category info)
(define-category warn)
(define-category error)
(define-category debug+ (or debug info warn error))
(define-category info+ (or info warn error))
(define-category warn+ (or warn error))
(define-category fetch-feed)
(define-category fetch-media)
(define-category import)
(define-category rank)
(define-category update-article-view)
(define-output <standard-out
(conc "[" (current-category) "] "
"[" (date->string (current-date) "~D ~T") "]"
(if (current-context) (conc (current-context) " - ") "")
" - " (current-message)))
;; Making this a macro removes one spurious stack trace relative to a
;; function element.
(define-syntax log-error
(syntax-rules ()
((_ exn)
(log-error exn #f #f #f))
((_ exn str)
(log-error exn str #f #f))
((_ exn str default)
(log-error exn str default #f))
((_ exn str default call-chain)
(let* ((_chain (if call-chain call-chain (get-call-chain)))
(chain (with-output-to-string (lambda () (newline) (pp (drop-right _chain 2)))))
(cpa condition-property-accessor)
(exn-message (cpa 'exn 'message "(no message)"))
(exn-location (cpa 'exn 'location "*ERROR LOCATION UNKNOWN*"))
(exn-arguments (cpa 'exn 'arguments '()))
(exn? (condition-predicate 'exn)))
(log-for (error)
(with-output-to-string
(lambda ()
(print (exn-location exn))
(print (exn-message exn))
(print (exn-arguments exn))
(print chain)
(when str (print str)))))
default))))
;; if exception occurs ignore and return a indicator.
(define-syntax exn*
(syntax-rules ()
((_ exp)
(handle-exceptions
exn
'exception-occured
exp))))
(define (md5-hash s) (message-digest-string (md5-primitive) s))
(define skip-log-condition (make-property-condition 'skip-log))
(define skip-log-condition-predicate (condition-predicate 'skip-log))
(define (string-atom-date->date s)
(seconds->date
(rfc3339->seconds
(time->rfc3339
(rfc3339->local-time
(string->rfc3339 s))))))
;; A normalized string->date conversion to help out SRFI-19.
(define (string->date* s)
;; SRFI-19 can't handle string timezones so try to convert them to
;; numeric timezones.
(handle-exceptions
exn
(begin (log-for (warn) (conc "broken string->date* for: " s))
(signal skip-log-condition))
(let* ((d* (let ((possible-time-zone (string-take-right s 3))
(timezone-maps '(("GMT" . "-0800")
("PDT" . "-0000")
("PST" . "-0000")
(" UT" . " -0800")
("EDT" . "-0300")
("EST" . "-0300")
("CST" . "-0200")))
(timezone-offset-map '(("+0000" . "-0800")))
(possible-timezone-offset (string-take-right s 5))
(possible-non-english-day (string-take s 3))
;; Dom, Seg, Ter, Qua, Qui, Sex, Sab
(non-english-day-maps '(("Ter" . "Tue")
("Qui" . "Thu")))
(s* (string-copy s)))
;; MUST COME FIRST
(set! s*
(aif (assoc possible-timezone-offset timezone-offset-map equal?)
(++ (string-take s* (- (string-length s*) 5)) (cdr it))
s*))
(set! s*
(aif (assoc possible-time-zone timezone-maps equal?)
(++ (string-take s* (- (string-length s*) 3)) (cdr it))
s*))
(set! s*
(aif (assoc possible-non-english-day non-english-day-maps equal?)
(++ (cdr it) (string-drop s* 3))
s*))
s*))
(d (handle-exceptions exn
(string->date d* "~a, ~d ~B ~Y ~H:~M:~S ~z")
(string->date d* "~a, ~d ~b ~Y ~H:~M:~S ~z"))))
(date-add-duration d (make-duration hours: (if (= (date-zone-offset d) 0)
0
(fx/ (date-zone-offset d) 3600)))))))
;; Make the current date compatible with the output of the
;; string->date* function.
(define (current-date*)
(let ((date (current-date)))
(string->date (conc (date-day date) " " (date-month date) " "
(date-year date) " -0000")
"~e ~m ~Y ~z")))
(define (strip-html html)
(filter (cut irregex-search '(: (* whitespace) (+ (or alnum punct))) <>)
(flatten
(pre-post-order
(html->sxml html)
`((@ . ,(lambda x '()))
(*default* . ,(lambda (tag . body) body))
(*text* . ,(lambda (tag string) (or-default string? string ""))))))))
(define (strip-tags-from-sxml sxml)
(utf8:string-trim-both
(irregex-replace/all
"\n\t"
(apply
conc
(flatten
(pre-post-order*
sxml
`((span . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
(p . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
(em . ,(lambda (tag body) body))
(a . ,(lambda (tag body) (if (> (length body) 1) (cdr body) "")))
(img . ,(lambda (tag body) '()))
(*text* . ,(lambda (tag str) str))
(*default* . ,(lambda (tag str) str))
(*TOP* . ,(lambda (tag str) str))))))
" ")))
(define-class <media-item> ()
((media-item accessor: media-item-media-item)
(url)
(type)
(file-extension)))
(define-class <feed-item> ()
((feed-item accessor: feed-item-feed-item)
(title)
(guid)
(link)
(source accessor: feed-item-source)
(medias accessor: feed-item-medias)
(description)
(author)
(authors)
(date)
(tags accessor: feed-item-tags)))
(define-class <rss-feed-item> (<feed-item>))
(define-class <atom-feed-item> (<feed-item>))
(define-class <los-angeles-times-item> (<rss-feed-item>)
((source "Los Angeles Times")))
(define-class <new-york-times-item> (<rss-feed-item>)
((source "New York Times")))
(define-class <san-francisco-chronicle-item> (<rss-feed-item>)
((source "San Francisco Chronicle")))
(define-class <wall-street-journal-item> (<rss-feed-item>)
((source "San Francisco Chronicle")))
(define-class <hacker-news-item> (<rss-feed-item>)
((source "Hacker News")))
(define-class <lambda-the-ultimate-item> (<rss-feed-item>)
((source "Lambda the Ultimate")))
(define-class <planet-scheme-item> (<atom-feed-item>)
((source "Planet Scheme")))
(define-class <planet-haskell-item> (<atom-feed-item>)
((source "Planet Haskell")))
(define-class <fox-news-item> (<rss-feed-item>)
((source "Fox News")))
(define-class <xkcd-item> (<atom-feed-item>)
((source "XKCD")))
(define-class <christianity-today-item> (<rss-feed-item>)
((source "Christianity Today")))
(define-class <techcrunch-item> (<rss-feed-item>)
((source "TechCrunch")))
(define-class <engadget-item> (<rss-feed-item>)
((source "Engadget")))
(define-class <npr-item> (<rss-feed-item>)
((source "NPR")))
(define-class <eff-item> (<rss-feed-item>)
((source "EFF")))
(define-class <common-dreams-item> (<rss-feed-item>)
((source "Common Dreams")))
(define-class <raw-story-item> (<rss-feed-item>)
((source "Raw Story")))
(define-class <reddit-item> (<atom-feed-item>)
((source "Reddit")))
(define-class <think-progress-item> (<rss-feed-item>)
((source "Think Progress")))
(define-class <los-angeles-times-media-item> (<media-item>) ())
(define-class <new-york-times-media-item> (<media-item>) ())
(define-class <san-francisco-chronicle-media-item> (<media-item>) ())
(define-class <wall-street-journal-media-item> (<media-item>) ())
(define-class <hacker-news-media-item> (<media-item>) ())
(define-class <lambda-the-ultimate-media-item> (<media-item>) ())
(define-class <xkcd-media-item> (<media-item>) ())
(define-class <url-media-item> (<media-item>) ())
(define-class <png-media-item> (<media-item>) ())
(define-class <url-png-media-item> (<url-media-item> <png-media-item>) ())
(define-method (feed-item-title (feed-item <rss-feed-item>))
(rss:item-title (feed-item-feed-item feed-item)))
(define-method (feed-item-title (feed-item <atom-feed-item>))
(match (entry-title (feed-item-feed-item feed-item))
((atom:title ('@ ('type "text")) x) x)
((atom:title x) x)
((atom:title ('@ ('xml:lang "en-US")) x) x)
((atom:title ('@ ('xml:lang "en")) x) x)
(x (error (conc "title type not supported: "
(entry-title (feed-item-feed-item feed-item)))))))
(define-method (feed-item-title (feed-item <hacker-news-item>))
(alist-ref 'title (feed-item-feed-item feed-item)))
(define-method (feed-item-link (feed-item <rss-feed-item>))
(rss:item-link (feed-item-feed-item feed-item)))
(define-method (feed-item-link (feed-item <atom-feed-item>))
(let ((links (entry-links (feed-item-feed-item feed-item))))
(if (null? links)
(error "feed item has no link")
(let rec ((sxml (car links)))
(match sxml
(('atom:link x) (rec x))
(('@ x ...)
(let ((href-list (filter identity (map rec x))))
(if (null? href-list)
(error (conc "link not found: "
(entry-links (feed-item-feed-item feed-item))))
(car href-list))))
(('href x) x)
(_ #f))))))
(define-method (feed-item-link (feed-item <hacker-news-item>))
(conc "https://news.ycombinator.com/item?id="
(alist-ref 'id (feed-item-feed-item feed-item))))
(define-method (feed-item-guid (feed-item <rss-feed-item>))
(alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item))))
(define-method (feed-item-guid (feed-item <atom-feed-item>))
(entry-id (feed-item-feed-item feed-item)))
(define-method (feed-item-guid (feed-item <los-angeles-times-item>))
;; TODO figure out a better GUID that won't clash with future
;; articles with the same title or articles with the same title from
;; a different source.
(last (utf8:string-split (feed-item-link feed-item) "/")))
(define-method (feed-item-guid (feed-item <new-york-times-item>))
(or (alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item)))
(rss:item-link (feed-item-feed-item feed-item))))
(define-method (feed-item-guid (feed-item <hacker-news-item>))
(conc "hacker-news-" (alist-ref 'id (feed-item-feed-item feed-item))))
(define-method (feed-item-guid (feed-item <lambda-the-ultimate-item>))
(rss:item-link (feed-item-feed-item feed-item)))
(define-method (feed-item-guid (feed-item <fox-news-item>))
(or (alist-ref 'guid (rss:item-attributes (feed-item-feed-item feed-item)))
(rss:item-link (feed-item-feed-item feed-item))))
(define-method (feed-item-guid (feed-item <christianity-today-item>))
;; TODO figure out a better GUID that won't clash with future
;; articles with the same title or articles with the same title from
;; a different source.
(last (utf8:string-split (feed-item-link feed-item) "/")))
(define-method (feed-item-guid (feed-item <npr-item>))
(apply
conc
(uri-path
(uri-reference (rss:item-link (feed-item-feed-item feed-item))))))
(define-method (feed-item-description (feed-item <rss-feed-item>))
(rss:item-description (feed-item-feed-item feed-item)))
(define-method (feed-item-description (feed-item <atom-feed-item>))
(match (entry-content (feed-item-feed-item feed-item))
(('atom:content ('@ ('type "html")) x)
(let ((body (strip-html x)))
(if (null? body)
""
(car body))))
(('atom:content ('@ ('type "text")) x) x)
(_ "")))
(define-method (feed-item-description (feed-item <reddit-item>))
"")
(define-method (feed-item-description (feed-item <new-york-times-item>))
(or-default string?
(cadr (html->sxml (rss:item-description (feed-item-feed-item feed-item))))
""))
(define-method (feed-item-description (feed-item <los-angeles-times-item>))
(let ((d (rss:item-description (feed-item-feed-item feed-item))))
(if (string? d)
(utf8:string-trim-both (car (utf8:string-split d "\n")))
"")))
(define-method (feed-item-description (feed-item <san-francisco-chronicle-item>))
;; Can contain nested <p> elements but doesn't have to so filter
;; them out.
(let* ((d* (rss:item-description (feed-item-feed-item feed-item)))
(d** (or d* ""))
(d (strip-tags-from-sxml
(html->sxml d**))))
(cond ((and (string? d)
(irregex-search "\\$\\(\"." d))
"")
((string? d)
(utf8:string-trim-both d))
((and (list? d) (not (null? d)) (not (null? (cdr d))))
(utf8:string-trim-both (cadr d)))
(else ""))))
(define-method (feed-item-description (feed-item <hacker-news-item>))
"")
(define-method (feed-item-description (feed-item <lambda-the-ultimate-item>))
(or-default string?
(cadr (html->sxml (rss:item-description (feed-item-feed-item feed-item))))
""))
(define-method (feed-item-description (feed-item <fox-news-item>))
(let ((d (rss:item-description (feed-item-feed-item feed-item))))
(if d
(if (string? d)
(cadr (html->sxml d))
"")
"")))
(define-method (feed-item-description (feed-item <xkcd-item>))
(let ((r ((sxpath '(// (img (@ src *text*))
// ((@ title *text*))))
(html->sxml (summary-text
(entry-summary (feed-item-feed-item feed-item)))))))
(if (null? r)
""
(let ((r2 (car r)))
(if (string? r2)
r2
"")))))
(define-method (feed-item-description (feed-item <christianity-today-item>))
(strip-tags-from-sxml
((sxpath `(// (p (@ class ,(lambda (x y)
(if (not (null? x))
(let ((class (cadar x)))
(or (equal? "text" class)
(equal? "deck" class)))
#f))))))
(html->sxml (rss:item-description (feed-item-feed-item feed-item))))))
(define-method (feed-item-description (feed-item <techcrunch-item>))
(strip-tags-from-sxml
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
(define-method (feed-item-description (feed-item <engadget-item>))
(strip-tags-from-sxml
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
(define-method (feed-item-description (feed-item <npr-item>))
(strip-tags-from-sxml
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
(define-method (feed-item-description (feed-item <eff-item>))
(apply conc (strip-html (rss:item-description (feed-item-feed-item feed-item)))))
(define-method (feed-item-description (feed-item <common-dreams-item>))
(apply
conc
((sxpath `(// (div (@ class ,(lambda (x y)
(if (not (null? x))
(let ((class (cadar x)))
(string-contains
class "field--type-text-with-summary")
)
#f))))
// p *text*))
(html->sxml (rss:item-description (feed-item-feed-item feed-item))))))
(define-method (feed-item-description (feed-item <raw-story-item>))
(apply conc (flatten (cdr (html->sxml
(rss:item-description (feed-item-feed-item feed-item)))))))
(define-method (feed-item-description (feed-item <think-progress-item>))
(strip-tags-from-sxml
(html->sxml (rss:item-description (feed-item-feed-item feed-item)))))
(define rgx:author-split (sre->irregex '(or " and " ", ") 'fast 'i))
(define-method (feed-item-author (feed-item <rss-feed-item>))
(or (alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
""))
(define-method (feed-item-author (feed-item <atom-feed-item>))
;; Kinda weird but not sure how to do it better.
(string-intersperse
(map author-name (entry-authors (feed-item-feed-item feed-item)))
","))
(define-method (feed-item-author (feed-item <reddit-item>))
(cadr (string-split (author-name
(car (entry-authors (feed-item-feed-item feed-item)))) "/")))
(define-method (feed-item-author (feed-item <los-angeles-times-item>))
(or (alist-ref 'author (rss:item-attributes (feed-item-feed-item feed-item)))
(alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
""))
(define-method (feed-item-author (feed-item <new-york-times-item>))
(let ((author
(or (alist-ref 'dc:creator
(rss:item-attributes (feed-item-feed-item feed-item)))
(alist-ref 'author
(rss:item-attributes (feed-item-feed-item feed-item))))))
(if (not (string? author))
"New York Times"
author)))
(define-method (feed-item-author (feed-item <san-francisco-chronicle-item>))
(let ((a (alist-ref 'dc:creator
(rss:item-attributes (feed-item-feed-item feed-item)))))
(if a (car (strip-html a)) "")))
(define-method (feed-item-author (feed-item <wall-street-journal-item>))
"")
(define-method (feed-item-author (feed-item <hacker-news-item>))
(alist-ref 'by (feed-item-feed-item feed-item)))
(define-method (feed-item-author (feed-item <lambda-the-ultimate-item>))
"")
(define-method (feed-item-author (feed-item <fox-news-item>))
(let ((author
(or (alist-ref 'dc:creator
(rss:item-attributes (feed-item-feed-item feed-item)))
(alist-ref 'author
(rss:item-attributes (feed-item-feed-item feed-item))))))
(cond ((not (string? author)) "")
((string=? author "foxnewsonline@foxnews.com (Fox News Online) ")
"")
(else author))))
(define-method (feed-item-author (feed-item <christianity-today-item>))
(let ((r (irregex-split
", guest writer"
(or (alist-ref 'dc:creator
(rss:item-attributes (feed-item-feed-item feed-item)))
(alist-ref 'author
(rss:item-attributes (feed-item-feed-item feed-item)))""))))
(if (null? r)
""
(car r))))
(define-method (feed-item-author (feed-item <eff-item>))
(or (alist-ref 'dc:creator (rss:item-attributes (feed-item-feed-item feed-item)))
""))
(define-method (feed-item-authors (feed-item <atom-feed-item>))
(map author-name (entry-authors (feed-item-feed-item feed-item))))
(define-method (feed-item-authors (feed-item <reddit-item>))
(list
(cadr (string-split (author-name
(car (entry-authors (feed-item-feed-item feed-item)))) "/"))))
(define-method (feed-item-authors (feed-item <rss-feed-item>))
(list (feed-item-author feed-item)))
(define-method (feed-item-authors (feed-item <new-york-times-item>))
(let ((author (feed-item-author feed-item)))
(if author
(map (lambda (s)
(utf8:string-titlecase
(utf8:string-trim-both
(cond ((string=? (utf8:string-downcase s) "first draft")
"The New York Times First Draft")
(else s)))))
(flatten (map (cut irregex-split rgx:author-split <>)
(irregex-split "By " author))))
"New York Times")))
(define-method (feed-item-authors (feed-item <los-angeles-times-item>))
(let ((author (feed-item-author feed-item)))
(if author
(if (equal? author "The Times editorial board")
'("The Los Angeles Times editorial board")
(irregex-split rgx:author-split author))
(list "Los Angeles Times"))))
(define-method (feed-item-authors (feed-item <san-francisco-chronicle-item>))
(let ((author (utf8:string-trim-both (feed-item-author feed-item))))
(if author
(flatten (map (cut irregex-split rgx:author-split <>)
(irregex-split "By " author)))
(list ""))))
(define-method (feed-item-authors (feed-item <wall-street-journal-item>))
(list ""))
(define-method (feed-item-authors (feed-item <hacker-news-item>))
(list (feed-item-author feed-item)))
(define-method (feed-item-authors (feed-item <lambda-the-ultimate-item>))
(list ""))
(define-method (feed-item-authors (feed-item <fox-news-item>))
(list (feed-item-author feed-item)))
(define-method (feed-item-authors (feed-item <christianity-today-item>))
(let ((author (feed-item-author feed-item)))
(if author
(irregex-split rgx:author-split author)
(list ""))))
(define-method (feed-item-authors (feed-item <techcrunch-item>))
(let ((author (feed-item-author feed-item)))
(if author
(irregex-split '(",") author)
(list ""))))
(define-method (feed-item-authors (feed-item <engadget-item>))
(let ((author (feed-item-author feed-item)))
(if author
(irregex-split rgx:author-split author)
(list ""))))
(define-method (feed-item-authors (feed-item <eff-item>))
(let ((author (feed-item-author feed-item)))
(if author
(irregex-split rgx:author-split author)
(list ""))))
(define-method (feed-item-date (feed-item <rss-feed-item>))
(string->date*
(alist-ref 'pubDate (rss:item-attributes (feed-item-feed-item feed-item)))))
(define-method (feed-item-date (feed-item <atom-feed-item>))
(let ((published (entry-published (feed-item-feed-item feed-item)))
(updated (entry-updated (feed-item-feed-item feed-item))))
(string-atom-date->date
(if (and published (not (string=? published "")))
published
updated))))
(define-method (feed-item-date (feed-item <hacker-news-item>))
(seconds->date (alist-ref 'time (feed-item-feed-item feed-item))))
(define-method (feed-item-date (feed-item <christianity-today-item>))
(let ((d (alist-ref 'pubDate (rss:item-attributes (feed-item-feed-item feed-item)))))
(if d (string->date* d) (current-date*))))
(define-method (media-item-type (media-item <media-item>)) 'jpg)
(define-method (media-item-type (media-item <xkcd-media-item>)) 'png)
(define-method (media-item-type (media-item <png-media-item>)) 'png)
(define-method (media-item-file-extension (media-item <media-item>)) "jpg")
(define-method (media-item-file-extension (media-item <xkcd-media-item>)) "png")
(define-method (media-item-file-extension (media-item <png-media-item>)) "png")
(define-method (media-item-relative-path (media-item <media-item>))
(++ (message-digest-string (md5-primitive) (media-item-url media-item))
"." (media-item-file-extension media-item)))
(define-method (media-item-filesystem-path (media-item <media-item>))
(++ (filesystem-path-img-original) "/" (media-item-relative-path media-item)))
(define-method (media-item-url (media-item <media-item>))
(rss:media-url (media-item-media-item media-item)))
(define-method (media-item-url (media-item <xkcd-media-item>))
(let ((r ((sxpath '(// (img (@ src *text*))
// ((@ src *text*))))
(html->sxml (summary-text (media-item-media-item media-item))))))
(if (null? r)
""
(let ((r2 (car r)))
(if (string? r2)
r2
"")))))
(define-method (media-item-url (media-item <url-media-item>))
(media-item-media-item media-item))
(define-method (print-object (item <feed-item>) #!optional (out (current-output-port)))
(print "title: " (exn* (feed-item-title item)))
(print "guid: " (exn* (feed-item-guid item)))
(print "link: " (exn* (feed-item-link item)))
(print "source: " (exn* (feed-item-source item)))
(print "description: " (exn* (feed-item-description item)))
(print "author: " (exn* (feed-item-author item)))
(print "date: " (exn* (feed-item-date item)))
(print "tags: " (exn* (feed-item-tags item))))
(define (rss-item-description-image-links rss-item)
(let* ((links
((sxpath `(// (img (@ src *text*))
// ((@ src *text*))))
(html->sxml (rss:item-description rss-item))))
(useful-links (filter
(lambda (link)
(or (irregex-search ".jpg" link)
(irregex-search ".png" link)))
links)))
useful-links))
(define (make-feed-item feed-item source tags type)
(match source
("New York Times"
(make <new-york-times-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (i) (make <new-york-times-media-item>
'media-item i))
(rss:item-medias feed-item))))
("Los Angeles Times"
(make <los-angeles-times-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (i) (make <los-angeles-times-media-item>
'media-item i))
(rss:item-medias feed-item))))
("San Francisco Chronicle"
(make <san-francisco-chronicle-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (i) (make <san-francisco-chronicle-media-item>
'media-item i))
(rss:item-medias feed-item))))
("Wall Street Journal"
(make <wall-street-journal-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (i) (make <wall-street-journal-media-item>
'media-item i))
(rss:item-medias feed-item))))
("Hacker News"
(make <hacker-news-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Lambda the Ultimate"
(make <lambda-the-ultimate-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Planet Scheme"
(make <planet-scheme-item>
'feed-item feed-item 'source source 'tags tags 'medias '()))
("Planet Haskell"
(make <planet-haskell-item>
'feed-item feed-item 'source source 'tags tags 'medias '()))
("Fox News"
(make <fox-news-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (i) (make <media-item>
'media-item i))
(filter
(lambda (i) (rss:media? i))
(rss:item-medias feed-item)))))
("XKCD"
(make <xkcd-item>
'feed-item feed-item 'source source 'tags tags
'medias `(,(make <xkcd-media-item> 'media-item (entry-summary feed-item)))))
("Christianity Today"
(make <christianity-today-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (link)
(make <url-media-item> 'media-item link))
(rss-item-description-image-links feed-item))))
("TechCrunch"
(make <techcrunch-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (link)
(make <url-media-item> 'media-item link))
(rss-item-description-image-links feed-item))))
("Engadget"
(make <engadget-item>
'feed-item feed-item 'source source 'tags tags
'medias (map (lambda (link)
(make <url-png-media-item> 'media-item link))
(rss-item-description-image-links feed-item))))
("NPR"
(make <npr-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("EFF"
(make <eff-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Common Dreams"
(make <common-dreams-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Raw Story"
(make <raw-story-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Reddit"
(make <reddit-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
("Think Progress"
(make <think-progress-item>
'feed-item feed-item 'source source 'tags tags
'medias '()))
(_
(case type
('rss
(make <rss-feed-item>
'feed-item feed-item 'source source 'tags tags 'medias '()))
('atom
(make <atom-feed-item>
'feed-item feed-item 'source source 'tags tags 'medias '()))
(else (error "feed item type unknown"))))))
(include "global.scm")
(include "db.scm")
(import (prefix news-db db:))
(define with-db db:with-db)
(define with-db/transaction db:with-db/transaction)
(db:connection-spec (alist-ref 'db-connection-spec *program-config*))
(define *hacker-news-base-uri* "https://hacker-news.firebaseio.com/v0/")
(define *hacker-news-top-stories-uri* "topstories")
(define *hacker-news-item-uri* "item/")
(define *hacker-news-uri-suffix* ".json")
(define (guid->guid-hash guid)
(message-digest-string (md5-primitive) guid))
(define (article-id-by-guid conn guid)
(let ((r (db:get-article-id-by-guid conn (guid->guid-hash guid))))
(if (> (row-count r) 0)
(value-at r)
#f)))
(define *sources* (with-input-from-file "feed-list.scm" read))
(define-class <feed-resource> ()
((content-type reader: feed-resource-content-type)
(tags reader: feed-resource-tags)))
(define-class <uri-feed-resource> (<feed-resource>)
((uri reader: feed-resource-uri)))
(define-class <xml-feed-resource> (<feed-resource>)
((content-type initform: "xml" reader: feed-resource-content-type)))
(define-class <hacker-news-feed-resource> (<feed-resource>)
((content-type initform: "json" reader: feed-resource-content-type)
(tags initform: '(hacker-news) reader: feed-resource-tags)))
(define-class <xml-uri-feed-resource> (<uri-feed-resource> <xml-feed-resource>))
(define-class <feed> ()
((name reader: feed-name)
(resources reader: feed-resources)))
(define-class <atom-feed> (<feed>))
(define-class <rss-feed> (<feed>))
(define-class <json-feed> (<feed>))
(define-class <new-york-times-feed> (<rss-feed>))
(define-class <los-angeles-times-feed> (<rss-feed>))
(define-class <san-francisco-chronicle-feed> (<rss-feed>))
(define-class <wall-street-journal-feed> (<rss-feed>))
(define-class <hacker-news-feed> (<json-feed>))
(define-class <fox-news-feed> (<rss-feed>))
(define-class <christianity-today-feed> (<rss-feed>))
(define-class <techcrunch-feed> (<rss-feed>))
(define-class <engadget-feed> (<rss-feed>))
(define-class <npr-feed> (<rss-feed>))
(define-class <common-dreams-feed> (<rss-feed>))
(define-class <raw-story-feed> (<rss-feed>))
(define-class <reddit-feed> (<atom-feed>))
(define-class <chronological-order-ranking-feed> (<feed>))
(define-class <rss-chronological-order-ranking-feed>
(<rss-feed> <chronological-order-ranking-feed>))
(define-class <atom-chronological-order-ranking-feed>
(<atom-feed> <chronological-order-ranking-feed>))
(define-method (feed-resource-root-filepath
(feed <feed>) (feed-resource <feed-resource>))
(conc "feed-" (feed-resource-content-type feed-resource) "/"
(string-intersperse (string-split (feed-name feed) "/") "-") "/"))
(define-method (feed-resource-filepath (feed <feed>) (resource <uri-feed-resource>))
(conc (feed-resource-root-filepath feed resource)
(md5-hash (feed-resource-uri resource)) "."
(feed-resource-content-type resource)))
(define-method (fetch-feed-resource (resource <uri-feed-resource>))
(with-input-from-request (feed-resource-uri resource) #f read-string))
(define-method (save-feed-resource-content (feed <feed>) (resource <uri-feed-resource>))
(create-directory (feed-resource-root-filepath feed resource))
(with-output-to-file (feed-resource-filepath feed resource)
(lambda ()
(write (fetch-feed-resource resource)))))
(define-method (fetch-feed-and-write-to-file (feed <feed>))
(for-each
(lambda (resource)
(log-for (info) (conc "fetching: " (feed-name feed) " "
(feed-resource-tags resource)))
(handle-exceptions exn
(log-error exn)
(save-feed-resource-content feed resource)))
(feed-resources feed)))
(define-method (fetch-feed-and-write-to-file (feed <hacker-news-feed>))
(let ((ids (fetch-hacker-news-top-stories-ids)))
(write-hacker-news-items-to-file ids "top-stories-ids.json")
(write-hacker-news-items-to-file (fetch-hacker-news-items ids) "top-stories.json")))
(define *mutex* (make-mutex 'foo))
(define-method (read-feed-resource-from-file
(feed <feed>) (resource <uri-feed-resource>))
(with-input-from-file (feed-resource-filepath feed resource) read))
(define (feed-xml->rss-items xml)
(with-input-from-string xml
(lambda ()
(rss:feed-items (rss:read (current-input-port))))))
(define (feed-xml->atom-items xml)
(with-input-from-string xml
(lambda ()
(feed-entries (read-atom-feed (current-input-port))))))
(define-method (feed-resource-items (feed <atom-feed>) (resource <feed-resource>))
(feed-xml->atom-items (read-feed-resource-from-file feed resource)))
(define-method (feed-resource-items (feed <rss-feed>) (resource <feed-resource>))
(feed-xml->rss-items (read-feed-resource-from-file feed resource)))
(define-method (feed-resource-items
(feed <hacker-news-feed>) (resource <hacker-news-feed-resource>))
(hacker-news-json-items->feed-items "top-stories.json"))
(define (handle-possible-ssax-parser-error exn _chain tags source)
(let ((chain (drop-right _chain 1)))
(if (and ((condition-predicate 'ssax) exn)
((condition-predicate 'parser-error) exn))
(begin
(log-for (warn) (conc "ssax parser error: "
((condition-property-accessor 'exn 'message) exn)
" for: " source " - " tags))
'())
(log-error exn (with-output-to-string
(lambda ()
(newline) (newline)
(pp (condition->list exn))
(newline) (newline)))
'() chain))))
(define-method (feed-resource-feed-items (feed <rss-feed>) (resource <feed-resource>))
(map
(lambda (i) (make-feed-item i (feed-name feed) (feed-resource-tags resource) 'rss))
(handle-exceptions
exn
(handle-possible-ssax-parser-error
exn (get-call-chain) (feed-resource-tags resource) (feed-name feed))
(dynamic-wind
(lambda () (mutex-lock! *mutex*))
(lambda () (feed-xml->rss-items (read-feed-resource-from-file feed resource)))
(lambda () (mutex-unlock! *mutex*))))))
(define-method (feed-resource-feed-items (feed <atom-feed>) (resource <feed-resource>))
(map
(lambda (i) (make-feed-item i (feed-name feed) (feed-resource-tags resource) 'atom))
(handle-exceptions
exn
(handle-possible-ssax-parser-error
exn (get-call-chain) (feed-resource-tags resource) (feed-name feed))
(dynamic-wind
(lambda () (mutex-lock! *mutex*))
(lambda () (feed-xml->atom-items (read-feed-resource-from-file feed resource)))
(lambda () (mutex-unlock! *mutex*))))))
(define-method (feed-resource-feed-items
(feed <hacker-news-feed>) (resource <hacker-news-feed-resource>))
(handle-exceptions exn
(log-error exn)
(feed-resource-items feed resource)))
(define (fetch-and-save-image uri filepath)
(if (uri-reference uri)
(condition-case
(call-with-input-request*
uri
#f
(lambda (p response)
(with-output-to-file filepath
(lambda ()
;; In CHICKEN strings are the same as binary and the
;; most efficient way to do things.
(let ((len (header-value 'content-length
(response-headers response))))
(write-string (read-string len p) len))))))
(exn (exn client-error)
(log-error exn ((condition-property-accessor 'exn 'message "nill") exn)))
(exn () (log-error exn)))
(log-for (warn) (conc "unable to get a URI reference from " uri))))
(define (save-feed-images-to-files feed-items)
(for-each
(lambda (item)
(for-each
(lambda (media-item)
(handle-exceptions exn
(log-error exn)
(fetch-and-save-image (media-item-url media-item)
(media-item-filesystem-path media-item))))
(filter (lambda (i) (media-item-url i)) (feed-item-medias item))))
feed-items))
(define-method (save-feed-resource-images (feed <feed>) (resource <feed-resource>))
(handle-exceptions exn
(log-error exn)
(save-feed-images-to-files
(with-db
(lambda (conn)
(log-for (info) (conc "saving images from: " (feed-name feed) " "
(feed-resource-tags resource)))
(filter (lambda (item)
(not (article-id-by-guid conn (feed-item-guid item))))
(handle-exceptions exn
(log-error exn '())
(feed-resource-feed-items feed resource))))))))
(define-method (save-feed-images (feed <feed>))
(for-each
(lambda (resource)
(save-feed-resource-images feed resource))
(feed-resources feed)))
;; hacker news
(define array-as-list-parser
(cons 'array (lambda (x) x)))
(json-parsers (cons array-as-list-parser (json-parsers)))
(define (fetch-from-hacker-news uri)
(with-input-from-request uri #f read-json))
(define (fetch-hacker-news-top-stories-ids #!key (limit 30))
(log-for (info) "fetching: Hacker News top stories")
(handle-exceptions exn
(log-error exn '())
(let ((r (fetch-from-hacker-news
(conc *hacker-news-base-uri* *hacker-news-top-stories-uri*
*hacker-news-uri-suffix*))))
(if limit (take r limit) r))))
(define (fetch-hacker-news-item id)
(fetch-from-hacker-news
(conc *hacker-news-base-uri* *hacker-news-item-uri* id *hacker-news-uri-suffix*)))
(define (hacker-news-root-filepath)
"feed-json/hacker-news/")
(define (fetch-hacker-news-items ids)
(let* ((mutex (make-mutex))
(items '())
(threads (map (lambda (id)
(make-thread
(lambda ()
(log-for (info) (conc "fetching: Hacker News item " id))
(handle-exceptions exn
(log-error exn)
(let ((r (fetch-hacker-news-item id)))
(mutex-lock! mutex)
(set! items (cons r items))
(mutex-unlock! mutex))))))
ids)))
(for-each (lambda (thread) (thread-start! thread)) threads)
(for-each (lambda (thread) (thread-join! thread)) threads)
items))
(define (write-hacker-news-items-to-file items path)
(create-directory (hacker-news-root-filepath) #t)
(with-output-to-file (conc (hacker-news-root-filepath) path)
(lambda () (write items))))
(define (read-hacker-news-items-from-file path)
(with-input-from-file (conc (hacker-news-root-filepath) path) read))
(define (hacker-news-json-items->feed-items path)
(map (lambda (item) (make-feed-item item "Hacker News" '(hacker-news) 'hacker-news))
(read-hacker-news-items-from-file path)))
(define (make-feed src)
(define (make-feed* feed-type resource-type src)
(make feed-type
'resources (map
(lambda (tags/uri)
(make resource-type
'tags (car tags/uri)
'uri (cdr tags/uri)))
(source-uris src))
'name (source-name src)))
(cond ((equal? (source-name src) "New York Times")
(make-feed* <new-york-times-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Los Angeles Times")
(make-feed* <los-angeles-times-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Wall Street Journal")
(make-feed* <wall-street-journal-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "San Francisco Chronicle")
(make-feed* <san-francisco-chronicle-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Hacker News")
(make <hacker-news-feed> 'resources
`(,(make <hacker-news-feed-resource>))
'name "Hacker News"))
((equal? (source-name src) "Fox News")
(make-feed* <fox-news-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Christianity Today")
(make-feed* <christianity-today-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "TechCrunch")
(make-feed* <techcrunch-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Engadget")
(make-feed* <engadget-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "NPR")
(make-feed* <npr-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Common Dreams")
(make-feed* <common-dreams-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Raw Story")
(make-feed* <raw-story-feed> <xml-uri-feed-resource> src))
((equal? (source-name src) "Reddit")
(make-feed* <reddit-feed> <xml-uri-feed-resource> src))
((eq? (source-type src) 'rss-chronological)
(make-feed*
<rss-chronological-order-ranking-feed> <xml-uri-feed-resource> src))
((eq? (source-type src) 'atom-chronological)
(make-feed*
<atom-chronological-order-ranking-feed> <xml-uri-feed-resource> src))
((eq? (source-type src) 'rss)
(make-feed* <rss-feed> <xml-uri-feed-resource> src))
((eq? (source-type src) 'atom)
(make-feed* <atom-feed> <xml-uri-feed-resource> src))
(else (error "source type not supported"))))
;;; RANK ;;;
(define-method (rank-feed (feed <new-york-times-feed>))
(with-db
(lambda (conn)
(let ((name (feed-name feed)))
;; demote current elevated ranks
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
;; order matters. primary outranks secondary.
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "ny-times-rank-secondary" 2 40)))
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "ny-times-rank-primary" 1 3)))))))
(define-method (rank-feed (feed <los-angeles-times-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(let ((sxml
(cdr
(with-input-from-string
(with-input-from-request "http://latimes.com" #f read-string)
html->sxml)))
(uri-root "http://www.latimes.com"))
;; secondary articles
(let ((secondary-uri-list
((sxpath '(// (section (@ class
(equal?
"trb_outfit_group trb_outfit_section")))
// ((@ data-content-url *text*))))
sxml)))
(unless (null? secondary-uri-list)
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
(map (lambda (u) (last (utf8:string-split u "/"))) secondary-uri-list)
2)))))
;; featured article
(let ((featured-uri-list
((sxpath '(// (section (@ data-outfit-type (equal? "centerpiece")))
(section
(@ class
(equal? "trb_outfit_primaryItem trb_outfit_section")))
((@ data-content-url *text*))))
sxml)))
(unless (null? featured-uri-list)
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
1))))))))))
(define-method (rank-feed (feed <wall-street-journal-feed>))
(with-db
(lambda (conn)
(let ((name (feed-name feed)))
;; demote current elevated ranks
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
;; order matters. primary outranks secondary.
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "wsj-rank-secondary" 2 40)))
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "wsj-rank-primary" 1 3)))))))
(define-method (rank-feed (feed <hacker-news-feed>))
(define (hn-guid id) (guid->guid-hash (conc "hacker-news-" id)))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(let ((items (read-hacker-news-items-from-file "top-stories-ids.json")))
(with-transaction conn
(lambda ()
(db:set-rank-by-guids
conn 1 (map-in-order hn-guid (take items 3)))
(db:set-rank-by-guids
conn 2 (map-in-order hn-guid (take (drop items 3) 3))))))))))
(define-method (rank-feed (feed <san-francisco-chronicle-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(let* ((doc (cdr
(with-input-from-string
(with-input-from-request "http://sfgate.com" #f read-string)
html->sxml)))
(path-proc
(lambda (base)
(lambda (name)
((sxpath `(// (div (@ (equal? (class ,(string-append base " " name)))))
// a
((@ href *text*)))) doc)))))
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
((sxpath '(// (div (@ class (equal? "ctpl-centerpiece")))
// a
((@ href *text*))))
doc)
1)))
(db:set-rank-by-partial-uri
conn name
(map
car
(group/key
identity
(flatten
(append
(map (path-proc "package")
'("news" "sports" "business" "food" "realestate" "entertainment"))
(map (path-proc "headline-list")
'("news" "business" "national" "living"))))))
2))))))
(define-method (rank-feed (feed <chronological-order-ranking-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(with-transaction conn
(lambda ()
(db:set-rank-by-date-difference
conn 2 (make-duration hours: 8) name 3 3)
(db:set-rank-by-date-difference
conn 1 (make-duration hours: 4) name 3 0)))))))
(define-method (rank-feed (feed <fox-news-feed>))
(with-db
(lambda (conn)
(let ((name (feed-name feed)))
;; demote current elevated ranks
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "fox-news-rank-primary" 1 3)
(db:set-rank-by-tag-association
conn name "fox-news-rank-primary" 2 40 offset: 3)))))))
(define-method (rank-feed (feed <christianity-today-feed>))
(with-db
(lambda (conn)
(let ((name (feed-name feed)))
;; demote current elevated ranks
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "christianity-today-rank-primary" 1 3)
(db:set-rank-by-tag-association
conn name "christianity-today-rank-primary" 2 40 offset: 3)))))))
(define-method (rank-feed (feed <techcrunch-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(let ((sxml
(cdr
(with-input-from-string
(with-input-from-request "http://techcrunch.com/" #f read-string)
html->sxml))))
;; secondary articles
(let ((secondary-uri-list
((sxpath '(// (div (@ class
(equal?
"island-main")))
// a
// ((@ href *text*))))
sxml)))
(unless (null? secondary-uri-list)
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
(map (lambda (u) (last (utf8:string-split u "/"))) secondary-uri-list)
2)))))
;; featured article
(let ((featured-uri-list
((sxpath '(// (div (@ class
(equal?
"island-secondary")))
// a
// ((@ href *text*))))
sxml)))
(unless (null? featured-uri-list)
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
1))))))))))
(define-method (rank-feed (feed <engadget-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
(let ((sxml
(cdr
(with-input-from-string
(with-input-from-request "http://www.engadget.com/" #f read-string)
html->sxml))))
;; featured articles
(let ((featured-uri-list
((sxpath '(// (li (@ class
(equal?
"lede-item post")))
// a
// ((@ href *text*))))
sxml)))
(unless (null? featured-uri-list)
(with-transaction conn
(lambda ()
(db:set-rank-by-partial-uri
conn name
(map (lambda (u) (last (utf8:string-split u "/"))) featured-uri-list)
1))))))))))
(define-method (rank-feed (feed <npr-feed>))
(with-db
(lambda (conn)
(let ((name (feed-name feed)))
;; demote current elevated ranks
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 4 1000)))
(with-transaction conn
(lambda ()
(db:set-rank-by-tag-association
conn name "npr-rank-primary" 1 3)
(db:set-rank-by-tag-association
conn name "npr-rank-primary" 2 40 offset: 3)))))))
(define-method (rank-feed (feed <common-dreams-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
(let ((sxml
(cdr
(with-input-from-string
(with-input-from-request
"http://www.commondreams.org/" #f read-string)
html->sxml))))
(for-each
(match-lambda
((rank uris)
(for-each
(lambda (uri)
(with-transaction conn
(lambda ()
(db:set-rank-by-uri
conn name
(string-append "http://www.commondreams.org" uri)
rank))))
uris)))
(map (match-lambda
((rank article-class)
`(,rank
,((sxpath `(// (div (@ class
,(lambda (x y)
(if (not (null? x))
(let ((class (cadar x)))
(string-contains class article-class))
#f))))
// (div (@ class
,(lambda (x y)
(if (not (null? x))
(let ((class (cadar x)))
(string-contains class "hp-title"))
#f))))
// (div (@ class (equal? "field-content")))
// a
// ((@ href *text*))))
sxml))))
'((1 "primary-news-article") (1 "secondary-news-article")
(2 "tertiary-news-article") (2 "remaining-news-article")))))))))
(define-method (rank-feed (feed <raw-story-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
(let ((sxml
(cdr
(with-input-from-string
(with-input-from-request "http://www.rawstory.com/" #f read-string)
html->sxml))))
(let ((uri-list
((sxpath `(// (div (@ class
,(lambda (x y)
(if (not (null? x))
(let ((class (cadar x)))
(string-contains class "blog-item-class"))
#f))))
// a
// ((@ href *text*))))
sxml)))
(unless (null? uri-list)
(for-each
(match-lambda
((rank uris)
(with-transaction conn
(lambda ()
(for-each
(lambda (uri)
(db:set-rank-by-uri conn name uri rank))
uris)))))
`((1 ,(take uri-list 3))
(2 ,(drop uri-list 3)))))))))))
(define-method (rank-feed (feed <reddit-feed>))
(let ((name (feed-name feed)))
(with-db
(lambda (conn)
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 3 4 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 2 3 1000)))
(with-transaction conn (lambda () (db:set-rank-by-rank conn name 1 2 1000)))
(let ((items (feed-resource-feed-items feed (first (feed-resources feed)))))
(with-transaction conn
(lambda ()
(db:set-rank-by-guids
conn 1 (map-in-order (compose guid->guid-hash feed-item-guid)
(take items 3)))
(db:set-rank-by-guids
conn 2 (map-in-order (compose guid->guid-hash feed-item-guid)
(take (drop items 3) 3))))))))))
;; TEXT TAGS ;;
(define (set-text-tags)
(with-db
(lambda (conn)
(for-each
(lambda (tag/texts)
(with-transaction
conn
(lambda ()
(db:set-tags-by-text conn (cdr tag/texts) (car tag/texts)))))
'(("religion" "religion" "religious" "theology" "doctrine")
("bernie-sanders" "bernie sanders" "sanders bernie")
("hillary-clinton" "hillary clinton" "clinton hillary")
("jeb-bush" "jeb bush" "bush jeb")
("ted-cruz" "ted cruz" "cruz ted")
("rand-paul" "rand paul" "paul rand")
("donald-trump" "donald trump" "trump donald" "donald j trump")
("ben-carson" "ben carson" "carson ben")
("chris-christie" "christie chris")
("carly-fiorina" "carly fiorina" "fiorina carly")
("mike-huckabee" "mike huckabee" "huckabee mike")
("marco-rubio" "marco rubio" "rubio marco")
("president-2016"
"president 2016"
"2016 president"
"presidential 2016"
"2016 presidential"
"presidential campaign 2016"
"Presidential campaign 2016"
"presidential election 2016"
"Presidential election 2016"
"2016 presidential poll"
"presidential poll 2016"
"presidential candidate"
"Presidential candidate"
"bernie sanders"
"sanders bernie"
"hillary clinton"
"clinton hillary"
"jeb bush"
"bush jeb"
"ted cruz"
"cruz ted"
"rand paul"
"paul rand"
"donald trump"
"trump donald"
"donald j trump"
"ben carson"
"carson ben"
"christie chris"
"carly fiorina"
"fiorina carly"
"mike huckabee"
"huckabee mike"
"marco rubio"
"rubio marco")
("terrorism" "terrorism" "terrorist")
("isis" "isis" "isil" "islamic state" "islamic-state")
("climate-change" "climate change" "global warming" "greenhouse gas"
"fossile fuel" "fossile fuels" "clean energy" "cap and trade" "cap-and-trade"
"offshore drilling" "carbon tax" "renewables"))))))
;; IMPORT ;;
(define (get-image-dimensions path)
(map
string->number
(string-split
(let ((s (with-input-from-pipe
(string-append "identify -format \"%w,%h\" " path) read-string)))
(if (string-null? s)
(signal (make-property-condition 'invalid-image-format))
s))
",")))
(define (date-in-future? date)
;; There has to be a better way to do this...
;; just using the srfi-19 date comparison procedures doesn't work
;; since it can't seem to handle dates that have different timezones.
;; (> (time->seconds (date->time date)) (time->seconds (date->time (current-date))))
(> (time-compare (date->time date) (date->time (current-date))) -1))
(define (save-feed-items-to-database source tag items)
(with-db
(lambda (conn)
(for-each
(lambda (a)
(handle-exceptions
exn
(let* ((_chain (get-call-chain))
(chain (drop-right _chain 2)))
(cond ((and ((condition-property-accessor 'query 'error-code "nill") exn)
(string=?
((condition-property-accessor 'query 'error-code "nill") exn)
"23505")) ; unique violation, it should be ok...
(log-for (warn) (with-output-to-string
(lambda ()
(print "unique violation")
(print-object a)))))
((skip-log-condition-predicate exn) (void))
(else
(log-error exn (with-output-to-string
(lambda () (print-object a))) #f chain))))
(let* ((article-id (article-id-by-guid conn (feed-item-guid a)))
(images (if article-id
'()
(let ((first-item #t))
(filter
identity
(map (lambda (media-item)
(handle-exceptions exn
(let* ((_chain (get-call-chain))
(chain (drop-right _chain 1)))
(if ((condition-predicate
'invalid-image-format) exn)
(begin
(log-for (warn)
"invalid image format")
#f)
(log-error
exn
(with-output-to-string
(lambda ()
(print-object a)))
#f)))
(let* ((url (media-item-url media-item))
(path
(media-item-relative-path media-item))
(full-path
(media-item-filesystem-path media-item))
(file-exists (file-exists? full-path))
(dimen (if file-exists
(get-image-dimensions full-path)
#f)))
(if file-exists
`((cover
. ,(if first-item
(begin (set! first-item #f) #t)
#f))
(original-path . ,path)
(original-width . ,(car dimen))
(original-height . ,(cadr dimen)))
(begin (log-for
(warn)
(conc "image does not exist: "
full-path))
#f)))))
(filter (lambda (i) (media-item-url i))
(feed-item-medias a))))))))
(if (date-in-future? (feed-item-date a))
(log-for (warn) (with-output-to-string
(lambda ()
(print "date in the future: "
(exn* (feed-item-date a)))
(print "tags: " (exn* (feed-item-tags a)))
(print "link: " (exn* (feed-item-link a))))))
(with-transaction conn
(lambda ()
(db:upsert-article!
conn
title: (feed-item-title a)
author: (feed-item-author a)
authors: (feed-item-authors a)
;; TODO handle errors
date: (feed-item-date a)
;; TODO based on feed
tags: (map ->string (feed-item-tags a))
;; TODO error handle
description: (feed-item-description a)
content: ""
guid-hash: (guid->guid-hash (feed-item-guid a))
article-id: article-id
source: (feed-item-source a)
uri: (feed-item-link a)
images: images
insert-tag-if-not-exists: #t)))))))
(filter (lambda (item)
(unless (feed-item-title item)
(log-for (info) (conc "missing item title: " source tag)))
(feed-item-title item)) items)))))
(define-method (import-feed-resource (feed <feed>) (resource <feed-resource>))
(handle-exceptions exn
(log-error exn)
(save-feed-items-to-database
(feed-name feed) (feed-resource-tags resource)
(feed-resource-feed-items feed resource))))
(define-method (import-feed (feed <feed>))
(for-each
(lambda (resource)
(log-for (info) (conc "importing feed: " (feed-name feed) " - "
(feed-resource-tags resource)))
(import-feed-resource feed resource))
(feed-resources feed)))
(define opts
(list (args:make-option (f fetch-feeds) #:none "fetch and save feeds")
(args:make-option (m fetch-media) #:none "fetch and save media objects")
(args:make-option (i import) #:none "import into database")
(args:make-option (u update-article-view) #:none "update article view")
(args:make-option (r rank) #:none "rank articles")
(args:make-option (t set-text-tags) #:none "set text tags")
(args:make-option (d debug-level) (optional: "LEVEL")
"debug level [default: 1]")
(args:make-option (h help) #:none "display usage" (usage))))
(define (usage)
(with-output-to-port (current-error-port)
(lambda ()
(print "Usage: " (car (argv)) " [options...]")
(newline)
(print (args:usage opts))))
(exit 1))
(define (source-name source) (car source))
(define (source-type source) (cadr source))
(define (source-uris source) (cddr source))
(receive
(options operands) (args:parse (command-line-arguments) opts)
(let ((debug-level (or (and (alist-ref 'debug-level options)
(string->number (alist-ref 'debug-level options)))
1)))
(when (> debug-level 0)
(start-sender error-sender
(port-sender (current-error-port))
(output (<standard-out message))
(category error))
(when (>= debug-level 2)
(start-sender warn-sender
(port-sender (current-output-port))
(output (<standard-out message))
(category warn)))
(when (>= debug-level 3)
(start-sender info-sender
(port-sender (current-output-port))
(output (<standard-out message))
(category info)))
(when (= debug-level 4)
(start-sender debug-sender
(port-sender (current-output-port))
(output (<standard-out message))
(category debug)))))
(let* ((force (null? (filter (lambda (a)
(and (not (equal? (car a) 'd))
(not (equal? (car a) 'debug-level))))
options)))
(tasks (map (lambda (src) `(,(source-name src) . ,(make-queue))) *sources*))
(task-add!
(lambda (src proc)
(let ((src-queue (alist-ref (source-name src) tasks)))
(when src-queue
(queue-add! src-queue proc))))))
(print "options:")
(when (or force (alist-ref 'f options))
(print " fetch feeds")
(for-each
(lambda (src)
(task-add! src
(lambda ()
(fetch-feed-and-write-to-file (make-feed src)))))
*sources*))
(when (or force (alist-ref 'm options))
(print " fetch media")
(for-each
(lambda (src)
(task-add!
src
(lambda ()
(save-feed-images (make-feed src)))))
*sources*))
(when (or force (alist-ref 'i options))
(print " import articles")
(for-each
(lambda (src)
(task-add!
src
(lambda ()
(import-feed (make-feed src)))))
*sources*))
(when (or force (alist-ref 'r options))
(print " rank articles")
(for-each
(lambda (src)
(task-add! src (lambda () (handle-exceptions exn
(log-error exn)
(let ((feed (make-feed src)))
(log-for (info) (conc "ranking " (feed-name feed)))
(rank-feed feed))))))
*sources*))
(when (or force (alist-ref 't options))
(print " set text tags"))
(when (or force (alist-ref 'u options))
(print " refresh article view"))
(newline)
(log-for (info) "running tasks")
(let ((threads (map
(lambda (task)
(make-thread
(lambda ()
(for-each
(lambda (e)
(e))
(queue->list (cdr task))))))
tasks)))
(for-each (cut thread-start! <>) threads)
(for-each (cut thread-join! <>) threads))
(log-for (info) "completed tasks")
(when (or force (alist-ref 't options))
(log-for (info) "setting text tags")
(set-text-tags))
(when (or force (alist-ref 'u options))
(log-for (info) "refreshing views...")
(with-db
(lambda (conn)
(db:refresh-article-view! conn)
(db:refresh-tags-and-sources-view! conn))))
(log-for (info) "done")))