zs

Zeitungsschau rss to email converter
git clone git://r-36.net/zs
Log | Files | Refs | LICENSE

commit ac54587c59ad0bdd7c84681f295d924f27019644
parent 63fe7a682af0938334c13a7132b52b933dafec13
Author: Christoph Lohmann <20h@r-36.net>
Date:   Mon, 22 May 2017 19:29:14 +0200

Add JSON Feed support.

Diffstat:
zeitungsschau/feed.py | 96+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -16,6 +16,7 @@ import codecs
 import html
 import urllib.parse
 import socket
+import json
 
 def parseiso(dstr, now):
 	try:
@@ -39,7 +40,84 @@ def parsexml(astr):
 	# Throw XML parsing errors so we can blame the feed authors.
 	return xml
 
-def parse(astr):
+def parsejson(astr):
+	js = json.loads(astr)
+
+	feed = {}
+	articles = []
+	now = datetime.now(pytz.utc)
+	now = now.replace(hour=20, minute=20, second=20, microsecond=20)
+
+	if "title" in js:
+		feed["title"] = js["title"]
+	if "description" in js:
+		feed["description"] = js["description"]
+	if "home_page_url" in js:
+		feed["link"] = js["home_page_url"]
+	if "feed_url" in js:
+		feed["link"] = js["feed_url"]
+	if "author" in js:
+		if "name" in js["author"]:
+			feed["author"] = js["author"]["name"]
+	feed["updated"] = now
+
+	for item in js["items"]:
+		article = {}
+		if "url" in item:
+			article["file"] = item["url"]
+		if "title" in item:
+			article["title"] = item["title"]
+		if "id" in item:
+			article["id"] = item["id"]
+		else:
+			if "link" in article:
+				article["id"] = article["link"]
+			elif "file" in article:
+				article["id"] = article["file"]
+			else:
+				article["id"] = article["text"][:30]
+
+		if "summary" in item:
+			article["text"] = html.unescape(item["summary"])
+		if "content_html" in item:
+			article["text"] = html.unescape(item["content_html"])
+		if "content_text" in item:
+			article["text"] = html.unescape(item["content_text"])
+		if "date_published" in item:
+			article["updated"] = \
+				dateutil.parser.parse(item["date_published"])
+		else:
+			article["updated"] = now
+
+		if article["updated"] == now:
+			article["uuid"] = ""
+		else:
+			article["uuid"] = "%s" % (article["updated"])
+
+		for e in ("id", "title", "file"):
+			if e in article:
+				article["uuid"] = "%s-%s" % \
+						(article["uuid"],\
+						article[e])
+
+		def mkuuid(s):
+			return hashlib.sha256(str(s).\
+				encode("utf8")).hexdigest()
+		if len(article["uuid"]) == 0:
+			article["uuid"] = mkuuid(now)
+		else:
+			article["uuid"] = mkuuid(article["uuid"])
+
+		# sanity checks
+		if "title" not in article and "text" not in article \
+				and "file" not in article:
+			continue
+
+		articles.append(article)
+
+	return feed
+
+def parseatom(astr):
 	xml = parsexml(astr)
 	if xml == None:
 		return None
@@ -246,6 +324,7 @@ def parse(astr):
 	return feed
 
 def fetch(uri):
+	ftype = "xml"
 	if "file://" in uri:
 		fd = codecs.open(uri[7:], "r", "utf-8")
 		fval = fd.read().encode("utf-8")
@@ -280,5 +359,18 @@ def fetch(uri):
 		fval = fd.content
 		rcode = fd.status_code
 
-	return (rcode, parse(fval))
+	if "Content-Type" in fd.headers:
+		if "application/json" in fd.headers["Content-Type"]:
+			ftype = "json"
+
+	if ftype == "xml":
+		suri = uri.lower().rsplit(".", 1)
+		if len(suri) > 1:
+			if suri[-1] == "json":
+				ftype = "json"
+
+	if ftype == "xml":
+		return (rcode, parsexml(fval))
+	else:
+		return (rcode, parsejson(fval))