commit ac54587c59ad0bdd7c84681f295d924f27019644
parent 63fe7a682af0938334c13a7132b52b933dafec13
Author: Christoph Lohmann <20h@r-36.net>
Date: Mon, 22 May 2017 19:29:14 +0200
Add JSON Feed support.
Diffstat:
zeitungsschau/feed.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 94 insertions(+), 2 deletions(-)
diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -16,6 +16,7 @@ import codecs
import html
import urllib.parse
import socket
+import json
def parseiso(dstr, now):
try:
@@ -39,7 +40,84 @@ def parsexml(astr):
# Throw XML parsing errors so we can blame the feed authors.
return xml
-def parse(astr):
def parsejson(astr):
	"""Parse a JSON Feed document into the common feed dict.

	astr: raw JSON text of the feed.
	Returns a dict with optional "title", "description", "link" and
	"author" keys, an "updated" timestamp and an "articles" list of
	per-item dicts ("file", "title", "id", "text", "updated", "uuid").
	Raises json.JSONDecodeError on malformed input (blame the feed
	author, as with the XML path).
	"""
	js = json.loads(astr)

	feed = {}
	articles = []
	# Sentinel timestamp: items without their own date get this
	# value, which also marks "no real date" for the uuid logic.
	now = datetime.now(pytz.utc)
	now = now.replace(hour=20, minute=20, second=20, microsecond=20)

	if "title" in js:
		feed["title"] = js["title"]
	if "description" in js:
		feed["description"] = js["description"]
	if "home_page_url" in js:
		feed["link"] = js["home_page_url"]
	if "feed_url" in js:
		# feed_url deliberately overrides home_page_url.
		feed["link"] = js["feed_url"]
	if "author" in js and "name" in js["author"]:
		feed["author"] = js["author"]["name"]
	feed["updated"] = now

	def mkuuid(s):
		return hashlib.sha256(str(s).encode("utf8")).hexdigest()

	for item in js.get("items", []):
		article = {}
		if "url" in item:
			article["file"] = item["url"]
		if "title" in item:
			article["title"] = item["title"]

		# Extract the text first: the most specific key wins
		# (content_text > content_html > summary).
		if "summary" in item:
			article["text"] = html.unescape(item["summary"])
		if "content_html" in item:
			article["text"] = html.unescape(item["content_html"])
		if "content_text" in item:
			article["text"] = html.unescape(item["content_text"])

		# Id fallback chain.  This must run after the text has
		# been extracted: the old code consulted
		# article["text"][:30] before any text existed and
		# raised KeyError.  (The former "link" fallback was
		# dead code — article["link"] is never set.)
		if "id" in item:
			article["id"] = item["id"]
		elif "file" in article:
			article["id"] = article["file"]
		else:
			article["id"] = article.get("text", "")[:30]

		if "date_published" in item:
			article["updated"] = \
				dateutil.parser.parse(item["date_published"])
		else:
			article["updated"] = now

		# Build a stable uuid: the real date (when present)
		# plus the identifying fields, hashed.
		if article["updated"] == now:
			article["uuid"] = ""
		else:
			article["uuid"] = "%s" % (article["updated"])
		for e in ("id", "title", "file"):
			if e in article:
				article["uuid"] = "%s-%s" % \
					(article["uuid"], article[e])
		if len(article["uuid"]) == 0:
			article["uuid"] = mkuuid(now)
		else:
			article["uuid"] = mkuuid(article["uuid"])

		# Sanity check: drop items with nothing usable in them.
		if "title" not in article and "text" not in article \
				and "file" not in article:
			continue

		articles.append(article)

	# Bug fix: the parsed items were collected but never attached
	# to the returned feed, so every JSON feed came back empty.
	feed["articles"] = articles
	return feed
+
+def parseatom(astr):
xml = parsexml(astr)
if xml == None:
return None
@@ -246,6 +324,7 @@ def parse(astr):
return feed
def fetch(uri):
+ ftype = "xml"
if "file://" in uri:
fd = codecs.open(uri[7:], "r", "utf-8")
fval = fd.read().encode("utf-8")
@@ -280,5 +359,18 @@ def fetch(uri):
fval = fd.content
rcode = fd.status_code
- return (rcode, parse(fval))
+ if "Content-Type" in fd.headers:
+ if "application/json" in fd.headers["Content-Type"]:
+ ftype = "json"
+
+ if ftype == "xml":
+ suri = uri.lower().rsplit(".", 1)
+ if len(suri) > 1:
+ if suri[-1] == "json":
+ ftype = "json"
+
+ if ftype == "xml":
+ return (rcode, parsexml(fval))
+ else:
+ return (rcode, parsejson(fval))