commit 9891ca73640aa4fa074c54e92913f847ba1e756b
parent 9e95a0f332a1bfabfba59c9bad6460e70731db9f
Author: Christoph Lohmann <20h@r-36.net>
Date: Wed, 11 Nov 2015 22:08:35 +0100
Simplify parsing and unescape text entries.
Diffstat:
1 file changed, 29 insertions(+), 33 deletions(-)
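
Note: every metadata field below (feed and entry titles, authors, e-mail
addresses, tags) is now routed through html.unescape() from the Python 3.4+
standard library, so entities that survive the XML decoding step are stored
as plain text. A minimal sketch of the effect, with an invented sample string:

	import html

	raw = "Q&amp;A &#8211; caf&eacute; reviews"
	print(html.unescape(raw))  # -> "Q&A – café reviews"
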
diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -13,6 +13,7 @@ import requests
 import hashlib
 import pytz
 import codecs
+import html
 
 def parseiso(dstr, now):
 	try:
@@ -32,18 +33,9 @@ def removenamespaces(xml):
 		elem.tag = elem.tag[nsl:]
 
 def parsexml(astr):
-	try:
-		xml = objectify.fromstring(astr)
-		removenamespaces(xml)
-	except etree.XMLSyntaxError:
-		try:
-			parser = etree.HTMLParser()
-			xml = objectify.fromstring(astr, parser)
-			removenamespaces(xml)
-		except etree.XMLSyntaxError:
-			parser = etree.XMLParser(resolve_entities=False)
-			xml = objectify.fromstring(astr, parser)
-			removenamespaces(xml)
+	xml = objectify.fromstring(astr)
+	removenamespaces(xml)
+	# Throw XML parsing errors so we can blame the feed authors.
 	return xml
 
 def parse(astr):
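
With the HTMLParser and resolve_entities fallbacks gone, parsexml() now lets
lxml's etree.XMLSyntaxError propagate to the caller. A sketch of catching it
at a call site; the package import path and the truncated sample feed are
assumptions:

	from lxml import etree
	from zeitungsschau import feed

	raw = "<rss><channel><title>ok</title>"  # deliberately truncated
	try:
		f = feed.parse(raw)
	except etree.XMLSyntaxError as err:
		print("invalid feed XML:", err)  # blame the feed author
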
@@ -57,10 +49,6 @@ def parse(astr):
 	isrdf = False
 	now = datetime.now(pytz.utc)
 
-	feede = xml.xpath(".//feed")
-	if len(feede) > 0:
-		xml = feede[0]
-
 	if hasattr(xml, "channel"):
 		if hasattr(xml, "item"):
 			isrdf = True
@@ -71,11 +59,11 @@ def parse(astr):
feed["title"] = ""
for e in ("title", "description"):
if hasattr(xml, e):
- feed[e] = str(xml[e])
+ feed[e] = html.unescape(str(xml[e]))
if hasattr(xml, "image") and hasattr(xml.image, "title"):
if "title" not in feed:
- feed["title"] = str(xml.image.title)
+ feed["title"] = html.unescape(str(xml.image.title))
if hasattr(xml, "updated"):
feed["updated"] = parseiso(xml.updated, now)
@@ -93,25 +81,25 @@ def parse(astr):
feed["link"] = str(xml.link)
if hasattr(xml, "webmaster"):
- feed["email"] = str(xml.webmaster)
+ feed["email"] = html.unescape(str(xml.webmaster))
elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
- feed["email"] = str(xml.owner.email)
+ feed["email"] = html.unescape(str(xml.owner.email))
elif hasattr(xml, "author") and hasattr(xml.author, "email"):
- feed["email"] = str(xml.author.email)
+ feed["email"] = html.unescape(str(xml.author.email))
elif hasattr(xml, "webMaster"):
- feed["email"] = str(xml.webMaster)
+ feed["email"] = html.unescape(str(xml.webMaster))
elif hasattr(xml, "managingeditor"):
- feed["email"] = str(xml.managingeditor)
+ feed["email"] = html.unescape(str(xml.managingeditor))
elif hasattr(xml, "managingEditor"):
- feed["email"] = str(xml.managingEditor)
+ feed["email"] = html.unescape(str(xml.managingEditor))
if hasattr(xml, "author"):
if hasattr(xml.author, "name"):
- feed["author"] = str(xml.author.name)
+ feed["author"] = html.unescape(str(xml.author.name))
else:
- feed["author"] = str(xml.author)
+ feed["author"] = html.unescape(str(xml.author))
elif hasattr(xml, "creator"):
- feed["author"] = str(xml.creator)
+ feed["author"] = html.unescape(str(xml.creator))
entryname = "entry"
if isrss == True or isrdf == True:
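
The e-mail fallback chain above exists because lxml.objectify's
attribute-style child lookup is case-sensitive, so <webMaster> and
<webmaster> (likewise the two managingEditor spellings) are distinct
children. A toy demonstration with an invented fragment:

	from lxml import objectify

	xml = objectify.fromstring("<channel><webMaster>e@x.org</webMaster></channel>")
	print(hasattr(xml, "webmaster"))  # False: tag lookup is case-sensitive
	print(str(xml.webMaster))         # e@x.org
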
@@ -123,7 +111,8 @@ def parse(astr):
 		article = {}
 		# title
 		if hasattr(entry, "title"):
-			article["title"] = str(entry["title"])
+			article["title"] = html.unescape(\
+				str(entry["title"]))
 
 		# link
 		if hasattr(entry, "link"):
@@ -149,8 +138,9 @@ def parse(astr):
 				hasattr(entry.group, "content"):
 			if "url" in entry.group.content:
 				article["file"] = \
+					html.unescape(\
 					str(entry.group.content.\
-					attrib["file"])
+					attrib["file"]))
 
 		# updated
 		try:
@@ -171,19 +161,25 @@ def parse(astr):
 		# author
 		if hasattr(entry, "author"):
 			if hasattr(entry.author, "name"):
-				article["author"] = str(entry.author.name)
+				article["author"] = html.unescape(\
+					str(entry.author.name))
 			else:
-				article["author"] = str(entry.author)
+				article["author"] = html.unescape(\
+					str(entry.author))
 		elif hasattr(entry, "creator"):
-			article["author"] = str(entry.creator)
+			article["author"] = html.unescape(\
+				str(entry.creator))
 
 		# tags
 		if hasattr(entry, "category"):
 			article["tags"] = []
 			for cat in entry["category"][:]:
-				article["tags"].append(str(cat))
+				article["tags"].append(\
+					html.unescape(\
+					str(cat)))
 
 		# text
+		# Don't unescape the text, it might contain HTML.
 		if hasattr(entry, "encoded"):
 			article["text"] = str(entry.encoded)
 		elif hasattr(entry, "content"):