commit 9891ca73640aa4fa074c54e92913f847ba1e756b
parent 9e95a0f332a1bfabfba59c9bad6460e70731db9f
Author: Christoph Lohmann <20h@r-36.net>
Date: Wed, 11 Nov 2015 22:08:35 +0100
Simplify parsing and unescape text entries.
Diffstat:
1 file changed, 29 insertions(+), 33 deletions(-)
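
Note: every metadata field below (feed and entry titles, authors, e-mail
addresses, tags) is now routed through html.unescape() from the Python 3.4+
standard library, so entities that survive the XML decoding step are stored
as plain text. A minimal sketch of the effect, with an invented sample string:

	import html

	raw = "Q&amp;A &#8211; caf&eacute; reviews"
	print(html.unescape(raw))  # -> "Q&A – café reviews"
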
diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -13,6 +13,7 @@ import requests
 import hashlib
 import pytz
 import codecs
+import html
 
 def parseiso(dstr, now):
 	try:
@@ -32,18 +33,9 @@ def removenamespaces(xml):
 		elem.tag = elem.tag[nsl:]
 
 def parsexml(astr):
-	try:
-		xml = objectify.fromstring(astr)
-		removenamespaces(xml)
-	except etree.XMLSyntaxError:
-		try:
-			parser = etree.HTMLParser()
-			xml = objectify.fromstring(astr, parser)
-			removenamespaces(xml)
-		except etree.XMLSyntaxError:
-			parser = etree.XMLParser(resolve_entities=False)
-			xml = objectify.fromstring(astr, parser)
-			removenamespaces(xml)
+	xml = objectify.fromstring(astr)
+	removenamespaces(xml)
+	# Throw XML parsing errors so we can blame the feed authors.
 	return xml
 
 def parse(astr):
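
With the HTMLParser and resolve_entities fallbacks gone, parsexml() now lets
lxml's etree.XMLSyntaxError propagate to the caller. A sketch of catching it
at a call site; the package import path and the truncated sample feed are
assumptions:

	from lxml import etree
	from zeitungsschau import feed

	raw = "<rss><channel><title>ok</title>"  # deliberately truncated
	try:
		f = feed.parse(raw)
	except etree.XMLSyntaxError as err:
		print("invalid feed XML:", err)  # blame the feed author
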
@@ -57,10 +49,6 @@ def parse(astr):
 	isrdf = False
 	now = datetime.now(pytz.utc)
 
-	feede = xml.xpath(".//feed")
-	if len(feede) > 0:
-		xml = feede[0]
-
 	if hasattr(xml, "channel"):
 		if hasattr(xml, "item"):
 			isrdf = True
@@ -71,11 +59,11 @@ def parse(astr):
feed["title"] = ""
for e in ("title", "description"):
if hasattr(xml, e):
- feed[e] = str(xml[e])
+ feed[e] = html.unescape(str(xml[e]))
if hasattr(xml, "image") and hasattr(xml.image, "title"):
if "title" not in feed:
- feed["title"] = str(xml.image.title)
+ feed["title"] = html.unescape(str(xml.image.title))
if hasattr(xml, "updated"):
feed["updated"] = parseiso(xml.updated, now)
@@ -93,25 +81,25 @@ def parse(astr):
feed["link"] = str(xml.link)
if hasattr(xml, "webmaster"):
- feed["email"] = str(xml.webmaster)
+ feed["email"] = html.unescape(str(xml.webmaster))
elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
- feed["email"] = str(xml.owner.email)
+ feed["email"] = html.unescape(str(xml.owner.email))
elif hasattr(xml, "author") and hasattr(xml.author, "email"):
- feed["email"] = str(xml.author.email)
+ feed["email"] = html.unescape(str(xml.author.email))
elif hasattr(xml, "webMaster"):
- feed["email"] = str(xml.webMaster)
+ feed["email"] = html.unescape(str(xml.webMaster))
elif hasattr(xml, "managingeditor"):
- feed["email"] = str(xml.managingeditor)
+ feed["email"] = html.unescape(str(xml.managingeditor))
elif hasattr(xml, "managingEditor"):
- feed["email"] = str(xml.managingEditor)
+ feed["email"] = html.unescape(str(xml.managingEditor))
if hasattr(xml, "author"):
if hasattr(xml.author, "name"):
- feed["author"] = str(xml.author.name)
+ feed["author"] = html.unescape(str(xml.author.name))
else:
- feed["author"] = str(xml.author)
+ feed["author"] = html.unescape(str(xml.author))
elif hasattr(xml, "creator"):
- feed["author"] = str(xml.creator)
+ feed["author"] = html.unescape(str(xml.creator))
entryname = "entry"
if isrss == True or isrdf == True:
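
The e-mail fallback chain above exists because lxml.objectify's
attribute-style child lookup is case-sensitive, so <webMaster> and
<webmaster> (likewise the two managingEditor spellings) are distinct
children. A toy demonstration with an invented fragment:

	from lxml import objectify

	xml = objectify.fromstring("<channel><webMaster>e@x.org</webMaster></channel>")
	print(hasattr(xml, "webmaster"))  # False: tag lookup is case-sensitive
	print(str(xml.webMaster))         # e@x.org
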
@@ -123,7 +111,8 @@ def parse(astr):
 		article = {}
 		# title
 		if hasattr(entry, "title"):
-			article["title"] = str(entry["title"])
+			article["title"] = html.unescape(\
+				str(entry["title"]))
 
 		# link
 		if hasattr(entry, "link"):
@@ -149,8 +138,9 @@ def parse(astr):
 				hasattr(entry.group, "content"):
 			if "url" in entry.group.content:
 				article["file"] = \
+					html.unescape(\
 					str(entry.group.content.\
-					attrib["file"])
+					attrib["file"]))
 
 		# updated
 		try:
@@ -171,19 +161,25 @@ def parse(astr):
 		# author
 		if hasattr(entry, "author"):
 			if hasattr(entry.author, "name"):
-				article["author"] = str(entry.author.name)
+				article["author"] = html.unescape(\
+					str(entry.author.name))
 			else:
-				article["author"] = str(entry.author)
+				article["author"] = html.unescape(\
+					str(entry.author))
 		elif hasattr(entry, "creator"):
-			article["author"] = str(entry.creator)
+			article["author"] = html.unescape(\
+				str(entry.creator))
 
 		# tags
 		if hasattr(entry, "category"):
 			article["tags"] = []
 			for cat in entry["category"][:]:
-				article["tags"].append(str(cat))
+				article["tags"].append(\
+					html.unescape(\
+					str(cat)))
 
 		# text
+		# Don't unescape the text, it might contain HTML.
 		if hasattr(entry, "encoded"):
 			article["text"] = str(entry.encoded)
 		elif hasattr(entry, "content"):