commit 45fa8f0de67209e4cceeb1df6e3439efe55e7617
parent f9891c3921193415865fa7d3aae02407dea12ab0
Author: Christoph Lohmann <20h@r-36.net>
Date: Wed, 11 Nov 2015 18:02:11 +0100
Try HTML parsing first when entities are missing.
Diffstat:
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -33,10 +33,12 @@ def parsexml(astr):
except etree.XMLSyntaxError:
try:
parser = etree.HTMLParser()
- xml = objectify.fromstring(astr)
+ xml = objectify.fromstring(astr, parser)
removenamespaces(xml)
except etree.XMLSyntaxError:
- return None
+ parser = etree.XMLParser(resolve_entities=False)
+ xml = objectify.fromstring(astr, parser)
+ removenamespaces(xml)
return xml
def parse(astr):
@@ -50,6 +52,10 @@ def parse(astr):
isrdf = False
now = datetime.now(pytz.utc)
+ feede = xml.xpath(".//feed")
+ if len(feede) > 0:
+ xml = feede[0]
+
if hasattr(xml, "channel"):
if hasattr(xml, "item"):
isrdf = True