commit 18454a1fe6f19aa9cdd780128a713066ee23ba9d
parent 0626d417d52845d03223244faa8238210ff87229
Author: Christoph Lohmann <20h@r-36.net>
Date: Wed, 19 Mar 2014 18:14:03 +0100
Add handling for XML syntax errors.
If XML fails, try HTML. If that fails, bail.
Diffstat:
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/feed.py b/feed.py
@@ -6,6 +6,7 @@
#
from lxml import objectify
+from lxml import etree
from datetime import datetime
import dateutil.parser
import urllib.request, urllib.parse, urllib.error
@@ -23,12 +24,22 @@ def removenamespaces(xml):
elem.tag = elem.tag[nsl:]
def parsexml(astr):
- xml = objectify.fromstring(astr)
- removenamespaces(xml)
+ try:
+ xml = objectify.fromstring(astr)
+ removenamespaces(xml)
+ except etree.XMLSyntaxError:
+ try:
+ parser = etree.HTMLParser()
+ xml = objectify.fromstring(astr, parser)
+ removenamespaces(xml)
+ except etree.XMLSyntaxError:
+ return None
return xml
def parse(astr):
xml = parsexml(astr)
+ if xml == None:
+ return None
feed = {}
articles = []
diff --git a/zs.py b/zs.py
@@ -46,6 +46,9 @@ def run(db, selfeed=None, dryrun=False):
estr = "incompleteread"
continue
+ if curfeed == None:
+ continue
+
# retry handling
if estr != None:
if retries > 2: