commit 622855cba32e158a1b0cacea44e21361685d2577
parent 10c3bbd37c63294007b0f3c28d47665fc625785b
Author: Christoph Lohmann <20h@r-36.net>
Date: Thu, 10 Sep 2020 14:11:51 +0200
Merge branch 'master' of ssh://r-36.net:443/srv/git/zs
Diffstat:
3 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
@@ -5,8 +5,9 @@
# by 20h
#
-from lxml import objectify
-from lxml import etree
+import lxml
+import lxml.objectify
+import html
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
@@ -14,7 +15,6 @@ import requests
import hashlib
import pytz
import codecs
-import html
import urllib.parse
import socket
import json
@@ -44,9 +44,10 @@ def removenamespaces(xml):
elem.tag = elem.tag[nsl:]
def parsexml(astr):
- xml = objectify.fromstring(astr)
+ xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
removenamespaces(xml)
# Throw XML parsing errors so we can blame the feed authors.
+ #print(lxml.objectify.dump(xml))
return xml
def parsetwtxtfeed(astr, uri):
@@ -278,6 +279,8 @@ def parseatomfeed(astr):
if hasattr(entry, "updated"):
article["updated"] = parseiso(entry.updated,\
now)
+ elif hasattr(entry, "temporary"):
+ article["updated"] = now
elif hasattr(entry, "pubDate"):
article["updated"] = parseiso(entry.pubDate,\
now)
@@ -397,7 +400,6 @@ def fetch(uri):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((host, port))
s.send(("%s\r\n" % (selector)).encode("utf-8"))
- s.shutdown(1)
fd = s.makefile("r")
fval = fd.read().encode("utf-8")
s.close()
diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py
@@ -13,6 +13,7 @@ from email.header import Header
import time
import subprocess
import lxml.html
+import lxml.etree
import urllib.parse
import html2text
@@ -20,10 +21,13 @@ import html2text
def normalizeheader(hstr):
if len(hstr) == 0:
return ""
+ try:
+ return lxml.html.fromstring(hstr).text_content().\
+ replace(u"\xa0", "").\
+ replace("\n", " ").strip()
+ except lxml.etree.ParserError:
+ return ""
- return lxml.html.fromstring(hstr).text_content().\
- replace(u"\xa0", "").\
- replace("\n", " ").strip()
class LocalSendmail(object):
cmd="/usr/sbin/sendmail -f \"%s\" \"%s\""
diff --git a/zs b/zs
@@ -52,6 +52,11 @@ def run(db, selfeed=None, dryrun=False, onlychanges=False):
print("fetch %s" % (feeduri))
curfeed = None
rcode = 0
+
+ """
+ # All errors.
+ (rcode, curfeed) = feed.fetch(feeduri)
+ """
try:
(rcode, curfeed) = feed.fetch(feeduri)
except socket.gaierror: