commit cc1e0defe58a83c1d59a31d72b7e8e7dec726883
parent ee341c7915c2d85c90eb6deef11c964fb88986fa
Author: Christoph Lohmann <20h@r-36.net>
Date: Wed, 24 Jan 2018 14:07:16 +0100
Parse subjects for HTML content and clean it.
Some RSS feeds deliver subjects containing doubly escaped HTML (escaped HTML
inside escaped HTML). We need to clean this mess up manually on our side.
Diffstat:
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py
@@ -12,11 +12,14 @@ from email.utils import formataddr, formatdate, parseaddr
from email.header import Header
import time
import subprocess
+import lxml.html
import html2text
def normalizeheader(hstr):
- return hstr.replace("\n", " ").strip()
+ return lxml.html.fromstring(hstr).text_content().\
+ replace(u"\xa0", "").\
+ replace("\n", " ").strip()
class LocalSendmail(object):
cmd="/usr/sbin/sendmail -f \"%s\" \"%s\""
@@ -58,7 +61,7 @@ def send(feed, to, smtphost="localhost", smtpport=None, ssl="False",\
normalizeheader(article["title"]),\
"utf-8")
else:
- subject = Header(normalizeheader(text[:70]),\
+ subject = Header(normalizeheader(text[:20]),\
"utf-8")
# Append metadata.