Parse subjects for HTML content and clean it. - zs - Zeitungsschau rss to email converter

commit cc1e0defe58a83c1d59a31d72b7e8e7dec726883
parent ee341c7915c2d85c90eb6deef11c964fb88986fa
Author: Christoph Lohmann <20h@r-36.net>
Date:   Wed, 24 Jan 2018 14:07:16 +0100

Parse subjects for HTML content and clean it.

Some RSS feeds put doubly escaped HTML (escaped HTML inside escaped HTML)
in their subjects. We need to clean this mess up manually on our side.

Diffstat:
zeitungsschau/feedemail.py  | 7 +++++--

1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py
@@ -12,11 +12,14 @@ from email.utils import formataddr, formatdate, parseaddr
 from email.header import Header
 import time
 import subprocess
+import lxml.html
 
 import html2text
 
 def normalizeheader(hstr):
-	return hstr.replace("\n", " ").strip()
+	return lxml.html.fromstring(hstr).text_content().\
+			replace(u"\xa0", "").\
+			replace("\n", " ").strip()
 
 class LocalSendmail(object):
 	cmd="/usr/sbin/sendmail -f \"%s\" \"%s\""
@@ -58,7 +61,7 @@ def send(feed, to, smtphost="localhost", smtpport=None, ssl="False",\
 					normalizeheader(article["title"]),\
 					"utf-8")
 		else:
-			subject = Header(normalizeheader(text[:70]),\
+			subject = Header(normalizeheader(text[:20]),\
 					"utf-8")
 
 		# Append metadata.

	zs Zeitungsschau rss to email converter
	git clone git://r-36.net/zs
	Log \| Files \| Refs \| LICENSE