zs

Zeitungsschau rss to email converter
git clone git://r-36.net/zs
Log | Files | Refs | LICENSE

commit cc1e0defe58a83c1d59a31d72b7e8e7dec726883
parent ee341c7915c2d85c90eb6deef11c964fb88986fa
Author: Christoph Lohmann <20h@r-36.net>
Date:   Wed, 24 Jan 2018 14:07:16 +0100

Parse subjects for HTML content and clean it.

Some RSS feeds put doubly escaped HTML (escaped HTML inside escaped HTML)
in their subjects. We need to clean this mess up manually on our side.

Diffstat:
zeitungsschau/feedemail.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py
@@ -12,11 +12,14 @@
 from email.utils import formataddr, formatdate, parseaddr
 from email.header import Header
 import time
 import subprocess
+import lxml.html
 import html2text
 
 def normalizeheader(hstr):
-	return hstr.replace("\n", " ").strip()
+	return lxml.html.fromstring(hstr).text_content().\
+			replace(u"\xa0", "").\
+			replace("\n", " ").strip()
 
 class LocalSendmail(object):
 	cmd="/usr/sbin/sendmail -f \"%s\" \"%s\""
@@ -58,7 +61,7 @@ def send(feed, to, smtphost="localhost", smtpport=None, ssl="False",\
 				normalizeheader(article["title"]),\
 				"utf-8")
 	else:
-		subject = Header(normalizeheader(text[:70]),\
+		subject = Header(normalizeheader(text[:20]),\
 				"utf-8")
 
 	# Append metadata.