zs

Zeitungsschau rss to email converter
git clone git://r-36.net/zs
Log | Files | Refs | LICENSE

commit 5a5d10ddc8ffc58403a4469fa04edf781148e9d7
Author: Christoph Lohmann <20h@r-36.net>
Date:   Sun,  9 Mar 2014 18:26:25 +0100

Initial commit of Zeitungsschau.

Diffstat:
feed.py | 170+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
feeddb.py | 180+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
feedemail.py | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
opml.py | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
zs.py | 122+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 620 insertions(+), 0 deletions(-)

# Reconstructed from the collapsed commit diff: five new files, shown
# back-to-back with separator comments.  Concrete fixes against the diff
# are marked inline with "Fix:" comments.

# ---------------------------------------------------------------- feed.py --
#
# Copy me if you can.
# by 20h
#

from lxml import objectify
from datetime import datetime
import dateutil.parser
import urllib.request, urllib.parse, urllib.error


def parseiso(dstr):
    """Parse an ISO-8601/RFC-822 date string into a datetime."""
    return dateutil.parser.parse(str(dstr))


def removenamespaces(xml):
    """Strip every XML namespace prefix from the element tags, in place."""
    for key in xml.nsmap:
        nsstr = u'{%s}' % (xml.nsmap[key])
        nsl = len(nsstr)

        for elem in xml.getiterator():
            if elem.tag.startswith(nsstr):
                elem.tag = elem.tag[nsl:]


def parsexml(astr):
    """Parse a feed byte string into a namespace-free objectify tree."""
    xml = objectify.fromstring(astr)
    removenamespaces(xml)
    return xml


def parse(astr):
    """Parse an Atom/RSS/RDF feed byte string into a plain dict.

    Returns feed metadata ("title", "description", "updated", "link",
    "email", "author") plus an "articles" list of per-entry dicts.
    """
    xml = parsexml(astr)

    feed = {}
    articles = []
    isrss = False
    isrdf = False

    if hasattr(xml, "channel"):
        # RDF keeps <item> elements beside <channel>; plain RSS nests
        # them inside it.  Remember the root so the item loop below can
        # go back to it in the RDF case.
        if hasattr(xml, "item"):
            isrdf = True
            oxml = xml
        xml = xml.channel
        isrss = True

    feed["title"] = ""
    for e in ("title", "description"):
        if hasattr(xml, e):
            feed[e] = str(xml[e])

    # Fall back to the channel image's title.  (Fix: the diff tested
    # '"title" not in feed', which is always false because "title" is
    # pre-set above -- test for emptiness instead.)
    if hasattr(xml, "image") and hasattr(xml.image, "title"):
        if not feed["title"]:
            feed["title"] = str(xml.image.title)

    if hasattr(xml, "updated"):
        feed["updated"] = parseiso(xml.updated)
    elif hasattr(xml, "pubDate"):
        feed["updated"] = parseiso(xml.pubDate)
    elif hasattr(xml, "lastBuildDate"):
        feed["updated"] = parseiso(xml.lastBuildDate)
    else:
        feed["updated"] = datetime.now()

    if hasattr(xml, "link"):
        if "href" in xml.link.attrib:
            feed["link"] = str(xml.link.attrib["href"])
        else:
            feed["link"] = str(xml.link)

    if hasattr(xml, "webmaster"):
        feed["email"] = str(xml.webmaster)
    elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
        feed["email"] = str(xml.owner.email)
    elif hasattr(xml, "author") and hasattr(xml.author, "email"):
        feed["email"] = str(xml.author.email)
    elif hasattr(xml, "webMaster"):
        feed["email"] = str(xml.webMaster)
    elif hasattr(xml, "managingeditor"):
        feed["email"] = str(xml.managingeditor)
    elif hasattr(xml, "managingEditor"):
        feed["email"] = str(xml.managingEditor)

    if hasattr(xml, "author"):
        if hasattr(xml.author, "name"):
            feed["author"] = str(xml.author.name)
        else:
            feed["author"] = str(xml.author)
    elif hasattr(xml, "creator"):
        feed["author"] = str(xml.creator)

    # Atom uses <entry>, RSS and RDF use <item>.
    entryname = "entry"
    if isrss == True or isrdf == True:
        entryname = "item"
    if isrdf == True:
        xml = oxml
    if hasattr(xml, entryname):
        for entry in xml[entryname][:]:
            article = {}
            # title
            if hasattr(entry, "title"):
                article["title"] = str(entry["title"])

            # link
            if hasattr(entry, "link"):
                if "href" in entry.link.attrib:
                    article["link"] = str(entry.link.attrib["href"])
                else:
                    article["link"] = str(entry.link)
            elif hasattr(entry, "source"):
                article["link"] = str(entry.source)

            # id
            if hasattr(entry, "id"):
                article["id"] = str(entry["id"])

            # enclosure (podcast media etc.)
            if hasattr(entry, "enclosure"):
                if "href" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["href"])
                elif "url" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["url"])
                else:
                    article["file"] = str(entry.enclosure)

            # updated
            if hasattr(entry, "updated"):
                article["updated"] = parseiso(entry.updated)
            elif hasattr(entry, "pubDate"):
                article["updated"] = parseiso(entry.pubDate)
            elif hasattr(entry, "date"):
                article["updated"] = parseiso(entry.date)
            else:
                article["updated"] = datetime.now()

            # author
            if hasattr(entry, "author"):
                if hasattr(entry.author, "name"):
                    article["author"] = str(entry.author.name)
                else:
                    article["author"] = str(entry.author)
            elif hasattr(entry, "creator"):
                article["author"] = str(entry.creator)

            # tags
            if hasattr(entry, "category"):
                article["tags"] = []
                for cat in entry["category"][:]:
                    article["tags"].append(str(cat))

            # text: prefer full content over summary/description
            if hasattr(entry, "encoded"):
                article["text"] = str(entry.encoded)
            elif hasattr(entry, "content"):
                article["text"] = str(entry.content)
            elif hasattr(entry, "summary"):
                article["text"] = str(entry.summary)
            elif hasattr(entry, "description"):
                article["text"] = str(entry.description)

            articles.append(article)
    feed["articles"] = articles

    return feed


class feedopener(urllib.request.FancyURLopener):
    # Custom User-Agent for feed fetching.
    version = "Zeitungsschau/1.0"
# NOTE(review): _urlopener only affects the legacy urlretrieve() path and
# expects an instance, not a class; urlopen() below ignores it entirely.
# Kept as committed -- confirm intent.
urllib.request._urlopener = feedopener


def fetch(uri):
    """Download *uri* and return the parsed feed dict."""
    return parse(urllib.request.urlopen(uri).read())


# -------------------------------------------------------------- feeddb.py --
#!/usr/bin/env python
# coding=utf-8
#
# Copy me if you can.
# by 20h
#

import shelve
import os
import os.path
import fcntl
from subprocess import Popen


class feeddb(object):
    """Lock-protected, shelve-backed store for feeds and configuration."""

    db = None      # shelve handle
    lockf = None   # lock file handle, held exclusively for our lifetime
    feeds = {}     # uri -> feed dict
    cfg = {}       # configuration key -> value

    def __init__(self, path="~/.zs/feed.db", email=None):
        dbpath = os.path.expanduser(path)
        path = os.path.abspath(os.path.dirname(dbpath))
        if not os.path.exists(path):
            os.makedirs(path, 0o750)
        # One exclusive lock per database file keeps concurrent zs
        # invocations from clobbering each other.
        lockpath = "%s.lck" % (dbpath)
        self.lockf = open(lockpath, "w")
        fcntl.lockf(self.lockf.fileno(), fcntl.LOCK_EX)
        self.db = shelve.open(dbpath)
        if "feeds" in self.db:
            self.feeds = self.db["feeds"]
        if "cfg" in self.db:
            self.cfg = self.db["cfg"]

        if not "email" in self.cfg:
            print("You need to specify the default email. Please "
                  "run 'zs cfg email me@me.com' to "
                  "set it.")

        # Fill in SMTP defaults so callers can read them unconditionally.
        if not "smtphost" in self.cfg:
            self.cfg["smtphost"] = "localhost"
        if not "smtpport" in self.cfg:
            self.cfg["smtpport"] = None
        if not "smtpssl" in self.cfg:
            self.cfg["smtpssl"] = False
        if not "smtpuser" in self.cfg:
            self.cfg["smtpuser"] = None
        if not "smtppassword" in self.cfg:
            self.cfg["smtppassword"] = None

    def __del__(self):
        # Persist state back to the shelve and release the lock.
        if self.db != None:
            self.db["feeds"] = self.feeds
            self.db["cfg"] = self.cfg
            self.db.close()
        if self.lockf != None:
            fcntl.flock(self.lockf.fileno(), fcntl.LOCK_UN)
            self.lockf.close()

    def readfeed(self, uri):
        """Return the stored feed dict for *uri*, or None."""
        if not uri in self.feeds:
            return None
        return self.feeds[uri]

    def writefeed(self, uri, feed):
        """Store *feed* under *uri*."""
        self.feeds[uri] = feed

    def sethook(self, uri, hookfile):
        """Attach a shell hook script to the feed."""
        feed = self.readfeed(uri)
        if feed == None:
            return
        feed["hook"] = hookfile
        self.writefeed(uri, feed)

    def runhook(self, uri):
        """Run the feed's hook script detached, output discarded."""
        feed = self.readfeed(uri)
        if feed == None:
            return
        if not "hook" in feed:
            return

        cmd = os.path.expanduser(feed["hook"])
        if not os.path.exists(cmd):
            return

        # Fix: the diff opened /dev/null read-only (unusable as
        # stdout/stderr) and the forked child fell through into the
        # caller's code after the hook finished; _exit() stops it here.
        fd = open("/dev/null", "w")
        if os.fork() == 0:
            p = Popen(cmd, shell=True, stdout=fd, stderr=fd)
            p.wait()
            os._exit(0)

    def setfeedval(self, uri, key, value):
        """Set one key of the stored feed and write it back."""
        feed = self.readfeed(uri)
        if feed == None:
            return
        feed[key] = value
        self.writefeed(uri, feed)

    def pause(self, uri):
        self.setfeedval(uri, "pause", True)

    def unpause(self, uri):
        self.setfeedval(uri, "pause", False)

    def addfeed(self, uri, email=None):
        """Register a new feed; *email* overrides the default recipient."""
        if not uri in self.feeds:
            feed = {}
            if email == None:
                feed["toemail"] = self.cfg["email"]
            else:
                feed["toemail"] = email
            feed["uri"] = uri
            feed["pause"] = False
            feed["articles"] = []
            self.writefeed(uri, feed)

    def delfeed(self, uri):
        if uri in self.feeds:
            del self.feeds[uri]

    def listfeeds(self):
        """Return all feed URIs, paused ones included."""
        return list(self.feeds.keys())

    def listactivefeeds(self):
        """Return the URIs of all non-paused feeds."""
        rfeeds = []
        for f in self.feeds:
            if self.feeds[f]["pause"] == False:
                rfeeds.append(f)
        return rfeeds

    def mergefeed(self, uri, curfeed):
        """Merge freshly fetched *curfeed* into the stored feed.

        Returns *curfeed* with "articles" reduced to the truly new
        entries, each flagged "unread".
        """
        rarticles = []
        feed = self.readfeed(uri)
        if feed == None:
            return curfeed

        history = feed["articles"]
        # Fix: stored articles carry a bookkeeping "unread" key that
        # freshly fetched ones lack, so a plain 'article in history'
        # never matched and every article was re-delivered on each run.
        # Compare with "unread" masked out.
        seen = [{k: v for k, v in a.items() if k != "unread"}
                for a in history]
        for article in curfeed["articles"]:
            if not article in seen:
                article["unread"] = True
                history.append(article)
                rarticles.append(article)
        feed["articles"] = history

        for metakey in ("link", "title", "updated", "author",
                        "email"):
            if metakey in curfeed:
                feed[metakey] = curfeed[metakey]

        self.writefeed(uri, feed)
        curfeed["articles"] = rarticles

        return curfeed

    def unreadarticles(self, uri):
        """Return a feed-shaped dict holding only the unread articles."""
        rfeed = {}
        rfeed["articles"] = []
        feed = self.readfeed(uri)
        if feed == None:
            return rfeed

        for metakey in ("link", "title", "updated", "author",
                        "email", "toemail"):
            if metakey in feed:
                rfeed[metakey] = feed[metakey]

        history = feed["articles"]
        for article in history:
            if article["unread"] == True:
                rfeed["articles"].append(article)

        return rfeed

    def setreadarticles(self, uri, curfeed=None):
        """Mark every article of *curfeed* as read in the stored feed.

        Fixes against the diff: the key is "articles" (there is no
        "history" key), "unread" must be assigned (the diff compared
        with ==, a no-op), the change has to be written back, and the
        default curfeed=None must not crash.
        """
        feed = self.readfeed(uri)
        if feed == None or curfeed == None:
            return

        for article in feed["articles"]:
            if article in curfeed["articles"]:
                article["unread"] = False
        self.writefeed(uri, feed)


# ----------------------------------------------------------- feedemail.py --
#!/usr/bin/env python
# coding=utf-8
#
# Copy me if you can.
# by 20h
#

import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.utils import formataddr, formatdate, parseaddr
from email.header import Header

import html2text as h2t
h2t.UNICODE_SNOB = 1
h2t.LINKS_EACH_PARAGRAPH = 0
h2t.BODY_WIDTH = 0
h2t.INLINE_LINKS = 0
html2text = h2t.html2text


def normalizeheader(hstr):
    """Collapse newlines so the value is legal in a mail header."""
    return hstr.replace("\n", " ").strip()


def send(feed, to, smtphost="localhost", smtpport=None, ssl=False,
         user=None, password=None):
    """Send every article of *feed* as an individual mail to *to*.

    feed      -- dict as produced by feeddb.unreadarticles()
    to        -- recipient address
    ssl       -- use SMTPS; otherwise STARTTLS is attempted on login
    """
    articles = feed["articles"]

    for article in articles:
        if "text" in article:
            text = html2text(article["text"])
        else:
            text = ""

        if "title" in article:
            subject = Header(
                normalizeheader(article["title"]),
                "utf-8")
        else:
            # No title: fall back to the first 70 chars of the body.
            subject = Header(normalizeheader(text[:70]),
                             "utf-8")

        # Append metadata.
        if "link" in article:
            text = "%sLink: %s\n" % (text, article["link"])
        if "file" in article:
            text = "%sEnclosure: %s\n" % (text, article["file"])

        msg = MIMEText(text, "plain", "utf-8")

        if "email" in feed:
            faddr = feed["email"]
        else:
            faddr = "none@none.no"
        # Fix: the diff left fname unbound (NameError) when the feed
        # had no title; fall back to the sender address as the name.
        if "title" in feed:
            if "author" in article:
                fname = "%s: %s" % (feed["title"],
                                    article["author"])
            else:
                fname = feed["title"]
        else:
            fname = faddr

        msg["From"] = formataddr((fname, faddr))
        msg["To"] = formataddr(parseaddr(to))
        msg["Date"] = formatdate()
        msg["Subject"] = subject

        if "link" in article:
            msg["X-RSS-URL"] = article["link"]
        if "link" in feed:
            msg["X-RSS-Feed"] = feed["link"]
        if "id" in article:
            msg["X-RSS-ID"] = article["id"]
        if "tags" in article:
            msg["X-RSS-TAGS"] = Header(",".join(article["tags"]),
                                       "utf-8")
        msg["User-Agent"] = "Zeitungsschau"

        print(msg.as_string())

        if ssl == True:
            s = smtplib.SMTP_SSL()
        else:
            s = smtplib.SMTP()
        if smtpport != None:
            s.connect(smtphost, smtpport)
        else:
            s.connect(smtphost)

        if user != None and password != None:
            s.ehlo()
            if ssl == False:
                # Upgrade the plain connection before authenticating;
                # EHLO must be repeated after STARTTLS.
                s.starttls()
                s.ehlo()
            s.login(user, password)

        s.sendmail(faddr, to, msg.as_string())
        s.quit()


# ---------------------------------------------------------------- opml.py --
#
# Copy me if you can.
# by 20h
#

from lxml import etree
from datetime import datetime


def read(ostr):
    """Extract the feed URLs from an OPML byte string."""
    parser = etree.XMLParser(recover=True, encoding='utf-8')
    xml = etree.fromstring(ostr, parser)

    rssfeeds = []

    feeds = xml.xpath("//outline")
    for feed in feeds:
        if "xmlUrl" in feed.attrib:
            rssfeeds.append(feed.attrib["xmlUrl"])
        elif "text" in feed.attrib:
            rssfeeds.append(feed.attrib["text"])

    return rssfeeds


def write(rssfeeds):
    """Serialize a list of feed URLs into an OPML document string."""
    opmle = etree.Element("opml")

    heade = etree.SubElement(opmle, "head")
    # Deliberately empty <title> element.
    titlee = etree.SubElement(heade, "title")

    daten = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
    datece = etree.SubElement(heade, "dateCreated")
    datece.text = daten
    dateme = etree.SubElement(heade, "dateModified")
    dateme.text = daten
    ownerne = etree.SubElement(heade, "ownerName")
    ownerne.text = "Me"
    docse = etree.SubElement(heade, "docs")
    docse.text = "http://dev.opml.org/spec2.html"

    bodye = etree.SubElement(opmle, "body")

    for rss in rssfeeds:
        outlinee = etree.SubElement(bodye, "outline")
        outlinee.attrib["type"] = "rss"
        outlinee.attrib["text"] = rss
        outlinee.attrib["xmlUrl"] = rss

    return etree.tostring(opmle, encoding="utf-8",
                          pretty_print=True,
                          xml_declaration=True).decode("utf-8")


# ------------------------------------------------------------------ zs.py --
#!/usr/bin/env python
# coding=utf-8
#
# Copy me if you can.
# by 20h
#

import sys
import os
import feed
import feeddb
import opml
import feedemail


def run(db, selfeed=None):
    """Fetch feeds and mail the unread articles.

    Fetches all active feeds, or only *selfeed* when given (an explicit
    selection works even for a paused feed).
    """
    # Fix: the diff iterated listfeeds() here, so pause/unpause had no
    # effect on normal runs.
    feeduris = db.listactivefeeds()

    if selfeed != None and selfeed in db.listfeeds():
        feeduris = [selfeed]
    print("feeduris: %s" % (feeduris))

    for feeduri in feeduris:
        curfeed = feed.fetch(feeduri)
        print("curfeed: %s" % (curfeed))
        db.mergefeed(feeduri, curfeed)
        ufeed = db.unreadarticles(feeduri)
        print("unread: %s" % (ufeed))

        if "toemail" in ufeed:
            toemail = ufeed["toemail"]
        else:
            toemail = db.cfg["email"]
        feedemail.send(ufeed, toemail, db.cfg["smtphost"],
                       db.cfg["smtpport"], db.cfg["smtpssl"],
                       db.cfg["smtpuser"], db.cfg["smtppassword"])
        db.setreadarticles(feeduri, ufeed)


def usage(app):
    """Print a usage summary and exit with status 1."""
    app = os.path.basename(app)
    sys.stderr.write("usage: %s [-h] cmd\n" % (app))
    sys.exit(1)


def main(args):
    """Command dispatcher: run/cfg/add/list/delete/pause/unpause/opml*."""
    retval = 0

    if len(args) < 2:
        usage(args[0])

    db = feeddb.feeddb()

    if args[1] == "run":
        if len(args) > 2:
            run(db, args[2])
        else:
            run(db)

    elif args[1] == "cfg":
        # No key: dump all.  Key only: show one.  Key + value: set.
        if len(args) < 3:
            for k in db.cfg:
                print("%s = '%s'" % (k, db.cfg[k]))
        elif len(args) < 4:
            if args[2] in db.cfg:
                print("%s = '%s'" % (args[2],
                                     db.cfg[args[2]]))
            else:
                retval = 1
        else:
            db.cfg[args[2]] = args[3]
            print("%s = '%s'" % (args[2], db.cfg[args[2]]))

    elif args[1] == "add":
        if len(args) < 3:
            usage(args[0])
        email = None
        if len(args) > 3:
            email = args[3]
        db.addfeed(args[2], email)

    elif args[1] == "list":
        for f in db.listfeeds():
            print(f)

    elif args[1] == "delete":
        if len(args) < 3:
            usage(args[0])
        # Fix: the diff passed args[1] (the word "delete") instead of
        # the feed URI.
        db.delfeed(args[2])

    elif args[1] == "pause":
        if len(args) < 3:
            usage(args[0])
        db.pause(args[2])

    elif args[1] == "unpause":
        if len(args) < 3:
            usage(args[0])
        db.unpause(args[2])

    elif args[1] == "opmlexport":
        if len(args) > 2:
            filen = open(args[2], "w")
        else:
            filen = sys.stdout
        filen.write(opml.write(db.listfeeds()))

    elif args[1] == "opmlimport":
        if len(args) > 2:
            filen = open(args[2], "r")
        else:
            filen = sys.stdin
        feedlist = db.listfeeds()
        nfeedlist = opml.read(filen.read().encode("utf-8"))
        for f in nfeedlist:
            if not f in feedlist:
                print("import feed: %s" % (f))
                db.addfeed(f)

    # Delete explicitly so feeddb.__del__ persists the shelve now.
    del db
    return retval


if __name__ == "__main__":
    sys.exit(main(sys.argv))