#
# See LICENSE for licensing details.
#
# Copy me if you can.
# by 20h
#

import lxml
import lxml.objectify
import html
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
import requests
import hashlib
import pytz
import codecs
import urllib.parse
import socket
import json

# Parse a date string, falling back to the given "now" on failure.
def parseiso(dstr, now):
    def gettzinfo(zone, offset):
        try:
            return gettz(zone)
        except:
            return None

    try:
        return dateutil.parser.parse(str(dstr), default=now,
                tzinfos=gettzinfo)
    except:
        # Invalid time format. Could not be parsed.
        return now

# Strip all XML namespaces so elements can be addressed by bare tag names.
def removenamespaces(xml):
    for key in xml.nsmap:
        nsstr = u'{%s}' % (xml.nsmap[key])
        nsl = len(nsstr)

        for elem in xml.iter():
            if elem.tag.startswith(nsstr):
                elem.tag = elem.tag[nsl:]

# Parse feed bytes into an objectify tree with namespaces removed.
def parsexml(astr):
    xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
    removenamespaces(xml)
    # Throw XML parsing errors so we can blame the feed authors.
    #print(lxml.objectify.dump(xml))
    return xml

# Parse a twtxt feed: one "<RFC 3339 timestamp>\t<text>" entry per line,
# lines starting with "#" are comments.
def parsetwtxtfeed(astr, uri):
    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    # Pin the fallback time to a recognisable value so entries whose date
    # could not be parsed can be told apart below.
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    feed["title"] = uri
    feed["link"] = uri
    feed["updated"] = now

    lines = astr.split("\n")
    for line in lines:
        # People already reinterpret the standard. :(
        if len(line) == 0:
            continue
        if line[0] == "#":
            continue

        createdtxt, ltext = line.split("\t", 1)
        created = parseiso(createdtxt, now)

        article = {}
        article["id"] = createdtxt
        article["title"] = ltext
        article["text"] = ltext
        article["uuid"] = createdtxt
        article["updated"] = created

        if article["updated"] == now:
            article["uuid"] = ""
        else:
            article["uuid"] = "%s" % (article["updated"])

        articles.append(article)

    feed["articles"] = articles

    return feed

# Parse a JSON Feed (jsonfeed.org) document into the internal feed dict.
def parsejsonfeed(astr):
    js = json.loads(astr)

    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if "title" in js:
        feed["title"] = js["title"]
    if "description" in js:
        feed["description"] = js["description"]
    if "home_page_url" in js:
        feed["link"] = js["home_page_url"]
    if "feed_url" in js:
        feed["link"] = js["feed_url"]
    if "author" in js:
        if "name" in js["author"]:
            feed["author"] = js["author"]["name"]
    feed["updated"] = now

    if "items" in js:
        for item in js["items"]:
            article = {}
            if "url" in item:
                article["file"] = item["url"]
            if "title" in item:
                article["title"] = item["title"]

            if "summary" in item:
                article["text"] = html.unescape(item["summary"])
            if "content_html" in item:
                article["text"] = html.unescape(item["content_html"])
            if "content_text" in item:
                article["text"] = html.unescape(item["content_text"])
            if "date_published" in item:
                article["updated"] = \
                    dateutil.parser.parse(item["date_published"])
            else:
                article["updated"] = now

            # The id is determined after the text, since the last
            # fallback uses the article text.
            if "id" in item:
                article["id"] = item["id"]
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article["text"][:30]

            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % \
                        (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    feed["articles"] = articles

    return feed

# Parse an Atom, RSS or RDF feed into the internal feed dict.
def parseatomfeed(astr):
    xml = parsexml(astr)
    if xml == None:
        return None

    feed = {}
    articles = []
    isrss = False
    isrdf = False
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if hasattr(xml, "channel"):
        if hasattr(xml, "item"):
            isrdf = True
            oxml = xml
        xml = xml.channel
        isrss = True

    feed["title"] = ""
    for e in ("title", "description"):
        if hasattr(xml, e):
            feed[e] = html.unescape(str(xml[e]))

    if hasattr(xml, "image") and hasattr(xml.image, "title"):
        if feed["title"] == "":
            feed["title"] = html.unescape(str(xml.image.title))

    if hasattr(xml, "updated"):
        feed["updated"] = parseiso(xml.updated, now)
    elif hasattr(xml, "pubDate"):
        feed["updated"] = parseiso(xml.pubDate, now)
    elif hasattr(xml, "lastBuildDate"):
        feed["updated"] = parseiso(xml.lastBuildDate, now)
    else:
        feed["updated"] = now

    if hasattr(xml, "link"):
        if "href" in xml.link.attrib:
            feed["link"] = str(xml.link.attrib["href"])
        else:
            feed["link"] = str(xml.link)

    if hasattr(xml, "webmaster"):
        feed["email"] = html.unescape(str(xml.webmaster))
    elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
        feed["email"] = html.unescape(str(xml.owner.email))
    elif hasattr(xml, "author") and hasattr(xml.author, "email"):
        feed["email"] = html.unescape(str(xml.author.email))
    elif hasattr(xml, "webMaster"):
        feed["email"] = html.unescape(str(xml.webMaster))
    elif hasattr(xml, "managingeditor"):
        feed["email"] = html.unescape(str(xml.managingeditor))
    elif hasattr(xml, "managingEditor"):
        feed["email"] = html.unescape(str(xml.managingEditor))

    if hasattr(xml, "author"):
        if hasattr(xml.author, "name"):
            feed["author"] = html.unescape(str(xml.author.name))
        else:
            feed["author"] = html.unescape(str(xml.author))
    elif hasattr(xml, "creator"):
        feed["author"] = html.unescape(str(xml.creator))

    entryname = "entry"
    if isrss == True or isrdf == True:
        entryname = "item"
    if isrdf == True:
        xml = oxml
    if hasattr(xml, entryname):
        for entry in xml[entryname][:]:
            article = {}
            # title
            if hasattr(entry, "title"):
                article["title"] = html.unescape(str(entry["title"]))

            # link
            if hasattr(entry, "link"):
                if "href" in entry.link.attrib:
                    article["link"] = str(entry.link.attrib["href"])
                else:
                    article["link"] = str(entry.link)
            elif hasattr(entry, "source"):
                article["link"] = str(entry.source)

            # enclosure
            if hasattr(entry, "enclosure"):
                if "href" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["href"])
                elif "url" in entry.enclosure.attrib:
                    article["file"] = \
                        str(entry.enclosure.attrib["url"])
                else:
                    article["file"] = str(entry.enclosure)

            # media:group/media:content (e.g. YouTube)
            if hasattr(entry, "group") and \
                    hasattr(entry.group, "content"):
                if "url" in entry.group.content.attrib:
                    article["file"] = \
                        html.unescape(\
                        str(entry.group.content.attrib["url"]))

            # updated
            try:
                if hasattr(entry, "updated"):
                    article["updated"] = parseiso(entry.updated, now)
                elif hasattr(entry, "temporary"):
                    article["updated"] = now
                elif hasattr(entry, "pubDate"):
                    article["updated"] = parseiso(entry.pubDate, now)
                elif hasattr(entry, "date"):
                    article["updated"] = parseiso(entry.date, now)
                else:
                    article["updated"] = now
            except TypeError:
                # There was some error in parseiso.
                article["updated"] = now

            # author
            if hasattr(entry, "author"):
                if hasattr(entry.author, "name"):
                    article["author"] = html.unescape(\
                            str(entry.author.name))
                else:
                    article["author"] = html.unescape(\
                            str(entry.author))
            elif hasattr(entry, "creator"):
                article["author"] = html.unescape(str(entry.creator))

            # tags
            if hasattr(entry, "category"):
                article["tags"] = []
                for cat in entry["category"][:]:
                    article["tags"].append(html.unescape(str(cat)))

            # text
            # Don't unescape the text, it might contain HTML.
            if hasattr(entry, "encoded"):
                article["text"] = str(entry.encoded)
            elif hasattr(entry, "content"):
                article["text"] = str(entry.content)
            elif hasattr(entry, "summary"):
                article["text"] = str(entry.summary)
            elif hasattr(entry, "description"):
                article["text"] = str(entry.description)

            # id
            if hasattr(entry, "id"):
                article["id"] = str(entry["id"])
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article["text"][:30]

            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            # Certain websites need exceptions due to their
            # »programmers« being stupid.
            if "link" in feed:
                if "youtube.com" in feed["link"]:
                    article["uuid"] = ""

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % \
                        (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    try:
        feed["articles"] = sorted(articles, key=lambda article:
                article["updated"])
    except TypeError:
        # Mixing naive and timezone-aware datetimes makes sorting fail;
        # dump the offending timestamps for debugging.
        for article in articles:
            print(article["updated"])

    return feed
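
# Illustrative sketch, not part of the original module: running a minimal,
# made-up RSS 2.0 document through parseatomfeed(). parsexml() expects bytes,
# hence the b"" literal.
#
#   rss = b'''<rss version="2.0"><channel>
#       <title>Example</title>
#       <link>https://example.org/</link>
#       <item><title>Hello</title>
#             <link>https://example.org/1</link>
#             <pubDate>Wed, 01 Jan 2020 00:00:00 GMT</pubDate>
#             <description>First post.</description></item>
#       </channel></rss>'''
#   feed = parseatomfeed(rss)
#   # feed["title"] == "Example"
#   # feed["articles"][0]["link"] == "https://example.org/1"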

# Fetch a feed via HTTP(S), gopher or file:// and parse it according to its
# detected type. Returns a (status code, feed dict) tuple.
def fetch(uri):
    ftype = "xml"
    if "file://" in uri:
        fd = codecs.open(uri[7:], "r", "utf-8")
        fval = fd.read().encode("utf-8")
        fd.close()
        rcode = 200
    elif "gopher://" in uri:
        urls = urllib.parse.urlparse(uri, allow_fragments=False)
        if ":" in urls.netloc:
            (host, port) = urls.netloc.split(":")
            port = int(port)
        else:
            host = urls.netloc
            port = 70
        # Strip the leading "/<item type>" from the path to get the selector.
        if len(urls.path) > 2:
            if len(urls.query) > 0:
                selector = "%s?%s" % (urls.path[2:], urls.query)
            else:
                selector = urls.path[2:]
        else:
            selector = ""

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((host, port))
        s.send(("%s\r\n" % (selector)).encode("utf-8"))
        fd = s.makefile("r")
        fval = fd.read().encode("utf-8")
        s.close()
        rcode = 200
    else:
        fd = requests.get(uri, timeout=20,
                headers={"User-Agent": "Zeitungsschau/1.0"})
        fval = fd.content
        rcode = fd.status_code

        if "Content-Type" in fd.headers:
            if "application/json" in fd.headers["Content-Type"]:
                ftype = "json"

    # Fall back to the file extension to detect JSON Feed and twtxt feeds.
    if ftype == "xml":
        suri = uri.lower().rsplit(".", 1)
        if len(suri) > 1:
            if suri[-1] == "json":
                ftype = "json"
            elif suri[-1] == "txt":
                ftype = "twtxt"

    if ftype == "xml":
        rval = (rcode, parseatomfeed(fval))
    elif ftype == "twtxt":
        rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
    else:
        rval = (rcode, parsejsonfeed(fval.decode("utf-8")))

    if rval[1] != None:
        rval[1]["feeduri"] = uri

    return rval
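
# Illustrative sketch, not part of the original module: a tiny manual test
# driver for fetch(). The default URI is a made-up example; fetch() returns a
# (status code, feed dict) tuple, and the dict can be None if parsing failed.
if __name__ == "__main__":
    import sys

    uri = sys.argv[1] if len(sys.argv) > 1 else "https://example.org/feed.xml"
    (status, feed) = fetch(uri)
    print("status: %s" % (status))
    if feed != None:
        print("title: %s" % (feed.get("title", "")))
        for article in feed.get("articles", []):
            print("%s  %s" % (article["updated"], article.get("title", "")))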