eomyidae (15243B)
#!/usr/bin/env python
# coding=utf-8
#
# See the LICENSE file for details.
#

"""eomyidae, a gopher crawler.

Fetches gopher menus, stores the raw responses below a cache directory,
honours robots.txt rules per host and keeps its queue state in pickle
files so a crawl can be resumed.
"""

import os
import sys
import getopt
import urllib.parse
import socket
import io
import pickle
import time
import hashlib
import errno
import random
import operator
import math
from multiprocessing import Pool
from datetime import datetime
from datetime import timedelta

def parseuri(uri):
	"""Split a gopher URI into (host, port, mtype, selector).

	port is the string found in the URI, or the integer 70 when the URI
	names no port; callers are expected to run it through int().
	mtype is the single item-type character after the first "/" and
	defaults to "1" (menu). selector is everything after the type
	character, including a "?query" part if there is one.
	"""
	urls = urllib.parse.urlparse(uri, allow_fragments=False)
	if ":" in urls.netloc:
		(host, port) = urls.netloc.split(":")[:2]
	else:
		host = urls.netloc
		port = 70

	mtype = urls.path[1] if len(urls.path) > 1 else "1"

	selector = ""
	if len(urls.path) > 2:
		selector = urls.path[2:]
		if len(urls.query) > 0:
			selector = "%s?%s" % (selector, urls.query)

	return (host, port, mtype, selector)

def poolgopher(req):
	"""Worker entry point for Pool.map().

	req is a list [uri, host, port, selector]. The fetched data is
	appended to that same list, which is then returned.
	"""
	data = gopher(req[0], req[1], req[2], req[3])
	req.append(data)
	return req

def gopher(uri=None, host=None, port=70, selector=""):
	"""Fetch one gopher selector and return the response as text.

	If uri is given, host, port and selector are taken from it and the
	other arguments are ignored. Any connection, send or read failure
	(including an unusable port or host name) yields "", so that one bad
	server never aborts the crawl. The socket is always closed.
	"""
	if uri is not None:
		(host, port, mtype, selector) = parseuri(uri)
	try:
		port = int(port)
	except (TypeError, ValueError):
		# Malformed port; treat like an unreachable server.
		return ""

	request = ("%s\r\n" % (selector)).encode("utf-8")
	try:
		with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
			s.settimeout(20)
			s.connect((host, port))
			# sendall(), because send() may transmit only a part.
			s.sendall(request)
			with s.makefile("rb") as fd:
				data = fd.read()
	except (OSError, OverflowError, UnicodeError):
		# OSError covers resolver errors, timeouts, resets, refused
		# connections and broken pipes. OverflowError is raised for
		# ports out of range, UnicodeError for host names the idna
		# codec rejects.
		return ""

	# Undecodable bytes are replaced, so this cannot raise.
	return data.decode(errors='replace')

def parsemenu(data):
	"""Parse a gopher menu into [mtype, description, selector, host, port]
	lists.

	Lines with fewer than four tab-separated fields are skipped. Parsing
	stops at the first line whose type character is ".".
	"""
	menu = []
	for line in data.split("\n"):
		line = line.strip()
		if len(line) < 1:
			continue

		mtype = line[0]

		# Last entry
		if mtype == ".":
			break

		elements = line[1:].split("\t")
		if len(elements) < 4:
			continue
		(description, selector, host, port) = elements[:4]
		menu.append([mtype, description, selector, host, port])

	return menu

def menu2text(menu):
	"""Return the descriptions of all menu entries, one per line.

	Entries whose description is not a string are left out.
	"""
	return "".join("%s\n" % (entry[1]) for entry in menu
			if isinstance(entry[1], str))

## Robots.txt
# https://en.wikipedia.org/wiki/Robots.txt
# # Comment
# User-agent: somebot
# Disallow: /path
# Allow: /path
# Crawl-delay: seconds
def parserobots(data):
	"""Parse robots.txt text into a list of [header, value] pairs.

	Headers are lowercased. Values keep their case, because the paths in
	Allow and Disallow lines are compared against selectors verbatim.
	A blank line becomes ["", ""], which marks the end of a user-agent
	group. Comments are removed; lines holding only a comment and lines
	without ":" are dropped.
	"""
	robots = []
	for line in data.split("\n"):
		line = line.strip()
		hadcomment = "#" in line
		if hadcomment:
			line = line.split("#", 1)[0].strip()
		if len(line) == 0:
			if not hadcomment:
				# Empty line, needed for bot-specific rules.
				robots.append(["", ""])
			continue
		if ":" not in line:
			continue

		(header, value) = line.split(":", 1)
		robots.append([header.strip().lower(), value.strip()])
	return robots

def adaptrobots(robotsdata):
	"""Reduce robots.txt text to the rules that apply to eomyidae.

	Returns a dict with the keys "allow" and "disallow" (lists of
	patterns), "other" (list of [header, value] for every other header
	in a matching group) and "empty" (True if all three lists are empty).
	A group matches if one of its User-agent lines names "eomyidae"
	(optionally followed by "/version") or "*".
	"""
	allowlines = []
	disallowlines = []
	otherlines = []

	iseomyidae = False
	# True while reading the User-agent lines that open a group, so that
	# several agents sharing one rule set are or-ed instead of the last
	# one winning.
	inagents = False
	for (header, value) in parserobots(robotsdata):
		if header == "user-agent":
			ua = value.lower().split("/")[0].strip()
			matches = ua in ("eomyidae", "*")
			iseomyidae = (iseomyidae or matches) if inagents else matches
			inagents = True
			continue
		inagents = False

		if header == "":
			iseomyidae = False
		elif not iseomyidae:
			continue
		elif header == "allow":
			allowlines.append(value)
		elif header == "disallow":
			disallowlines.append(value)
		else:
			otherlines.append([header, value])

	return {
		"allow": allowlines,
		"disallow": disallowlines,
		"other": otherlines,
		"empty": not (allowlines or disallowlines or otherlines),
	}

def mkpath(cachepath):
	"""Create the directory cachepath including missing parents."""
	os.makedirs(cachepath, exist_ok=True)

def mkopen(cachefile):
	"""Open cachefile for binary writing, creating or truncating it."""
	return open(cachefile, "wb")

def _hostcachepath(cachedir, host, port):
	"""Return the cache directory for host:port below cachedir.

	host comes from remote menus, so path separators and NUL bytes are
	replaced to keep the result inside cachedir.
	"""
	safehost = str(host)
	for badchar in ("/", "\\", "\0"):
		safehost = safehost.replace(badchar, "_")
	return "%s/%s:%s" % (cachedir, safehost, port)

def informserveradmin(uri, host=None, port=70):
	"""Send a selector to the server that tells its admin who we are.

	If host is not given, host and port are taken from uri. The answer
	of the server is discarded.
	"""
	if host is None:
		(host, port, mtype, selector) = parseuri(uri)
		port = int(port)

	# We are nice and inform before every robots.txt, how to contact us.
	gopher(host=host, port=port, selector="This is eomyidae, your "
			"friendly crawler. See "
			"gopher://gopherproject.org/1/eomyidae for "
			"more info. Have a nice day!")

def cacherobots(cachedir, uri, host=None, port=70, force=False, \
		filtercache=None):
	"""Return the robots.txt rules for a host, fetching them if needed.

	The rules have the shape adaptrobots() returns. Lookup order is the
	in-memory filtercache (a dict keyed by host, or None to disable it),
	then robots.pickle in the cache directory of the host, then the
	network. force=True always asks the server again. If host is not
	given, host and port are taken from uri.
	"""
	if host is None:
		(host, port, mtype, selector) = parseuri(uri)
	port = int(port)

	if filtercache is not None and host in filtercache:
		#print("Got filterlines from memory filtercache.")
		return filtercache[host]

	print("Getting robots for %s:%d" % (host, port))

	cachepath = _hostcachepath(cachedir, host, port)
	mkpath(cachepath)

	cacherobotstxt = "%s/robots.txt" % (cachepath)
	cacherobotspickle = "%s/robots.pickle" % (cachepath)
	filterlines = {"empty": True}
	if not os.path.exists(cacherobotstxt) or force:
		# Be nice.
		informserveradmin(uri=uri, host=host, port=port)

		robotsdata = gopher(host=host, port=port, selector="/robots.txt")
		print("Got new robots.txt.")
		print(robotsdata)
		with mkopen(cacherobotstxt) as robotstxtfd:
			robotstxtfd.write(robotsdata.encode())

		filterlines = adaptrobots(robotsdata)
		# Do not store if there is nothing, so we save I/O later.
		if not filterlines["empty"]:
			print("Storing filterlines.")
			storelistdb(cacherobotspickle, filterlines)
		elif os.path.exists(cacherobotspickle):
			# Rules from an earlier fetch no longer apply.
			os.remove(cacherobotspickle)
	elif os.path.exists(cacherobotspickle):
		#print("Loading filterlines from cache.")
		loaded = loadlistdb(cacherobotspickle)
		# A truncated pickle loads as []; treat that as "no rules".
		if isinstance(loaded, dict) and "empty" in loaded:
			filterlines = loaded

	#print(filterlines)
	if filtercache is not None:
		filtercache[host] = filterlines

	return filterlines

def selectorisallowed(filterlines, selector):
	"""Return True if the rules in filterlines permit selector.

	filterlines has the shape adaptrobots() returns. A selector is
	refused if a disallow pattern matches it, unless an allow pattern
	matches it too. Empty patterns are ignored. A "*" at the start or
	end of a pattern is a wildcard; without one the pattern is a prefix.
	"""
	if not filterlines or filterlines.get("empty", True):
		return True

	def robotsmatch(pattern, selector):
		if pattern == "*":
			return True
		leading = pattern.startswith("*")
		trailing = pattern.endswith("*")
		if leading and trailing:
			return pattern[1:-1] in selector
		if leading:
			return selector.endswith(pattern[1:])
		if trailing:
			return selector.startswith(pattern[:-1])
		return selector.startswith(pattern)

	def anymatch(patterns):
		# TODO: Should an empty pattern match everything?
		return any(robotsmatch(pattern, selector)
				for pattern in patterns if len(pattern) > 0)

	isallowed = not anymatch(filterlines.get("disallow", ()))
	# Allow rules win over disallow rules.
	if anymatch(filterlines.get("allow", ())):
		isallowed = True

	return isallowed

def loadselectorstxt(filename):
	"""Read a "|" separated text file; returns one field list per line.

	A missing file yields []. Fields are not stripped, so the last field
	of a line keeps its newline.
	"""
	if not os.path.exists(filename):
		return []

	with open(filename, "r") as fd:
		return [line.split("|") for line in fd]

def loadlist(filename):
	"""Read a text file into a list of its stripped lines.

	Empty lines and lines starting with "#" are skipped. A missing file
	yields [].
	"""
	if not os.path.exists(filename):
		return []

	listelems = []
	with open(filename, "r") as fd:
		for line in fd:
			line = line.strip()
			if len(line) == 0 or line[0] == "#":
				continue
			listelems.append(line)

	return listelems

def loadlistdb(filename):
	"""Load a pickled object from filename.

	Returns [] if the file does not exist or ends prematurely.
	"""
	if not os.path.exists(filename):
		return []

	# NOTE: pickle must only be used on files this program wrote itself;
	# unpickling data from an untrusted source can execute code.
	with open(filename, "rb") as fd:
		try:
			return pickle.load(fd)
		except EOFError:
			return []

def storelistdb(filename, listelems):
	"""Pickle listelems into filename, replacing earlier content."""
	with mkopen(filename) as fd:
		pickle.dump(listelems, fd)

def storerawdata(cachedir, uri, data, host=None, port=70):
	"""Store the raw response for uri in the cache directory of its host.

	The file is named after the SHA-256 hex digest of uri with the suffix
	".menu"; its first line is the uri, the rest is data. If host is not
	given, host and port are taken from uri.
	"""
	if host is None:
		(host, port, mtype, selector) = parseuri(uri)
		port = int(port)

	cachepath = _hostcachepath(cachedir, host, port)
	mkpath(cachepath)

	urihash = hashlib.sha256(uri.encode()).hexdigest()

	cachepath = "%s/%s.menu" % (cachepath, urihash)
	#print("Storing %s at %s" % (uri, cachepath))
	with mkopen(cachepath) as fd:
		fd.write(("%s\n" % (uri)).encode())
		fd.write(data.encode())

def usage(app):
	"""Print the command line synopsis to stderr and exit with status 1."""
	app = os.path.basename(app)
	print("usage: %s [-hor] [-b base] [-f blocklist] [-w n] [starturl]" % (app), file=sys.stderr)
	sys.exit(1)

def main(args):
	"""Run the crawler; args is the full argument vector (sys.argv).

	Options: -b base directory (state pickles and cache/ live there),
	-f blocklist file of URI prefixes, -o recount the per-host counters
	from the stored queues, -r do not keep robots rules in memory,
	-w number of worker processes, -h usage. Returns 0 once the queue
	is empty.
	"""
	try:
		opts, largs = getopt.getopt(args[1:], "hb:f:ow:r")
	except getopt.GetoptError as err:
		print(str(err))
		usage(args[0])

	blocklistfile = None
	blocklist = []

	base = "."
	starturi = None
	workernum = 1
	robotscache = {}
	forcehostscount = False
	for o, a in opts:
		if o == "-h":
			usage(args[0])
		elif o == "-b":
			base = a
		elif o == "-f":
			# Absolute, because the file is read again after the
			# chdir() below.
			blocklistfile = os.path.abspath(a)
			blocklist = loadlist(blocklistfile)
			print("blocklist: %s" % (blocklist))
		elif o == "-o":
			forcehostscount = True
		elif o == "-r":
			# Do not cache robots.txt in memory.
			robotscache = None
		elif o == "-w":
			try:
				# Pool() refuses less than one process.
				workernum = max(1, int(a))
			except ValueError:
				workernum = 1
		else:
			assert False, "unhandled option"

	# Resolve before chdir(), otherwise a relative base would put the
	# cache into base/base/cache.
	base = os.path.abspath(base)
	os.chdir(base)
	cachedir = "%s/cache" % (base)

	if len(largs) > 0:
		starturi = largs[0]

	knownuris = loadlistdb("knownuris.pickle")
	if knownuris == []:
		knownuris = {}
	lastlenknownuris = len(knownuris)

	def isblocked(uri):
		return any(uri.startswith(rule) for rule in blocklist)

	def addhostscount(host):
		hostscount[host] = hostscount.get(host, 0) + 1

	def subhostscount(host):
		if host in hostscount:
			hostscount[host] -= 1
			if hostscount[host] <= 0:
				del hostscount[host]

	def addhostscache(uri, host=None, port=70, selector="/"):
		try:
			if uri is not None and host is None:
				(host, port, mtype, selector) = parseuri(uri)
			port = int(port)
		except ValueError:
			return

		if uri in knownuris:
			print("ignored for queue: %s" % (uri))
			return
		if host == "":
			print("ignored for queue: %s" % (uri))
			return
		if isblocked(uri):
			print("blocked by filters: %s" % (uri))
			return

		queue = hostscache.setdefault(host, {}).setdefault("queue", {})

		filterrules = cacherobots(cachedir, uri, \
				host=host, \
				port=port, \
				filtercache=robotscache)
		if selectorisallowed(filterrules, selector):
			# Count only what really enters the queue, so that
			# hostscount mirrors the queue sizes.
			if uri not in queue:
				addhostscount(host)
			queue[uri] = None
			print("pushed to queue: %s" % (uri))
		else:
			print("blocked by robots: %s" % (uri))

	def getqueuelen():
		return sum(len(hostscache[host]["queue"]) for host in hostscache)

	hostscache = loadlistdb("hostscache.pickle")
	if hostscache == []:
		hostscache = {}
	hostscount = loadlistdb("hostscount.pickle")
	if hostscount == [] or forcehostscount:
		hostscount = {}
		for host in list(hostscache.keys()):
			queuelen = len(hostscache[host]["queue"])
			print("host = %s, queuelen = %d" % (host, queuelen))
			if queuelen == 0:
				del hostscache[host]
				continue
			hostscount[host] = queuelen

	def storestate():
		# Without nonlocal the reloaded list would stay local to this
		# function and isblocked() would never see it.
		nonlocal blocklist
		if blocklistfile is not None:
			blocklist = loadlist(blocklistfile)
			if len(blocklist) > 0:
				print("blocklist: %s" % (blocklist))
		print("################## Storing state to disc.")
		storelistdb("knownuris.pickle", knownuris)
		storelistdb("hostscache.pickle", hostscache)
		storelistdb("hostscount.pickle", hostscount)
		print("################## Storing state to disc done.")

	jobs = []
	if starturi is not None:
		#print("starturi = %s" % (starturi))
		if not isblocked(starturi):
			(starthost, startport, startmtype, startselector) = parseuri(starturi)
			addhostscache(starturi, \
					selector=startselector, \
					host=starthost, \
					port=startport)
			try:
				jobs.append([starturi, starthost, int(startport), startselector])
			except ValueError:
				# Please fix your URI.
				pass

	# Store state keeper.
	startnow = datetime.now()
	storedelta = timedelta(seconds=10) # time between state snapshots

	lastlenknownhosts = len(hostscache)
	lastlenuriqueue = getqueuelen()
	while lastlenuriqueue > 0:
		if len(jobs) < workernum:
			# Keep hostscache and hostscount in step. A host that
			# still has queued URIs but lost its counter would never
			# be scheduled and the loop would spin forever.
			for host in list(hostscache.keys()):
				queuelen = len(hostscache[host]["queue"])
				if queuelen == 0:
					del hostscache[host]
					hostscount.pop(host, None)
				elif host not in hostscount:
					hostscount[host] = queuelen
			for host in list(hostscount.keys()):
				if host not in hostscache:
					del hostscount[host]

			selhosts = sorted(hostscount.items(), \
					key=operator.itemgetter(1))[:workernum*2]

			# Give hosts with many selectors more jobs.
			hostjobs = {}
			for (selhost, count) in selhosts:
				# 10 ** x
				hostjobs[selhost] = max(1, \
					math.floor(math.log10(max(count, 1))))
			print("Queue Status: %s" % (hostjobs))

			for (selhost, count) in selhosts:
				seluris = hostscache[selhost]["queue"]
				while hostjobs[selhost] > 0:
					if len(seluris) == 0:
						break
					joburi = seluris.popitem()[0]
					if isblocked(joburi) or joburi in knownuris:
						# Left the queue without a fetch.
						subhostscount(selhost)
						continue
					(host, port, mtype, selector) = parseuri(joburi)
					# Compare by URI; the port may be a string
					# here and an int in the start job.
					if not any(job[0] == joburi for job in jobs):
						jobs.append([joburi, host, port, selector])
					hostjobs[selhost] -= 1

		print("Getting %d jobs." % (len(jobs)))

		dataresults = []
		if len(jobs) > 0:
			with Pool(processes=workernum) as pool:
				dataresults = pool.map(poolgopher, jobs)
		#data = gopher(host=host, port=port, selector=selector)
		jobs = []

		for dataresult in dataresults:
			(cururi, host, port, selector, data) = dataresult
			subhostscount(host)
			storerawdata(cachedir, cururi, data, host=host, port=port)
			menudata = parsemenu(data)
			#print(menudata)
			for mi in menudata:
				# Only menus so far.
				if mi[0] != "1":
					continue

				# Fix menu items with ports in hosts.
				if ":" in mi[3]:
					mi[3] = mi[3].split(":")[0]

				guri = "gopher://%s:%s/%s%s" % \
					(mi[3], mi[4], mi[0], mi[2])

				addhostscache(guri, host=mi[3], \
						port=mi[4], \
						selector=mi[2])

			print("Uri %s done." % (cururi))
			knownuris[cururi] = None

		lenuriqueue = getqueuelen()
		lenknownuris = len(knownuris)
		lenknownhosts = len(hostscache)
		print("> queue hosts = %d (%d) %s" % \
			(lenknownhosts, lenknownhosts -
				lastlenknownhosts, hostscache.keys()))
		print("> uri queue len = %d (%d)" % \
			(lenuriqueue, lenuriqueue - lastlenuriqueue))
		print("> visited uris = %d (%d)" % \
			(lenknownuris, lenknownuris - lastlenknownuris))
		lastlenknownuris = lenknownuris
		lastlenuriqueue = lenuriqueue
		lastlenknownhosts = lenknownhosts

		# TODO: Remove after debugging
		nowdelta = datetime.now() - startnow
		if nowdelta >= storedelta:
			storestate()
			startnow = datetime.now()

		time.sleep(0.2) # don't be too harsh on servers

		#break #oneshot

	# Save at end of even single shot.
	storestate()

	return 0

if __name__ == "__main__":
	sys.exit(main(sys.argv))