# code.delx.au - bg-scripts / bin / new_wget_image_board.py
# (exported from a web code viewer; commit message: "RandomBG: Make Listener non-writeable")
#!/usr/bin/env python

import sys, os, re, itertools
from wget_lib import *

import twisted_wget
from twisted_wget import reactor
from Enum import enum

# When True, the list of extracted image URLs is echoed to stderr.
DEBUG = True

# Classification of URLs encountered while crawling image boards.
URL_TYPE = enum('ImageBoard', 'HtmlPage', 'Image', 'Other')
13
def addtolist(list, *regexStrs):
    """Decorator factory that registers the decorated function in *list*.

    Every pattern in *regexStrs* is compiled and appended to *list* as a
    (compiled_regex, function) pair.  The decorated function is returned
    unchanged, so it remains directly callable.
    """
    def decorator(func):
        list.extend((re.compile(pattern), func) for pattern in regexStrs)
        return func
    return decorator
21
22 class Downloader(object):
23 htmlParsers = []
24 class ParserException(Exception):
25 pass
26
27 def __init__(self, url):
28 self.url = url
29 self.deferred = None
30
31 def downloadFiles(self):
32 # XXX: This is a major hack and needs to be cleaned
33 def commonCallback(downloadObject):
34 self.workingUrls.remove(downloadObject)
35 self.activeHosts.remove(downloadObject.host)
36 self.__scheduleDownloadLater()
37 def successCallback(downloadObject, data):
38 print 'Downloaded %s' % downloadObject.url
39 commonCallback(downloadObject)
40 downloadObject.data = data
41 downloadObject.callback(downloadObject)
42 self.doneUrls.add(downloadObject)
43 def errorCallback(downloadObject, data):
44 commonCallback(downloadObject)
45 print 'Error: %s' % data
46 def doDownload(file):
47 print 'About to download "%s"' % file.url
48 twisted_wget.downloadURL(file.url,
49 successBack = lambda data: successCallback(file, data),
50 errorBack = lambda data: errorCallback(file, data)
51 )
52 self.waitingUrls.remove(file)
53 self.workingUrls.add(file)
54 self.activeHosts.add(file.host)
55
56
57 self.deferred = None
58 for file in list(self.waitingUrls):
59 if file.host not in self.activeHosts:
60 doDownload(file)
61
62 # Notes:
63 # - image_object.data is a string containing all of the data
64 # - image_object.url is a string containing the url where the data was downloaded from
65 def _parseImageBoard(self, image_object):
66 assert(image_object.data != None)
67 assert(image_object.url != None)
68
69 for parser_regex, parser in self.htmlParsers:
70 if parser_regex.search(image_object.url):
71 return parser(image_object)
72 raise DownloadManager.ParserException('Could not find the correct parser')
73
74 @addtolist(htmlParsers, '\.4chan\.org')
75 def _parseImageBoard_4chan(self, image_object):
76 import htmldata
77 def __extractImageUrlsFromList(urllist):
78 for url_elem in urllist:
79 if url_elem.tag_name.upper() == 'A' and isImageURL(url_elem.url):
80 yield url_elem.url
81
82 # TODO: Extract a better filename from the list
83 urllist = __extractImageUrlsFromList( htmldata.urlextract(image_object.data, image_object.url) )
84 urllist = xRemoveDups(urllist)
85 urllist = itertools.ifilter(
86 lambda elem: elem.find('/thumb/') == -1,
87 itertools.ifilter(lambda elem: elem.find('/src.cgi/') == -1, urllist)
88 )
89
90 if DEBUG:
91 urllist, urllist_dup = itertools.tee(urllist)
92 print >>sys.stderr, 'Got the following urls: \n\t%s' % '\n\t'.join(urllist_dup)
93
94 for url in urllist:
95 self.downloadImage(url, referer = image_object.url)
96
def main(output_directory):
    """Queue every URL given on the command line, then run the reactor.

    NOTE(review): output_directory is currently unused here -- presumably it
    was meant to be handed to DownloadManager; confirm before relying on it.
    DownloadManager itself is not defined in this file's visible portion.
    """
    dm = DownloadManager()
    for url in sys.argv[1:]:
        dm.recursiveDownloadImages(url)

    reactor.run()
103
104 if __name__ == "__main__":
105 output_directory = os.environ.get('WGET_IMAGEBOARD_DIRECTORY',
106 os.path.join(os.environ['HOME'], 'Images_old', 'wget'))
107 main(output_directory)