--- /dev/null
+#!/usr/bin/env python
+
+import sys, os, re, itertools
+from wget_lib import *
+
+import twisted_wget
+from twisted_wget import reactor
+from Enum import enum
+
DEBUG = True  # when True, every extracted URL list is dumped to stderr

# Pseudo-enum of URL categories (Enum.enum builds a namespace of constants).
URL_TYPE = enum('ImageBoard', 'HtmlPage', 'Image', 'Other')
+
def addtolist(list, *regexStrs):
    """Decorator factory: register the decorated function under each regex.

    Compiles every pattern in *regexStrs* and appends a
    ``(compiled_regex, func)`` pair to *list* (note: the parameter name
    shadows the builtin ``list``; kept for interface compatibility).
    The decorated function is returned unchanged.
    """
    def decorator(func):
        list.extend((re.compile(pattern), func) for pattern in regexStrs)
        return func
    return decorator
+
+class Downloader(object):
+ htmlParsers = []
+ class ParserException(Exception):
+ pass
+
+ def __init__(self, url):
+ self.url = url
+ self.deferred = None
+
+ def downloadFiles(self):
+ # XXX: This is a major hack and needs to be cleaned
+ def commonCallback(downloadObject):
+ self.workingUrls.remove(downloadObject)
+ self.activeHosts.remove(downloadObject.host)
+ self.__scheduleDownloadLater()
+ def successCallback(downloadObject, data):
+ print 'Downloaded %s' % downloadObject.url
+ commonCallback(downloadObject)
+ downloadObject.data = data
+ downloadObject.callback(downloadObject)
+ self.doneUrls.add(downloadObject)
+ def errorCallback(downloadObject, data):
+ commonCallback(downloadObject)
+ print 'Error: %s' % data
+ def doDownload(file):
+ print 'About to download "%s"' % file.url
+ twisted_wget.downloadURL(file.url,
+ successBack = lambda data: successCallback(file, data),
+ errorBack = lambda data: errorCallback(file, data)
+ )
+ self.waitingUrls.remove(file)
+ self.workingUrls.add(file)
+ self.activeHosts.add(file.host)
+
+
+ self.deferred = None
+ for file in list(self.waitingUrls):
+ if file.host not in self.activeHosts:
+ doDownload(file)
+
+ # Notes:
+ # - image_object.data is a string containing all of the data
+ # - image_object.url is a string containing the url where the data was downloaded from
+ def _parseImageBoard(self, image_object):
+ assert(image_object.data != None)
+ assert(image_object.url != None)
+
+ for parser_regex, parser in self.htmlParsers:
+ if parser_regex.search(image_object.url):
+ return parser(image_object)
+ raise DownloadManager.ParserException('Could not find the correct parser')
+
+ @addtolist(htmlParsers, '\.4chan\.org')
+ def _parseImageBoard_4chan(self, image_object):
+ import htmldata
+ def __extractImageUrlsFromList(urllist):
+ for url_elem in urllist:
+ if url_elem.tag_name.upper() == 'A' and isImageURL(url_elem.url):
+ yield url_elem.url
+
+ # TODO: Extract a better filename from the list
+ urllist = __extractImageUrlsFromList( htmldata.urlextract(image_object.data, image_object.url) )
+ urllist = xRemoveDups(urllist)
+ urllist = itertools.ifilter(
+ lambda elem: elem.find('/thumb/') == -1,
+ itertools.ifilter(lambda elem: elem.find('/src.cgi/') == -1, urllist)
+ )
+
+ if DEBUG:
+ urllist, urllist_dup = itertools.tee(urllist)
+ print >>sys.stderr, 'Got the following urls: \n\t%s' % '\n\t'.join(urllist_dup)
+
+ for url in urllist:
+ self.downloadImage(url, referer = image_object.url)
+
def main(output_directory):
    """Queue a recursive image download for every URL on the command line,
    then run the twisted reactor until it is stopped.

    TODO(review): output_directory is accepted but never used — presumably
    it should be passed to DownloadManager (or chdir'd into); confirm.
    NOTE(review): DownloadManager is not defined in this file; it is
    presumably re-exported by `from wget_lib import *` — verify.
    """
    dm = DownloadManager()
    for url in sys.argv[1:]:
        dm.recursiveDownloadImages(url)

    reactor.run()
+
if __name__ == "__main__":
    # Download root: $WGET_IMAGEBOARD_DIRECTORY, falling back to
    # ~/Images_old/wget.
    output_directory = os.environ.get('WGET_IMAGEBOARD_DIRECTORY',
        os.path.join(os.environ['HOME'], 'Images_old', 'wget'))
    main(output_directory)