+++ /dev/null
-#!/usr/bin/env python
-
-import sys, os, re, itertools
-from wget_lib import *
-
-import twisted_wget
-from twisted_wget import reactor
-from Enum import enum
-
-# When true, the 4chan parser below prints the image URLs it found to stderr.
-DEBUG = True
-
-# Symbolic URL categories built with the project's `enum` helper.
-# NOTE(review): URL_TYPE is not referenced anywhere in the code shown here.
-URL_TYPE = enum('ImageBoard', 'HtmlPage', 'Image', 'Other')
-
-def addtolist(list, *regexStrs):
- def decorator(func):
- for regexStr in regexStrs:
- regex = re.compile(regexStr)
- list.append( (regex, func) )
- return func
- return decorator
-
-class Downloader(object):
- htmlParsers = []
- class ParserException(Exception):
- pass
-
- def __init__(self, url):
- self.url = url
- self.deferred = None
-
- def downloadFiles(self):
- # XXX: This is a major hack and needs to be cleaned
- def commonCallback(downloadObject):
- self.workingUrls.remove(downloadObject)
- self.activeHosts.remove(downloadObject.host)
- self.__scheduleDownloadLater()
- def successCallback(downloadObject, data):
- print 'Downloaded %s' % downloadObject.url
- commonCallback(downloadObject)
- downloadObject.data = data
- downloadObject.callback(downloadObject)
- self.doneUrls.add(downloadObject)
- def errorCallback(downloadObject, data):
- commonCallback(downloadObject)
- print 'Error: %s' % data
- def doDownload(file):
- print 'About to download "%s"' % file.url
- twisted_wget.downloadURL(file.url,
- successBack = lambda data: successCallback(file, data),
- errorBack = lambda data: errorCallback(file, data)
- )
- self.waitingUrls.remove(file)
- self.workingUrls.add(file)
- self.activeHosts.add(file.host)
-
-
- self.deferred = None
- for file in list(self.waitingUrls):
- if file.host not in self.activeHosts:
- doDownload(file)
-
- # Notes:
- # - image_object.data is a string containing all of the data
- # - image_object.url is a string containing the url where the data was downloaded from
- def _parseImageBoard(self, image_object):
- assert(image_object.data != None)
- assert(image_object.url != None)
-
- for parser_regex, parser in self.htmlParsers:
- if parser_regex.search(image_object.url):
- return parser(image_object)
- raise DownloadManager.ParserException('Could not find the correct parser')
-
- @addtolist(htmlParsers, '\.4chan\.org')
- def _parseImageBoard_4chan(self, image_object):
- import htmldata
- def __extractImageUrlsFromList(urllist):
- for url_elem in urllist:
- if url_elem.tag_name.upper() == 'A' and isImageURL(url_elem.url):
- yield url_elem.url
-
- # TODO: Extract a better filename from the list
- urllist = __extractImageUrlsFromList( htmldata.urlextract(image_object.data, image_object.url) )
- urllist = xRemoveDups(urllist)
- urllist = itertools.ifilter(
- lambda elem: elem.find('/thumb/') == -1,
- itertools.ifilter(lambda elem: elem.find('/src.cgi/') == -1, urllist)
- )
-
- if DEBUG:
- urllist, urllist_dup = itertools.tee(urllist)
- print >>sys.stderr, 'Got the following urls: \n\t%s' % '\n\t'.join(urllist_dup)
-
- for url in urllist:
- self.downloadImage(url, referer = image_object.url)
-
-def main(output_directory):
- dm = DownloadManager()
- for url in sys.argv[1:]:
- dm.recursiveDownloadImages(url)
-
- reactor.run()
-
-if __name__ == "__main__":
- output_directory = os.environ.get('WGET_IMAGEBOARD_DIRECTORY',
- os.path.join(os.environ['HOME'], 'Images_old', 'wget'))
- main(output_directory)