]> code.delx.au - bg-scripts/blobdiff - bin/new_wget_image_board.py
Removing a lot of unused libraries
[bg-scripts] / bin / new_wget_image_board.py
diff --git a/bin/new_wget_image_board.py b/bin/new_wget_image_board.py
deleted file mode 100755 (executable)
index 3ee536b..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os, re, itertools
-from wget_lib import *
-
-import twisted_wget
-from twisted_wget import reactor
-from Enum import enum
-
# Verbose-output toggle: when true, the extracted URL list is echoed to stderr.
DEBUG = True

# Coarse classification of URLs met while crawling.
# NOTE(review): URL_TYPE appears unused in this file — possibly consumed
# elsewhere, or leftover from a refactor; confirm before removing.
URL_TYPE = enum('ImageBoard', 'HtmlPage', 'Image', 'Other')
-
def addtolist(registry, *regexStrs):
    """Decorator factory: register the decorated function under regexes.

    Each pattern in *regexStrs* is compiled and appended to *registry* as a
    ``(compiled_regex, func)`` pair.  The decorated function is returned
    unchanged, so it still becomes a normal method when used inside a class
    body (see ``Downloader.htmlParsers``).

    Fix: the first parameter was named ``list``, shadowing the builtin;
    callers in this file pass it positionally, so the rename is safe.
    """
    def decorator(func):
        for regexStr in regexStrs:
            # Compile once at registration time, not on every dispatch.
            registry.append((re.compile(regexStr), func))
        return func
    return decorator
-
-class Downloader(object):
-       htmlParsers = []
-       class ParserException(Exception):
-               pass
-
-       def __init__(self, url):
-               self.url = url
-               self.deferred = None
-
-       def downloadFiles(self):
-               # XXX: This is a major hack and needs to be cleaned
-               def commonCallback(downloadObject):
-                       self.workingUrls.remove(downloadObject)
-                       self.activeHosts.remove(downloadObject.host)
-                       self.__scheduleDownloadLater()
-               def successCallback(downloadObject, data): 
-                       print 'Downloaded %s' % downloadObject.url
-                       commonCallback(downloadObject)
-                       downloadObject.data = data
-                       downloadObject.callback(downloadObject)
-                       self.doneUrls.add(downloadObject)
-               def errorCallback(downloadObject, data):
-                       commonCallback(downloadObject)
-                       print 'Error: %s' % data
-               def doDownload(file):
-                       print 'About to download "%s"' % file.url
-                       twisted_wget.downloadURL(file.url, 
-                                                                        successBack = lambda data: successCallback(file, data),
-                                                                        errorBack =   lambda data: errorCallback(file, data)
-                                                                       )
-                       self.waitingUrls.remove(file)   
-                       self.workingUrls.add(file)
-                       self.activeHosts.add(file.host)
-
-
-               self.deferred = None
-               for file in list(self.waitingUrls):
-                       if file.host not in self.activeHosts:
-                               doDownload(file)
-
-       # Notes:
-       #  - image_object.data is a string containing all of the data
-       #  - image_object.url is a string containing the url where the data was downloaded from
-       def _parseImageBoard(self, image_object):
-               assert(image_object.data != None)
-               assert(image_object.url != None)
-
-               for parser_regex, parser in self.htmlParsers:
-                       if parser_regex.search(image_object.url):
-                               return parser(image_object)
-               raise DownloadManager.ParserException('Could not find the correct parser')
-
-       @addtolist(htmlParsers, '\.4chan\.org')
-       def _parseImageBoard_4chan(self, image_object):
-               import htmldata
-               def __extractImageUrlsFromList(urllist):
-                       for url_elem in urllist:
-                               if url_elem.tag_name.upper() == 'A' and isImageURL(url_elem.url):
-                                       yield url_elem.url
-
-               # TODO: Extract a better filename from the list
-               urllist = __extractImageUrlsFromList( htmldata.urlextract(image_object.data, image_object.url) )
-               urllist = xRemoveDups(urllist)
-               urllist = itertools.ifilter(
-                                            lambda elem: elem.find('/thumb/') == -1,
-                                            itertools.ifilter(lambda elem: elem.find('/src.cgi/') == -1, urllist)
-                                           )
-
-               if DEBUG:
-                       urllist, urllist_dup = itertools.tee(urllist)
-                       print >>sys.stderr, 'Got the following urls: \n\t%s' % '\n\t'.join(urllist_dup)
-
-               for url in urllist:
-                       self.downloadImage(url, referer = image_object.url)
-
def main(output_directory):
    # Entry point: queue every command-line URL for recursive image
    # download, then run the Twisted reactor until stopped.
    # NOTE(review): output_directory is accepted but never used here —
    # presumably the download destination; confirm intended wiring.
    # NOTE(review): DownloadManager is not defined in this module (only
    # Downloader is) — as written this raises NameError; it looks like a
    # half-finished rename.
    dm = DownloadManager()
    for url in sys.argv[1:]:
        dm.recursiveDownloadImages(url)

    reactor.run()
-
if __name__ == "__main__":
    # Destination directory: the WGET_IMAGEBOARD_DIRECTORY environment
    # variable wins; otherwise fall back to ~/Images_old/wget.
    # The fallback is built unconditionally (HOME is read even when the
    # override is set), matching eager default-argument evaluation.
    fallback_dir = os.path.join(os.environ['HOME'], 'Images_old', 'wget')
    output_directory = os.environ.get('WGET_IMAGEBOARD_DIRECTORY', fallback_dir)
    main(output_directory)