# code.delx.au - bg-scripts - bin/new_wget_image_board.py
# blob 3ee536bbf52ebaaea7c6befe0cf3deaae28e5053
3 import sys
, os
, re
, itertools
7 from twisted_wget
import reactor
# Categories a fetched URL can fall into; drives which handler processes it.
# NOTE(review): `enum` is not defined in the visible lines — presumably a
# project helper (this Python 2 code predates the stdlib enum module);
# confirm its origin against the full file.
URL_TYPE = enum('ImageBoard', 'HtmlPage', 'Image', 'Other')
def addtolist(list, *regexStrs):
    """Register one or more URL regexes, each paired with a handler, onto *list*.

    NOTE(review): this view is incomplete. `addtolist` is used below as a
    decorator factory (``@addtolist(htmlParsers, ...)``), and `func` is never
    bound in the visible lines — the original almost certainly wraps this
    loop in an inner ``def ...(func):`` closure (original lines 15/19-21 are
    missing from this extract); confirm against the full file.
    Also note the parameter name `list` shadows the builtin.
    """
    for regexStr in regexStrs:
        regex = re.compile(regexStr)
        list.append( (regex, func) )
# NOTE(review): the class body below is incomplete in this extracted view
# (original line numbers jump); later code also refers to a class named
# `DownloadManager` — confirm whether that is this class under another name.
class Downloader(object):
    # Raised when no registered parser matches a downloaded page's URL.
    # (Exception body not visible in this view.)
    class ParserException(Exception):

    # Constructor body (original lines 28-30) not visible in this view;
    # presumably records `url` and initialises the url/host bookkeeping sets
    # used by downloadFiles.
    def __init__(self, url):
def downloadFiles(self):
    """Start downloads for waiting URLs whose host is not already busy.

    NOTE(review): this view of the method is incomplete — several original
    lines are missing (e.g. the closing paren of the downloadURL call, and
    apparently an enclosing ``def ...(file):`` around the per-file section,
    since `file` is referenced before the ``for file in ...`` loop below).
    The indentation shown here is a best-effort reconstruction — confirm
    against the full file.
    """
    # XXX: This is a major hack and needs to be cleaned
    def commonCallback(downloadObject):
        # Shared bookkeeping for both outcomes: the object is no longer in
        # flight, its host is free again, so try to schedule more work.
        self.workingUrls.remove(downloadObject)
        self.activeHosts.remove(downloadObject.host)
        self.__scheduleDownloadLater()
    def successCallback(downloadObject, data):
        print 'Downloaded %s' % downloadObject.url
        commonCallback(downloadObject)
        # Stash the payload on the object, then hand it to its callback
        # before recording it as done.
        downloadObject.data = data
        downloadObject.callback(downloadObject)
        self.doneUrls.add(downloadObject)
    def errorCallback(downloadObject, data):
        commonCallback(downloadObject)
        print 'Error: %s' % data
    # --- per-file section; its enclosing definition (original line 46) is
    # --- missing from this view ---
    print 'About to download "%s"' % file.url
    twisted_wget.downloadURL(file.url,
        successBack = lambda data: successCallback(file, data),
        errorBack = lambda data: errorCallback(file, data)
    # Mark the file as in flight and its host as busy.
    self.waitingUrls.remove(file)
    self.workingUrls.add(file)
    self.activeHosts.add(file.host)
    # One download per host at a time: only start files whose host is idle.
    # (Loop body beyond this test — original lines 60+ — not visible here.)
    for file in list(self.waitingUrls):
        if file.host not in self.activeHosts:
def _parseImageBoard(self, image_object):
    """Dispatch a downloaded image-board page to its site-specific parser.

    - image_object.data is a string containing all of the data
    - image_object.url is a string containing the url where the data was
      downloaded from

    Returns whatever the matching parser returns.
    Raises ParserException when no registered parser matches the URL.
    """
    assert image_object.data is not None
    assert image_object.url is not None
    # htmlParsers holds (compiled_regex, parser) pairs, populated by the
    # @addtolist(htmlParsers, ...) decorators on the site-specific parsers.
    for parser_regex, parser in self.htmlParsers:
        if parser_regex.search(image_object.url):
            return parser(image_object)
    # BUG FIX: the original raised DownloadManager.ParserException, but the
    # visible enclosing class is named Downloader (a likely NameError);
    # self.ParserException resolves correctly whatever the class is called.
    raise self.ParserException('Could not find the correct parser')
@addtolist(htmlParsers, '\.4chan\.org')
def _parseImageBoard_4chan(self, image_object):
    """Extract full-size image URLs from a 4chan page and queue each for download.

    NOTE(review): incomplete in this extracted view — the inner `if` body,
    the closing paren of the ifilter chain, and the `for` loop that binds
    `url` before the final downloadImage call are all on missing lines.
    Python 2 code (itertools.ifilter, `print >>`).
    """
    def __extractImageUrlsFromList(urllist):
        # Presumably yields the URL of every <A> element that links to an
        # image; the conditional's body is not visible here — confirm.
        for url_elem in urllist:
            if url_elem.tag_name.upper() == 'A' and isImageURL(url_elem.url):
    # TODO: Extract a better filename from the list
    urllist = __extractImageUrlsFromList( htmldata.urlextract(image_object.data, image_object.url) )
    urllist = xRemoveDups(urllist)
    # Filter out thumbnail and CGI links, keeping only full-size images.
    # (Closing paren of this call — original lines 88-90 — is missing here.)
    urllist = itertools.ifilter(
        lambda elem: elem.find('/thumb/') == -1,
        itertools.ifilter(lambda elem: elem.find('/src.cgi/') == -1, urllist)
    # tee the iterator so the debug print does not consume the main copy.
    urllist, urllist_dup = itertools.tee(urllist)
    print >>sys.stderr, 'Got the following urls: \n\t%s' % '\n\t'.join(urllist_dup)
    # NOTE(review): `url` is presumably bound by a missing
    # ``for url in urllist:`` line (original lines 93-94) — confirm.
    self.downloadImage(url, referer = image_object.url)
def main(output_directory):
    """Queue every URL given on the command line for recursive image download.

    NOTE(review): the lines after the loop (original 101-103) are missing
    from this view — presumably starting the twisted reactor and/or using
    `output_directory`, which is unused in the visible lines. Also,
    `DownloadManager` is not defined in this view (the visible class is
    named `Downloader`) — confirm against the full file.
    """
    dm = DownloadManager()
    for url in sys.argv[1:]:
        dm.recursiveDownloadImages(url)
if __name__ == "__main__":
    # Destination directory comes from $WGET_IMAGEBOARD_DIRECTORY when set,
    # otherwise defaults to ~/Images_old/wget.
    default_directory = os.path.join(os.environ['HOME'], 'Images_old', 'wget')
    output_directory = os.environ.get('WGET_IMAGEBOARD_DIRECTORY', default_directory)
    main(output_directory)