]> code.delx.au - webdl/blob - common.py
f51ec5305f60e04eda6686e8cdcf480c188d9c28
[webdl] / common.py
1 from lxml import etree, html
2 import cookielib
3 import json
4 try:
5 import hashlib
6 except ImportError:
7 import md5 as hashlib
8 import os
9 import re
10 import shutil
11 import signal
12 import subprocess
13 import sys
14 import tempfile
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Best-effort SOCKS support: if the optional autosocks helper module is
# importable, let it route our traffic; otherwise continue without it.
try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass

# On-disk cache directory for fetched URLs (used by urlopen below).
CACHE_DIR = os.path.expanduser("~/.cache/webdl")
# Browser-like User-Agent sent with every HTTP request.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
29
class Node(object):
    """A node in the tree of downloadable content.

    Subclasses override fill_children() to lazily populate self.children,
    and override download() (setting can_download) on leaf items.
    """
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        # Children are populated lazily on first access.
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        # Overridden by subclasses that have children to offer.
        pass

    def download(self):
        # BUG FIX: the original did `raise NotImplemented`, which is a
        # TypeError (NotImplemented is a sentinel value, not an exception).
        # NotImplementedError is the intended "subclass must override" signal.
        raise NotImplementedError()
49
50
def load_root_node():
    """Build and return the root of the content tree, letting each
    supported site module attach its own subtree."""
    root_node = Node("Root")
    # Same modules, same order as before; each one registers itself
    # under the shared root.
    for module_name in ("iview", "sbs", "plus7", "brightcove"):
        __import__(module_name).fill_nodes(root_node)
    return root_node
67
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    """Make a title safe to use as a filename: drop non-ASCII characters,
    then strip anything outside the conservative whitelist above."""
    ascii_only = filename.encode("ascii", "ignore")
    return "".join(filter(valid_chars.__contains__, ascii_only))
73
# One shared cookie jar for every request made through this module.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
    """Open url with the shared cookie jar and spoofed User-Agent,
    optionally sending a Referer header."""
    request = urllib2.Request(url)
    request.add_header("User-Agent", USER_AGENT)
    if referrer:
        request.add_header("Referer", referrer)
    return urlopener.open(request)
82
def urlopen(url, max_age):
    """Fetch url, caching the body on disk for up to max_age seconds.

    Returns an open file object. A max_age <= 0 bypasses the cache
    entirely. Cache entries are keyed by the MD5 of the URL.
    """
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    filename = os.path.join(CACHE_DIR, hashlib.md5(url).hexdigest())
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            # Cache hit that is still fresh; binary mode since the body
            # may be arbitrary bytes (HTML, XML, video manifests...).
            return open(filename, "rb")

    src = _urlopen(url)
    try:
        # BUG FIX: was text mode "w"; cached bodies are raw bytes.
        dst = open(filename, "wb")
        try:
            shutil.copyfileobj(src, dst)
        except Exception:
            # BUG FIX: close before unlinking (was left open), and use a
            # bare `raise` so the original traceback is preserved
            # (`raise e` discarded it).
            dst.close()
            try:
                os.unlink(filename)
            except OSError:
                pass
            raise
        dst.close()
    finally:
        # BUG FIX: the source response was leaked if any of the above failed.
        src.close()

    return open(filename, "rb")
112
def grab_text(url, max_age):
    """Fetch url (honouring the cache) and return its body decoded as UTF-8."""
    handle = urlopen(url, max_age)
    raw = handle.read()
    text = raw.decode("utf-8")
    handle.close()
    return text
118
def grab_html(url, max_age):
    """Fetch url (honouring the cache) and parse it as UTF-8 HTML,
    recovering from malformed markup."""
    handle = urlopen(url, max_age)
    parser = html.HTMLParser(encoding="utf-8", recover=True)
    doc = html.parse(handle, parser)
    handle.close()
    return doc
124
def grab_xml(url, max_age):
    """Fetch url (honouring the cache) and parse it as UTF-8 XML,
    recovering from minor well-formedness errors."""
    handle = urlopen(url, max_age)
    parser = etree.XMLParser(encoding="utf-8", recover=True)
    doc = etree.parse(handle, parser)
    handle.close()
    return doc
130
def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    """Fetch url (honouring the cache) and parse it as JSON.

    skip_assignment strips a leading `var x =` prefix; skip_function
    strips a JSONP-style `fn(...)` wrapper. Otherwise the body is
    parsed as-is.
    """
    handle = urlopen(url, max_age)
    if skip_assignment:
        raw = handle.read()
        doc = json.loads(raw[raw.find("=") + 1:])
    elif skip_function:
        raw = handle.read()
        doc = json.loads(raw[raw.find("(") + 1:raw.rfind(")")])
    else:
        doc = json.load(handle)
    handle.close()
    return doc
146
def exec_subprocess(cmd):
    # Run cmd (an argv list) to completion.
    # Returns True on exit status 0, False on a non-zero exit or Ctrl-C,
    # and falls through returning None (falsy) if the tool won't launch.
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            print >>sys.stderr, cmd[0], "exited with error code:", ret
            return False
        else:
            return True
    except OSError, e:
        # Typically "No such file or directory": the external tool
        # (rtmpdump/avconv) is not installed.
        print >>sys.stderr, "Failed to run", cmd[0], e
    except KeyboardInterrupt:
        print "Cancelled", cmd
        # First ask the child nicely to terminate and wait for it; a
        # second Ctrl-C while waiting escalates to SIGKILL so we never
        # hang here with an unkillable child.
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            p.send_signal(signal.SIGKILL)
            p.wait()
        return False
167
168
def convert_flv_mp4(orig_filename):
    # Remux an FLV download into an MP4 container using avconv with
    # stream copy (no re-encode), deleting the FLV on apparent success.
    basename = os.path.splitext(orig_filename)[0]
    flv_filename = basename + ".flv"
    mp4_filename = basename + ".mp4"
    # avconv infers formats from file extensions, so make sure the
    # input actually ends in .flv before converting.
    if orig_filename != flv_filename:
        os.rename(orig_filename, flv_filename)
    print "Converting %s to mp4" % flv_filename
    cmd = [
        "avconv",
        "-i", flv_filename,
        "-acodec", "copy",
        "-vcodec", "copy",
        mp4_filename,
    ]
    if not exec_subprocess(cmd):
        return
    try:
        # Sanity check: a pure remux should barely change the size.
        # If the MP4 differs from the FLV by 5% or more, keep the FLV
        # around rather than trust a possibly-truncated output.
        flv_size = os.stat(flv_filename).st_size
        mp4_size = os.stat(mp4_filename).st_size
        if abs(flv_size - mp4_size) < 0.05 * flv_size:
            os.unlink(flv_filename)
        else:
            print >>sys.stderr, "The size of", mp4_filename, "is suspicious, did avconv fail?"
    except Exception, e:
        print "Conversion failed", e
194
def convert_filename(filename):
    """If filename is a .mp4/.flv whose content is really FLV (magic
    bytes "FLV\\x01"), remux it to a proper MP4."""
    if os.path.splitext(filename.lower())[1] not in (".mp4", ".flv"):
        return
    handle = open(filename)
    magic = handle.read(4)
    handle.close()
    if magic == "FLV\x01":
        convert_flv_mp4(filename)
202
203 def download_rtmp(filename, vbase, vpath, hash_url=None):
204 filename = sanify_filename(filename)
205 print "Downloading: %s" % filename
206 if vpath.endswith(".flv"):
207 vpath = vpath[:-4]
208 cmd = [
209 "rtmpdump",
210 "-o", filename,
211 "-r", vbase,
212 "-y", vpath,
213 ]
214 if hash_url is not None:
215 cmd += ["--swfVfy", hash_url]
216 if exec_subprocess(cmd):
217 convert_filename(filename)
218 return True
219 else:
220 return False
221
222 def download_urllib(filename, url, referrer=None):
223 filename = sanify_filename(filename)
224 print "Downloading: %s" % filename
225 try:
226 src = _urlopen(url, referrer)
227 dst = open(filename, "w")
228 while True:
229 buf = src.read(1024*1024)
230 if not buf:
231 break
232 dst.write(buf)
233 sys.stdout.write(".")
234 sys.stdout.flush()
235 print
236 except KeyboardInterrupt:
237 print "\nCancelled", url
238 return False
239 finally:
240 try:
241 src.close()
242 except:
243 pass
244 try:
245 dst.close()
246 except:
247 pass
248
249 convert_filename(filename)
250 return True
251
def natural_sort(l, key=None):
    """Sort l "naturally": digit runs compare numerically and the
    leading articles "a"/"the" are ignored. An optional key callable
    extracts the string to sort by."""
    ignore_list = ["a", "the"]
    def key_func(item):
        text = key(item) if key is not None else item
        text = text.lower()
        parts = []
        # Split into alternating non-digit / digit chunks.
        for chunk in re.split("([0-9]+)", text):
            chunk = chunk.strip()
            if chunk.isdigit():
                parts.append(int(chunk))
            else:
                parts.extend(word for word in chunk.split()
                             if word not in ignore_list)
        return parts

    return sorted(l, key=key_func)
270
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    Existing keys are overwritten; a value of None removes the key if
    present. List values become repeated parameters (urlencode doseq).
    """
    parts = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(parts[3])
    # Idiom fix: items()/`in` instead of the deprecated iteritems()/has_key()
    # (identical behavior, and has_key is gone in Python 3).
    for key, value in params.items():
        if value is not None:
            qs[key] = value
        elif key in qs:
            qs[key] = None
            del qs[key]
    parts[3] = urllib.urlencode(qs, True)
    return urlparse.urlunsplit(parts)
281 return url
282