]> code.delx.au - youtube-cgi/blob - youtube.cgi
3530fed37524dae22219659f1900143f58005ac5
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space cap of 128 MiB; only referenced by the (currently commented
# out) resource.setrlimit call in the entry point at the bottom of the file.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Browser identity sent as the User-Agent header on every outgoing request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types we are willing to download, mapped to the file extension
# used for the saved video.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/x-flv": "flv",
    "video/mp4": "mp4",
}

# Quality labels found in the stream map, ranked so that a larger number
# means a better stream.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Raised when no downloadable video can be obtained from a watch page.

    The exception message is meant for the end user: the CGI front end shows
    it (HTML-escaped) inside the download form.
    """
41
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response on stdout.

    url -- text pre-filled into the URL input box.  It is escaped here, so
           callers pass the raw value.
    msg -- ready-made XHTML fragment shown above the form (for example an
           error paragraph).  It is inserted verbatim, so the caller is
           responsible for escaping any untrusted text inside it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address used by the bookmarklet; raises KeyError if either is missing.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    def attr(value):
        # Escape for use inside a double-quoted XML attribute.
        return escape(value, {'"': "&quot;"})

    # REQUEST_URI includes the query string of the current request; drop it
    # so the bookmarklet does not end up as ".../youtube.cgi?url=X?url=...".
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)

    fields = {
        "{0}": msg,
        "{1}": attr(url),
        # This value sits inside a single-quoted JavaScript string within the
        # href attribute, so a literal quote must not terminate that string.
        "{2}": attr(script_url).replace("'", "%27"),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in a single pass so that "{1}" or "{2}"
    # appearing inside an inserted value is never expanded a second time.
    # str.format cannot be used because of the literal CSS/JS braces.
    page = re.sub(r"\{[012]\}", lambda m: fields[m.group(0)], template)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# HTTP state shared by every request made through urlopen():
#  - cookies received from the server are kept in `cookiejar` and replayed
#    by `urlopener`;
#  - `referrer` holds the previously requested URL (empty before the first
#    request) and is sent as the Referer header of the next one.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
referrer = ""
80
def urlopen(url, offset=None):
    """Open `url` with the shared cookie-aware opener and return the response.

    The request carries the configured User-Agent, the previously requested
    URL as Referer (if any), and, when `offset` is a positive byte count, a
    "Range: bytes=<offset>-" header so a download can be resumed.

    Side effect: the module-level `referrer` is updated to `url`.

    Raises ValueError if the server answers with a Content-Range that is not
    in bytes or does not start at the requested offset; the response is
    closed first.  Errors from the opener (e.g. urllib2.HTTPError) propagate.
    """
    global referrer

    req = urllib2.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>".  Validate explicitly
        # instead of with assert, which disappears under "python -O".
        expected_start = offset or 0
        try:
            unit, span = content_range.split()[:2]
            start = int(span.split("-")[0])
        except ValueError:
            unit, start = None, None
        if unit != "bytes" or start != expected_start:
            res.close()
            raise ValueError(
                "Unexpected Content-Range %r for offset %r"
                % (content_range, offset)
            )
    return res
102
def parse_url(url):
    """Fetch `url` and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with lxml's error recovery enabled.  The
    HTTP response is closed even when parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
108
def append_to_qs(url, params):
    """Return `url` with the entries of `params` merged into its query string.

    params -- dict of query parameters; a key that already exists in the URL
              is replaced.  Values may be strings or lists of strings.

    Existing parameters with an empty value (e.g. "?a=1&b=") are preserved;
    parse_qs would drop them by default.  The order of parameters in the
    result is not guaranteed.
    """
    parts = list(urlparse.urlsplit(url))
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    # doseq=True so list values are emitted as repeated parameters.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
116
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map in place so it carries "url" fields.

    In the old layout each "itag" value embeds the stream address after a
    "url=" marker.  The text following that marker is copied into a new
    "url" list and the map is re-encoded back into
    player_config["args"]["url_encoded_fmt_stream_map"].  Returns None.
    """
    args = player_config["args"]
    stream_map = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    # Everything after the 4-character "url=" marker is the stream address.
    stream_map["url"] = [
        entry[entry.find("url=") + 4:] for entry in stream_map["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(stream_map, True)
124
def get_player_config(doc):
    """Find the player configuration embedded in a watch page's scripts.

    Every line of every inline <script> is checked for one of three known
    assignments; the JSON of the first one found is decoded and returned:

      * "yt.playerConfig = {...};"
      * "ytplayer.config = {...};"
      * "'PLAYER_CONFIG': {...}" (old layout; converted with
        convert_from_old_itag before being returned)

    doc -- parsed page offering xpath("//script") whose results have `.text`.

    Returns the decoded dict, or None if no script contains a configuration.
    json.loads errors (ValueError) propagate for a malformed line.
    """
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            if "yt.playerConfig =" in line:
                p1 = line.find("=")
                p2 = line.rfind(";")
                if p1 >= 0 and p2 > 0:
                    return json.loads(line[p1 + 1:p2])

            marker = "ytplayer.config ="
            if marker in line:
                # Start right after the marker rather than at a hard-coded
                # offset, so "ytplayer.config ={...}" (no space) keeps its
                # opening brace; json.loads ignores leading whitespace.
                start = line.find(marker) + len(marker)
                end = line.rfind(";")
                if end > 0:
                    return json.loads(line[start:end])

            if "'PLAYER_CONFIG': " in line:
                p1 = line.find(":")
                if p1 >= 0:
                    player_config = json.loads(line[p1 + 1:])
                    convert_from_old_itag(player_config)
                    return player_config
    return None
147
def extract_function(output, script, func_name):
    """Collect the source of JavaScript function `func_name` and its callees.

    output    -- list that receives the extracted "function name(...){...}"
                 snippets (appended in place, caller first).
    script    -- full JavaScript source text to search.
    func_name -- name of the function to extract.

    Any identifier that is called in the extracted body without a leading
    "." is treated as a dependency and extracted recursively.  Names that
    have no "function <name>(" definition in `script` (keywords such as
    "if", built-ins, ...) are skipped, and a snippet already present in
    `output` is not processed again, which also stops mutual recursion.
    """
    header = re.search(
        r"\bfunction\s+%s\s*\(" % re.escape(func_name), script
    )
    if header is None:
        return

    # Walk the braces after the header to find the one that closes the
    # function body, so nested blocks do not truncate the snippet.
    depth = 0
    end = -1
    for brace in re.compile(r"[{}]").finditer(script, header.end()):
        if brace.group() == "{":
            depth += 1
            continue
        depth -= 1
        if depth <= 0:
            if depth == 0:
                end = brace.end()
            break
    if end < 0:
        return

    code = script[header.start():end]
    if code in output:
        return
    output.append(code)

    deps = set(re.findall(r"[^\.]\b([a-zA-Z]+)\(", code))
    deps.discard(func_name)
    # Sorted only to make the extraction order deterministic.
    for dep in sorted(deps):
        extract_function(output, script, dep)
158
def decode_signature(js_url, s):
    """Decode the scrambled stream signature `s` using the player's own code.

    The player script at `js_url` is downloaded, the function that is called
    as "<name>(<obj>.s);" is located and extracted together with the
    functions it calls, and the result is run through the external "js"
    interpreter, which prints the decoded signature.

    Returns the interpreter's output with surrounding whitespace removed.
    Raises Exception if the signature function cannot be found or if the
    interpreter exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()

    match = re.search(r"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script)
    if match is None:
        raise Exception(
            "Could not find the signature function in %s" % js_url
        )
    func_name = match.group(1)

    codes = []
    extract_function(codes, script, func_name)

    # `s` comes from the remote page: json.dumps turns it into a properly
    # quoted and escaped JavaScript string literal instead of trusting it
    # inside hand-written quotes.
    call = "console.log(%s(%s));" % (func_name, json.dumps(s))
    program = "\n".join(codes + [call]) + "\n"

    # Argument list without a shell; communicate() feeds stdin and drains
    # stdout together, so neither pipe can fill up and block.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    stdout_data, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
183
def get_best_video(player_config):
    """Pick the highest-quality downloadable stream from a player config.

    player_config["args"]["url_encoded_fmt_stream_map"] is a comma-separated
    list of URL-encoded stream descriptions.  Only streams whose MIME type
    is in MIMETYPES and whose quality label is in QUALITIES are considered;
    when several share the best quality, the last one listed wins.
    Entries without "type", "quality" or "url" (including empty entries)
    are ignored.

    The chosen URL gets a "signature" query parameter: taken from "sig" if
    present, otherwise decoded from "s" with the player script named by
    player_config["assets"]["js"].  A stream with neither field is returned
    with its URL unchanged.

    Returns (url, extension), or (None, None) if no stream qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_fields = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        fields = urlparse.parse_qs(entry)
        try:
            mimetype = fields["type"][0].split(";")[0]
            quality_name = fields["quality"][0]
            fields["url"][0]
        except KeyError:
            continue

        if quality_name not in QUALITIES or mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is not None and quality < best_quality:
            continue

        best_fields = fields
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    if best_fields is None:
        return None, None

    # Resolve the signature for the winner only: decoding needs a download
    # of the player script plus an external interpreter run.
    video_url = best_fields["url"][0]
    if "sig" in best_fields:
        signature = best_fields["sig"][0]
    elif "s" in best_fields:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_fields["s"][0])
    else:
        signature = None

    if signature is not None:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
219
def sanitize_filename(filename):
    """Return `filename` made safe to use as a single path component.

    Leading and trailing whitespace is removed, every run of whitespace is
    collapsed to one space, backslashes and slashes become "-", and NUL
    characters become spaces.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape.
    collapsed = re.sub(r"\s+", " ", filename.strip())
    return (
        collapsed
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )
227
def get_video_url(doc):
    """Return (video_url, filename) for the video on a parsed watch page.

    doc -- parsed page (supports xpath()).

    The filename is the sanitized page title plus the extension of the
    chosen stream; "video" is used when the page has no usable title.

    Raises VideoUnavailable when the page carries an "unavailable-message"
    element, when no player configuration is found, or when none of the
    offered streams is downloadable.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Neither caller can do anything with a (None, None) result; report
        # the problem through the exception they already understand.
        raise VideoUnavailable("Could not find a downloadable video format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else ""
    filename = sanitize_filename(title) or "video"
    filename += "." + extension

    return video_url, filename
246
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI file download.

    filename   -- suggested download name (text); sent UTF-8 percent-encoded
                  in the Content-Disposition header.
    video_data -- open HTTP response for the video; always closed on return.

    The upstream Content-Type is forwarded ("application/octet-stream" when
    absent) because a CGI response with a body must declare one.
    Content-Length is forwarded only when upstream supplied a numeric value.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # Keep only the first physical line of the upstream value so a
        # folded header cannot break out of our own header block.
        content_type = httpinfo.getheader("Content-Type") or ""
        content_type = content_type.split("\n")[0].strip()
        if not content_type:
            content_type = "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)

        sys.stdout.write(
            "Content-Disposition: attachment; filename*=UTF-8''%s\r\n"
            % encoded_filename
        )

        content_length = (httpinfo.getheader("Content-Length") or "").strip()
        if content_length.isdigit():
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
255
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty download form is shown.
    Otherwise the page at that URL is fetched, the best video stream is
    located and streamed back as a download.  On failure the form is shown
    again with an error message; unexpected errors are additionally
    reported on stderr so they are not lost.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # e.args[0] is the message the exception was raised with.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception as e:
        # The user only gets a generic hint; keep the real cause on stderr.
        sys.stderr.write("youtube.cgi: request for %r failed: %r\n" % (url, e))
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
        return
280
def pp_size(size):
    """Format a byte count with two decimals and a binary unit suffix.

    Plain byte values carry an empty suffix (e.g. "12.00 "); larger values
    use KiB, MiB or GiB.  GiB is the largest unit, so anything beyond it is
    still expressed in GiB (2 TiB -> "2048.00 GiB").
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    # float() makes the division exact even without the file's
    # "from __future__ import division".
    size = float(size)
    # Only step up while a larger unit exists; the last suffix must not be
    # followed by another division.
    for suffix in suffixes[:-1]:
        if size < 1024:
            return "%.2f %s" % (size, suffix)
        size /= 1024
    return "%.2f %s" % (size, suffixes[-1])
288
def copy_with_progress(content_length, infile, outfile):
    """Copy `infile` to `outfile` while showing a progress line on stdout.

    content_length -- number of bytes expected; only used for display.
    infile         -- object with read(n); copied in 32 KiB chunks until
                      read() returns an empty result.
    outfile        -- object with write().

    The status line (bytes copied / expected, transfer rate) is redrawn in
    place at most about twice per second, once more after the copy ends,
    and is then terminated with a newline.
    """
    def show_progress(now, window_start, window_bytes, copied):
        # Rate covers the bytes copied since the last redraw.
        elapsed = now - window_start
        rate = 0
        if elapsed != 0:
            rate = window_bytes / elapsed
        # ESC[2K clears the current terminal line, \r returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            show_progress(now, window_start, window_bytes, copied)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_size = len(chunk)
        window_bytes += chunk_size
        copied += chunk_size

    # Final status followed by the newline that ends the progress line.
    show_progress(now, window_start, window_bytes, copied)
    sys.stdout.write("\n")
322
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is saved in the current directory under a name derived from
    the page title.  An existing file of that name is treated as a partial
    download and resumed with an HTTP Range request; interrupted transfers
    are restarted from the current file size until the file is complete.

    Exits with status 1 when no URL is given or no video can be found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write(
            "Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0]
        )
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)
    sys.stdout.write("Downloading %s\n" % filename.encode("utf-8"))

    # Binary append: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode is
        # platform dependent; seek to the end to get the real file size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            sys.stdout.write("Resuming download from %s\n" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # video, i.e. everything is already on disk.
                if e.code == 416:
                    sys.stdout.write("File is complete!\n")
                    break
                raise

            try:
                headers = video_data.info()
                if offset and headers.getheader("Content-Range") is None:
                    # The server ignored the Range request and is sending
                    # the whole video again: discard the partial data.
                    outfile.seek(0)
                    outfile.truncate()
                    offset = 0

                content_length = int(headers.getheader("Content-Length"))
                if total_size is None:
                    # A ranged response only announces the remaining bytes.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off; end the progress line and let the
                    # loop below resume from what has been written so far.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # No progress at all: pause briefly before retrying.
                time.sleep(1)
            offset = position
            sys.stdout.write("Restarting download from %s\n" % pp_size(offset))
    finally:
        outfile.close()
370
371
if __name__ == "__main__":
    # Memory cap, currently disabled:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment is taken to mean we were started by a
    # web server as a CGI script; otherwise run as a command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            sys.stdout.write("\nExiting...\n")
            sys.exit(1)
382