]> code.delx.au - youtube-cgi/blob - youtube.cgi
handle videos with no signature
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space limit (128 MiB) for the resource.setrlimit() call in the
# script entry point.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME types that can be downloaded, mapped to the file extension
# used when saving them.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Quality labels mapped to a rank; a larger rank is preferred.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ("small", "medium", "large", "hd720", "hd1080"), 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The exception message is shown to the user by the CGI front end.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is HTML-escaped here
           because it normally comes straight from the query string.
    msg -- HTML fragment inserted above the form.  It is inserted verbatim,
           so callers must escape any user-derived text themselves.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet link.
    """
    # Function-scope import: available on Python 2 and 3 alike and keeps
    # this fix independent of the file-level import block.
    from xml.sax.saxutils import escape

    quote_entity = {'"': "&quot;"}

    # REQUEST_URI includes the current query string; drop it so the
    # bookmarklet does not end up as "...?url=old?url=new".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The script URL is placed inside a single-quoted JavaScript string that
    # itself sits in an HTML attribute: neutralise the JS-significant
    # characters first, then HTML-escape the result.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")
    script_url = escape(script_url, quote_entity)

    fields = (msg, escape(url, quote_entity), script_url)

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in one pass so that text such as "{1}"
    # appearing inside an inserted value is never substituted again.
    # str.format() is not usable: the CSS and JavaScript contain braces.
    page = re.sub(
        r"\{([012])\}",
        lambda match: fields[int(match.group(1))],
        template,
    )

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Shared HTTP state used by urlopen().
# URL of the previous request; sent as the Referer header of the next one.
referrer = ""
# A single cookie jar and opener are reused for every request, so cookies
# set by one response are sent with later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
80
def urlopen(url, offset=None):
    """Open url with the shared cookie-aware opener and return the response.

    url    -- URL to fetch; a scheme-relative "//host/path" is fetched over
              http.
    offset -- when truthy, ask the server for the bytes from this position
              onwards with a Range header.

    The previously opened URL is sent as the Referer header, and this URL is
    remembered for the next call.

    Raises IOError if the response does not start at the requested offset:
    either the Content-Range header is malformed or names a different start,
    or a Range request was answered without any Content-Range header (the
    body would then be the whole resource, which would corrupt a resumed
    download).  Errors from the opener itself propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Explicit checks instead of assert, which is stripped under "python -O".
    expected_start = offset or 0
    content_range = res.info().getheader("Content-Range")
    if content_range:
        try:
            unit, span = content_range.split()[:2]
            start = int(span.split("-")[0])
        except ValueError:
            res.close()
            raise IOError("Malformed Content-Range header: %r" % content_range)
        if unit != "bytes" or start != expected_start:
            res.close()
            raise IOError(
                "Unexpected Content-Range %r (wanted data from byte %d)"
                % (content_range, expected_start)
            )
    elif offset:
        res.close()
        raise IOError("Server ignored the Range request for byte %d" % offset)

    return res
105
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with parser error recovery enabled.  The
    HTTP response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
111
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    url    -- the URL to extend.
    params -- dict of parameters to add; a key that already exists in the
              URL is replaced.

    Existing parameters with an empty value ("key=") are preserved; without
    keep_blank_values they would silently disappear from the result.  The
    order of the parameters in the result is not guaranteed.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 of the split result is the query component.
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
119
def convert_from_old_itag(player_config):
    """Add a "url" list to the stream map of player_config, in place.

    The stream map is parsed as a query string; for every "itag" value the
    text following the first "url=" becomes the matching "url" entry, and the
    re-encoded map is stored back into player_config["args"].
    NOTE(review): presumably the legacy format embeds each stream URL inside
    its itag value -- confirm against a sample page.
    """
    marker = "url="
    args = player_config["args"]
    stream_map = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    stream_map["url"] = [
        entry[entry.find(marker) + len(marker):]
        for entry in stream_map["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(stream_map, True)
127
def get_player_config(doc):
    """Return the player configuration embedded in a page, or None.

    doc -- parsed document; every <script> element's text is scanned line by
           line for one of three assignment styles:

           * "yt.playerConfig = {...};"
           * "ytplayer.config = {...};"
           * "'PLAYER_CONFIG': {...}"  (passed through
             convert_from_old_itag() before being returned)

    The first line that matches wins and its JSON payload is decoded.
    Raises ValueError (from json.loads) if the payload is not valid JSON.
    """
    ytplayer_marker = "ytplayer.config ="

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            if "yt.playerConfig =" in line:
                p1 = line.find("=")
                p2 = line.rfind(";")
                if p1 >= 0 and p2 > 0:
                    return json.loads(line[p1 + 1:p2])
            if ytplayer_marker in line:
                p1 = line.find(ytplayer_marker)
                p2 = line.rfind(";")
                if p2 > 0:
                    # Slice right after the marker rather than at a fixed
                    # offset: json.loads() tolerates the leading space, and
                    # "ytplayer.config ={" (no space) no longer loses its
                    # opening brace.
                    return json.loads(line[p1 + len(ytplayer_marker):p2])
            if "'PLAYER_CONFIG': " in line:
                p1 = line.find(":")
                if p1 >= 0:
                    player_config = json.loads(line[p1 + 1:])
                    convert_from_old_itag(player_config)
                    return player_config
    return None
150
def extract_function(output, script, func_name):
    """Append the source of a JavaScript function and its callees to output.

    output    -- list that receives the extracted source snippets.
    script    -- full JavaScript source text to search.
    func_name -- name of the function to extract.

    A function's source is taken to run from "function NAME(" up to the
    first "}" that follows, so only functions without nested braces are
    extracted completely.  Callees are found with a regular expression and
    extracted recursively, in sorted order so the result is deterministic.

    Names that are not defined in script as "function NAME(" (built-ins,
    keywords picked up by the pattern) are skipped, and a function already
    present in output is not extracted again, so mutually recursive
    functions terminate.
    """
    start = script.find("function " + func_name + "(")
    if start < 0:
        return
    end = script.find("}", start)
    if end < 0:
        return

    code = script[start:end + 1]
    if code in output:
        return
    output.append(code)

    # The pattern also matches the declaration itself ("function NAME("),
    # hence the discard of func_name below.
    deps = set(re.findall(r"[^\.][= ]([\$0-9a-zA-Z]+)\(", code))
    deps.discard(func_name)
    for dep in sorted(deps):
        extract_function(output, script, dep)
161
def decode_signature(js_url, s):
    """Decode the scrambled signature s using the player's own JavaScript.

    js_url -- URL of the player script containing the decoding function.
    s      -- scrambled signature taken from the stream map.

    The decoding function is located in the script as the callee of a
    "name(x.s);" call, extracted together with its callees, and executed by
    the external "js" command; whatever that command prints is returned with
    surrounding whitespace removed.

    Raises Exception if the decoding function cannot be located or the "js"
    command exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()

    match = re.search(r"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script)
    if match is None:
        raise Exception("could not find the signature function in %s" % js_url)
    func_name = match.group(1)

    codes = []
    extract_function(codes, script, func_name)

    # s comes from a remote page: json.dumps() renders it as a quoted,
    # fully escaped string literal so it cannot break out of the call.
    call = "console.log(%s(%s));" % (func_name, json.dumps(s))
    program = "\n".join(codes + [call, ""])

    # Argument list instead of shell=True: no shell is needed to start "js".
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin and drains stdout together, so neither pipe
    # can fill up and block the other.
    stdout_data, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
186
def get_best_video(player_config):
    """Pick the best downloadable stream described by player_config.

    Returns (url, extension), or (None, None) when no stream has both a
    quality listed in QUALITIES and a MIME type listed in MIMETYPES.  Among
    streams of equal quality the one listed last wins.

    Entries lacking a "type", "quality" or "url" field (including the empty
    entry produced by an empty stream map) are skipped.  The signature is
    resolved only for the chosen stream, so the player script is looked up
    and fetched only when that stream carries a scrambled "s" signature.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        url_data = urlparse.parse_qs(entry)
        if not ("type" in url_data and "quality" in url_data and "url" in url_data):
            continue

        # The type field may carry parameters ("video/mp4; codecs=...").
        mimetype = url_data["type"][0].split(";")[0]
        extension = MIMETYPES.get(mimetype)
        quality = QUALITIES.get(url_data["quality"][0])
        if extension is None or quality is None:
            continue

        if best_quality is not None and quality < best_quality:
            continue

        best = url_data
        best_quality = quality
        best_extension = extension

    if best is None:
        return None, None

    video_url = best["url"][0]
    if "sig" in best:
        signature = best["sig"][0]
    elif "s" in best:
        # Only scrambled signatures need the player script.
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
226
def sanitize_filename(filename):
    """Return filename made safe to use as a single path component.

    Leading and trailing whitespace is removed, every run of whitespace
    becomes one space, path separators ("\\" and "/") become "-" and NUL
    characters become spaces.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for bad, good in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(bad, good)
    return cleaned
234
def get_video_url(doc):
    """Return (video_url, filename) for the video on a parsed page.

    doc -- parsed document of the video page.

    The filename is the sanitized page title plus the extension of the
    chosen stream; "video" is used when the page has no title.

    Raises VideoUnavailable when the page carries an unavailable message,
    when no player configuration is found, or when none of the streams is in
    a supported format.  The last case used to return (None, None), which
    neither caller handled.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        raise VideoUnavailable("Could not find a video in a supported format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else "video"
    filename = sanitize_filename(title) + "." + extension

    return video_url, filename
253
def write_video(filename, video_data):
    """Stream video_data to stdout as a CGI file-download response.

    filename   -- name offered to the browser (sent UTF-8 percent-encoded in
                  the Content-Disposition header).
    video_data -- open HTTP response to copy from; it is always closed, even
                  if writing fails.

    The upstream Content-Type is forwarded (falling back to
    application/octet-stream) because a CGI response with a body must carry
    one.  Content-Length is forwarded only when upstream supplied it, rather
    than emitting the literal text "None".
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
262
def cgimain():
    """CGI entry point: stream the requested video or show the form.

    Without a "url" query parameter the empty form is shown.  Otherwise the
    page is fetched and the best stream is sent to the client; on failure
    the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        # No URL submitted yet: show the form with an example.
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # The URL is untrusted; only let it reach the opener for web
        # schemes (the default opener would also read file:// URLs).
        if not url.startswith(("http://", "https://", "//")):
            raise ValueError("unsupported URL scheme")

        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            raise VideoUnavailable("Could not find a video in a supported format")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: anything else is reported generically.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
287
def pp_size(size):
    """Format a byte count as a string with two decimals and a binary unit.

    Values below 1024 get an empty unit (e.g. "512.00 "); the largest unit
    is GiB, so anything bigger is expressed in GiB (1 TiB gives
    "1024.00 GiB") instead of being divided once too often.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    for suffix in suffixes[:-1]:
        if size < 1024:
            return "%.2f %s" % (size, suffix)
        # Float divisor: does not depend on "from __future__ import division".
        size = size / 1024.0
    return "%.2f %s" % (size, suffixes[-1])
295
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, showing progress on stdout.

    content_length -- expected number of bytes, used only for display.
    infile         -- object to read from until read() returns nothing.
    outfile        -- object the chunks are written to.

    The status line is redrawn in place when more than half a second has
    passed since the last redraw, and once more at the end, followed by a
    newline.
    """
    def show_status(current, window_start, window_bytes, total_bytes):
        # Transfer rate over the time since the previous redraw.
        speed = 0
        if current != window_start:
            speed = window_bytes / (current - window_start)
        # Clear the terminal line and return to column 0 before redrawing.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            show_status(now, window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        copied = len(chunk)
        window_bytes += copied
        total_bytes += copied

    # Final status, then move off the status line.
    show_status(now, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
329
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is appended to a file named after the page title, so an
    interrupted download is resumed from the current size of that file.
    After a read error the download is restarted from the current position.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a video in a supported format\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: the payload is not text.
    outfile = open(filename, "ab")
    try:
        # Position explicitly at the end so tell() reports the existing size
        # before the first write.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length covers only the bytes from offset onwards,
                # so the full size includes what is already on disk.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Read failed mid-transfer: end the status line and retry
                # from wherever the file got to.
                print("")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # No progress at all: pause briefly before retrying.
                time.sleep(1)
            offset = position
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
377
378
if __name__ == "__main__":
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # A web server sets SCRIPT_NAME when running this as a CGI script;
    # otherwise behave as a command-line downloader.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
389