]> code.delx.au - youtube-cgi/blob - youtube.cgi
fixes for youtube changes
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space ceiling (128 MiB) for the resource.setrlimit() call that is
# currently commented out in the entry point at the bottom of the file.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME type -> file extension used for the saved video.
# Streams whose MIME type is not listed here are ignored.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Stream quality label -> rank; a higher rank is preferred.
# Streams whose quality label is not listed here are ignored.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The single constructor argument is a human-readable reason; the CGI
    front end shows it to the visitor.
    """
41
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response (headers + XHTML).

    url -- text pre-filled into the URL input box.  It is HTML-escaped here
           because it normally echoes back whatever the visitor submitted.
    msg -- XHTML fragment shown above the form (e.g. an error paragraph).
           It is inserted as-is, so the caller must escape untrusted text.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet target; raises KeyError if either is missing.
    """
    # REQUEST_URI still carries the query string of the current request;
    # drop it so the bookmarklet can append its own "?url=".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The URL ends up inside a single-quoted JavaScript string literal, so a
    # raw apostrophe (HTTP_HOST is client-controlled) must not survive.
    script_url = script_url.replace("'", "%27")

    fields = {
        "{0}": msg,
        "{1}": cgi.escape(url, True),
        "{2}": cgi.escape(script_url, True),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # One pass over the template: text inserted for one placeholder is never
    # rescanned for the others.  str.format() is not usable because the CSS
    # and JavaScript in the template contain literal braces.
    sys.stdout.write(
        re.sub(r"\{[012]\}", lambda m: fields[m.group(0)], template)
    )
76
# Shared HTTP state: one cookie-aware opener used for every request, plus the
# previously requested URL, which is sent as the Referer of the next request.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
referrer = ""


def urlopen(url, offset=None):
    """Open url with the shared opener and return the response object.

    url    -- absolute URL; a scheme-relative "//host/path" URL is fetched
              over http.
    offset -- when truthy, ask the server for the bytes from this position
              onwards with a Range header.

    Each call sends the previously requested URL as Referer and then
    records url for the next call.

    Raises ValueError when the response carries a Content-Range header that
    is malformed or does not start at offset.  Errors raised by the opener
    itself (e.g. urllib2.HTTPError) propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>".
        try:
            unit, span = content_range.split()[:2]
            start = int(span.split("-")[0])
        except ValueError:
            unit, start = None, None
        # Checked with a real exception rather than assert so the guard
        # still runs under "python -O"; appending data from the wrong
        # position would corrupt a resumed download.
        if unit != "bytes" or start != offset:
            res.close()
            raise ValueError(
                "Unexpected Content-Range %r for requested offset %r"
                % (content_range, offset)
            )
    return res
105
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser in recovery mode.  The
    HTTP response is closed even when parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
111
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    params is a dict; its entries replace existing query parameters of the
    same name.  Every other component of the URL is kept.
    """
    parts = urlparse.urlsplit(url)
    query = urlparse.parse_qs(parts[3])
    query.update(params)
    # doseq=True: parse_qs produced lists, emit one key=value pair per item.
    new_query = urllib.urlencode(query, True)
    return urlparse.urlunsplit(
        (parts[0], parts[1], parts[2], new_query, parts[4])
    )
119
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map in place so it has "url" fields.

    In the old format each "itag" value has the stream URL embedded after
    a "url=" marker.  This pulls those URLs out into a separate "url" list
    and stores the re-encoded map back into
    player_config["args"]["url_encoded_fmt_stream_map"].  Returns None.
    """
    args = player_config["args"]
    fields = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    # Everything after the "url=" marker (4 characters long) is the URL.
    fields["url"] = [
        entry[entry.find("url=") + 4:] for entry in fields["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(fields, True)
127
def get_player_config(doc):
    """Return the player configuration embedded in a watch page, or None.

    Every line of every inline <script> is checked for one of three known
    assignments; the first line that matches is parsed as JSON and
    returned:

    * "yt.playerConfig = {...};"
    * "ytplayer.config = {...};"
    * "'PLAYER_CONFIG': {...}"  (old format; its stream map is converted
      with convert_from_old_itag() before returning)
    """
    for script in doc.xpath("//script"):
        text = script.text
        if not text:
            continue

        for line in text.split("\n"):
            if "yt.playerConfig =" in line:
                start = line.find("=")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    return json.loads(line[start + 1:end])

            if "ytplayer.config =" in line:
                start = line.find("ytplayer.config =")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    # 18 = length of the marker plus the character after it.
                    return json.loads(line[start + 18:end])

            if "'PLAYER_CONFIG': " in line:
                colon = line.find(":")
                if colon >= 0:
                    player_config = json.loads(line[colon + 1:])
                    convert_from_old_itag(player_config)
                    return player_config

    return None
150
def extract_function(output, script, func_name):
    """Collect the source of func_name and of the functions it calls.

    output    -- list that receives the extracted source snippets, the
                 requested function first; it is modified in place.
    script    -- JavaScript source text to search.
    func_name -- name of a function declared as "function <name>(".

    The body is taken to end at the first "}" after the declaration, so
    only functions without nested braces are copied completely.

    Names that look like calls but have no such declaration in script
    (builtins, keywords such as "if") are skipped, and each function is
    copied at most once, so mutually recursive functions terminate.
    """
    start = script.find("function " + func_name + "(")
    if start < 0:
        return
    end = script.find("}", start)
    if end < 0:
        return

    code = script[start:end + 1]
    if code in output:
        # Already collected through another call path.
        return
    output.append(code)

    # A callee is an identifier directly followed by "(" and preceded by
    # "=" or a space, but not part of a dotted method call.
    deps = set(re.findall(r"[^\.][= ]([\$0-9a-zA-Z]+)\(", code))
    deps.discard(func_name)
    # Sorted so the output order does not depend on set iteration order.
    for dep in sorted(deps):
        extract_function(output, script, dep)
161
def decode_signature(js_url, s):
    """Decode an encrypted stream signature using the player's own code.

    js_url -- URL of the player JavaScript.
    s      -- encrypted signature taken from the stream map.

    The player script is downloaded, the function that is applied to
    "<something>.s" is located, and that function plus its dependencies are
    run in an external "js" interpreter.  Returns the interpreter's output
    with surrounding whitespace removed.

    Raises Exception when the decoding function cannot be found or the
    interpreter exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()

    match = re.search(r"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script)
    if match is None:
        raise Exception("Could not find the signature function in %s" % js_url)
    func_name = match.group(1)

    codes = []
    extract_function(codes, script, func_name)
    # s comes from a remote page: json.dumps() turns it into a fully
    # escaped JavaScript string literal, so it cannot break out of the
    # argument and inject code.
    codes.append("console.log(%s(%s));" % (func_name, json.dumps(s)))

    # Argument list instead of shell=True: no shell is needed to start one
    # fixed program.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin and drains stdout together, which avoids a
    # pipe-buffer deadlock.
    signature, _ = p.communicate("\n".join(codes) + "\n")
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return signature.strip()
186
def get_best_video(player_config):
    """Pick the best supported stream and return (url, extension).

    Streams are read from player_config["args"]["url_encoded_fmt_stream_map"]
    (comma-separated, each entry query-string encoded).  Only entries whose
    quality is listed in QUALITIES and whose MIME type is listed in
    MIMETYPES are considered; the highest-ranked quality wins and, among
    equal ranks, the entry that appears last.

    The returned URL has the stream signature appended.  The signature is
    taken from the entry's "sig" field when present; otherwise its "s" field
    is decoded with the player script named in
    player_config["assets"]["js"].

    Returns (None, None) when no entry qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_data = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        url_data = urlparse.parse_qs(entry)
        try:
            mimetype = url_data["type"][0].split(";")[0]
            quality = url_data["quality"][0]
        except KeyError:
            # Blank or malformed entry (e.g. an empty stream map).
            continue

        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue
        if "url" not in url_data:
            continue

        rank = QUALITIES[quality]
        if best_quality is not None and rank < best_quality:
            continue

        best_data = url_data
        best_quality = rank
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    # Resolve the signature once, for the winner only: decoding downloads
    # the player script and starts an external interpreter.
    if "sig" in best_data:
        signature = best_data["sig"][0]
    else:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_data["s"][0])

    video_url = append_to_qs(best_data["url"][0], {"signature": signature})
    return video_url, best_extension
222
def sanitize_filename(filename):
    """Return filename made safe to use as a single path component.

    Leading and trailing whitespace is removed and every inner run of
    whitespace becomes one space; backslashes and slashes become "-" and
    NUL characters become spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, safe in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, safe)
    return cleaned
230
def get_video_url(doc):
    """Return (video_url, filename) for a parsed watch page.

    doc is the parsed HTML document of the page.  The filename is the
    sanitized page title plus the extension of the chosen stream; "video"
    is used when the page has no title.

    Raises VideoUnavailable when the page carries an "unavailable" message,
    when no player configuration is found, or when none of the offered
    streams is in a supported format.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Both callers immediately use the URL and the filename, so report
        # the problem here instead of handing back (None, None).
        raise VideoUnavailable("Could not find a video in a supported format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else "video"
    filename = sanitize_filename(title) + "." + extension

    return video_url, filename
249
def write_video(filename, video_data):
    """Stream an opened video to stdout as a CGI file-download response.

    filename   -- name offered to the browser; sent UTF-8 percent-encoded in
                  the Content-Disposition header.
    video_data -- open HTTP response for the video; its Content-Length, when
                  present, is passed on.  It is always closed before
                  returning, also when copying fails.
    """
    try:
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        content_length = video_data.info().getheader("Content-Length")

        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        # Leave the header out when the upstream length is unknown rather
        # than sending the literal text "None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
258
def cgimain():
    """CGI entry point: serve the form, or stream the requested video.

    Without a "url" request parameter the empty form is shown.  Otherwise
    the page at that URL is fetched and its best video is streamed back as
    a download; on failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # The URL comes straight from the visitor and urllib2 would also
        # open file: and ftp: URLs, so only let web URLs through.
        scheme = urlparse.urlsplit(url)[0]
        if scheme not in ("http", "https") and not url.startswith("//"):
            raise ValueError("unsupported URL scheme: %r" % scheme)

        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception as e:
        # Keep the visitor-facing message generic, but leave a trace on
        # stderr instead of discarding the cause.
        sys.stderr.write("youtube.cgi: %r\n" % (e,))
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
283
def pp_size(size):
    """Format a byte count for display, e.g. 1536 -> "1.50 KiB".

    The value is scaled by 1024 until it drops below 1024 or the largest
    unit (TiB) is reached, and is shown with two decimals.  Plain byte
    counts have an empty unit, so they end with a space ("512.00 ").
    """
    suffixes = ["", "KiB", "MiB", "GiB", "TiB"]
    size = float(size)
    for suffix in suffixes[:-1]:
        if size < 1024:
            return "%.2f %s" % (size, suffix)
        size /= 1024
    # Largest unit: never divide again, whatever is left is shown as-is.
    return "%.2f %s" % (size, suffixes[-1])
291
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks with a progress line.

    The progress line on stdout is redrawn in place at most about twice a
    second and shows bytes copied, content_length and the transfer rate
    since the previous redraw.  A final redraw and a newline are written
    when infile is exhausted.
    """
    def show(now, window_start, window_bytes, total):
        # Rate over the current measuring window; 0 if no time has passed.
        if now != window_start:
            rate = window_bytes / (now - window_start)
        else:
            rate = 0
        # Erase the current terminal line and return to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            show(now, window_start, window_bytes, total)
            # Start a new measuring window.
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        copied = len(chunk)
        window_bytes += copied
        total += copied

    # Final status, then a newline to finish the progress line.
    show(now, window_start, window_bytes, total)
    sys.stdout.write("\n")
325
def main():
    """Command-line entry point: download the video of the page in argv[1].

    The video is appended to a file named after the page title in the
    current directory, so an interrupted download is resumed from the size
    of the existing file.  When the connection drops, the download is
    restarted from the current position until the file is complete.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: video data must be written unmodified, and an existing
    # partial file is continued rather than overwritten.
    outfile = open(filename, "ab")
    try:
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # video, i.e. everything is already on disk.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length of a ranged response only covers the
                    # bytes still to come, so add what is already on disk.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off: finish the progress line and let
                    # the loop restart from the current file position.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
373
374
if __name__ == "__main__":
    # Memory cap, currently disabled:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment means we were started as a CGI script
    # by a web server; otherwise act as a command-line downloader.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
385