]> code.delx.au - youtube-cgi/blob - youtube.cgi
fed997a42a2c20e42c11b740b5f1c4ba35065c34
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space ceiling handed to resource.setrlimit() when run as a script.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Browser identity sent as the User-Agent header on outgoing requests.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types that are accepted, mapped to the saved file's extension.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Known quality labels mapped to a rank; a larger rank is preferred.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Raised when a page offers no downloadable video.

    The single constructor argument is a human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response on stdout.

    Args:
        url: Text pre-filled into the URL input box. It is treated as
            untrusted and HTML-escaped before being embedded.
        msg: Trusted XHTML fragment (for example an error paragraph) that is
            inserted verbatim above the form. Callers must escape any user
            data they place inside it.
    """
    host = os.environ.get("HTTP_HOST", os.environ.get("SERVER_NAME", "localhost"))
    # REQUEST_URI includes the query string; drop it so the bookmarklet does
    # not end up with two "?url=" parts when the form is shown after an error.
    path = os.environ.get("REQUEST_URI", "").split("?", 1)[0]
    script_url = "http://%s%s" % (host, path)

    # The script URL is placed inside a single-quoted JavaScript string that
    # itself lives in an HTML attribute: escape for JavaScript first, then
    # for the attribute.
    js_script_url = script_url.replace("\\", "\\\\").replace("'", "\\'")

    fields = {
        "0": msg,
        "1": cgi.escape(url, True),
        "2": cgi.escape(js_script_url, True),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css"/>
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
    <p>This page will let you easily download YouTube videos to watch offline. It
    will automatically grab the highest quality version.</p>
    <div><input type="text" name="url" value="{1}"/></div>
    <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute all placeholders in a single pass so that text inserted for
    # one placeholder is never rescanned for another.
    sys.stdout.write(
        re.sub(r"\{([012])\}", lambda m: fields[m.group(1)], template)
    )
76
# Referer value for the next outgoing request; urlopen() rewrites it on
# every call.
referrer = ""

# A single cookie jar and opener shared by every request this script makes.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
80
def urlopen(url, offset=None):
    """Open *url* through the shared cookie-aware opener.

    The previously requested URL is sent as the Referer header, and the
    module-level ``referrer`` is then updated to *url*.

    Args:
        url: Address to fetch.
        offset: Optional byte offset. When truthy, a ``Range`` header is sent
            asking for the data from that offset to the end.

    Returns:
        The open response object.

    Raises:
        IOError: If the response does not start at the requested offset
            (missing, malformed or mismatching Content-Range header).
        urllib2.URLError: Propagated from the opener (includes HTTPError).
    """
    global referrer
    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Explicit checks rather than assert: asserts vanish under "python -O",
    # and appending data from the wrong position would corrupt the file.
    problem = None
    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>"
        tokens = content_range.split()
        if len(tokens) < 2 or tokens[0] != "bytes":
            problem = "Unexpected Content-Range header: %r" % content_range
        else:
            try:
                start = int(tokens[1].split("-")[0])
            except ValueError:
                problem = "Unexpected Content-Range header: %r" % content_range
            else:
                if start != (offset or 0):
                    problem = "Response starts at byte %d, expected %d" % (
                        start, offset or 0)
    elif offset:
        # No Content-Range means the server sent the body from the beginning.
        problem = "Server ignored the Range request for offset %d" % offset

    if problem is not None:
        res.close()
        raise IOError(problem)
    return res
102
def parse_url(url):
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to download.

    Returns:
        The document tree produced by ``html.parse`` using a UTF-8,
        error-recovering HTML parser.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        # Release the connection even when parsing raises.
        f.close()
108
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Args:
        url: The URL to extend.
        params: Mapping of parameter name to value (or list of values). A
            name already present in the URL is overwritten.

    Returns:
        The rebuilt URL string.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 is the query component. keep_blank_values makes sure that
    # parameters already on the URL with an empty value ("a=") survive the
    # round trip instead of being silently dropped.
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    # doseq=True: parse_qs yields lists, emit one pair per list element.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
116
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map so that it carries ``url`` fields.

    In the old format each ``itag`` value embeds the stream address after a
    ``url=`` marker. The addresses are extracted into a ``url`` field and the
    re-encoded map is stored back into *player_config* in place.

    Args:
        player_config: Player configuration dict; its
            ``["args"]["url_encoded_fmt_stream_map"]`` entry is read and
            overwritten.

    Returns:
        None.
    """
    args = player_config["args"]
    url_data = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])

    urls = []
    for itag_url in url_data.get("itag", []):
        pos = itag_url.find("url=")
        if pos < 0:
            # No embedded address; slicing from find()'s -1 would yield junk.
            continue
        urls.append(itag_url[pos + 4:])
    url_data["url"] = urls

    args["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
124
def get_player_config(doc):
    """Locate and decode the player configuration embedded in a page.

    Every line of every ``<script>`` element is inspected for one of two
    markers: ``yt.playerConfig =`` (JSON between the first "=" and the last
    ";") or ``'PLAYER_CONFIG': `` (JSON after the first ":", which is then
    passed through convert_from_old_itag).

    Args:
        doc: Parsed page offering an ``xpath`` method.

    Returns:
        The decoded configuration, or None when no marker is found.
    """
    # NOTE(review): this copy of the file lost its indentation; the final
    # return is assumed to belong to the PLAYER_CONFIG branch (first match
    # wins) - confirm against the original.
    for script in doc.xpath("//script"):
        source = script.text
        if not source:
            continue
        for line in source.split("\n"):
            if "yt.playerConfig =" in line:
                start = line.find("=")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    return json.loads(line[start + 1:end])
            if "'PLAYER_CONFIG': " in line:
                colon = line.find(":")
                if colon >= 0:
                    config = json.loads(line[colon + 1:])
                    convert_from_old_itag(config)
                    return config
    return None
142
def get_best_video(player_config):
    """Pick the highest-ranked downloadable stream from a player config.

    The comma-separated ``url_encoded_fmt_stream_map`` is scanned; entries
    whose quality is not in QUALITIES or whose MIME type is not in MIMETYPES
    are ignored, as are entries lacking a url, type or quality field. On
    equal rank the earliest entry wins.

    Args:
        player_config: Player configuration dict containing
            ``["args"]["url_encoded_fmt_stream_map"]``.

    Returns:
        ``(url, extension)`` for the best stream, or ``(None, None)`` when
        no entry qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_url = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        url_data = urlparse.parse_qs(entry)
        try:
            video_url = url_data["url"][0]
            mimetype = url_data["type"][0].split(";")[0]
            quality = url_data["quality"][0]
        except KeyError:
            # Empty or malformed entry: skip it instead of aborting the scan.
            continue

        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue

        rank = QUALITIES[quality]
        if best_quality is not None and rank <= best_quality:
            continue

        # The signature is only merged in for a new winner. Entries without
        # a "sig" field are used as-is - presumably their URL is already
        # complete; verify against current stream maps.
        if "sig" in url_data:
            video_url = append_to_qs(video_url, {"signature": url_data["sig"][0]})

        best_url = video_url
        best_quality = rank
        best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
171
def sanitize_filename(filename):
    """Turn arbitrary text into a name that is safe to use as a file name.

    Leading/trailing whitespace is removed, each whitespace run becomes a
    single space, path separators ("/" and "\\") become "-" and NUL
    characters become spaces.

    Args:
        filename: The raw text, e.g. a page title.

    Returns:
        The cleaned-up name.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r"\s+", " ", filename.strip())
    return (
        collapsed
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )
179
def get_video_url(doc):
    """Work out the download address and file name for a parsed video page.

    Args:
        doc: Parsed page offering an ``xpath`` method.

    Returns:
        ``(video_url, filename)``; the file name is the sanitized page title
        plus the stream's extension. ``(None, None)`` when the player config
        lists no acceptable stream.

    Raises:
        VideoUnavailable: If the page shows an "unavailable" message or no
            player configuration can be found.
    """
    notice = doc.xpath("//div[@id='unavailable-message']/text()")
    if notice:
        raise VideoUnavailable(notice[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if video_url:
        page_title = doc.xpath("/html/head/title/text()")[0]
        return video_url, sanitize_filename(page_title) + "." + extension
    return None, None
198
def write_video(filename, video_data):
    """Stream a fetched video to stdout as a CGI file download.

    Args:
        filename: Name offered to the browser for saving (encoded as UTF-8
            and percent-quoted in the Content-Disposition header).
        video_data: Open upstream response; it is always closed before this
            function returns.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response that has a body must carry a Content-Type; pass the
        # upstream one through, with a generic fallback.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        # Leave the header out when the length is unknown rather than
        # sending the literal text "None".
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
207
def cgimain():
    """Handle one CGI request.

    Without a ``url`` query parameter the empty form is shown. Otherwise the
    page is fetched and the best video is streamed back as a download; on
    failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    generic_error = "<p class='error'>Sorry, there was an error. Check your URL?</p>"

    # The URL comes straight from the client: refuse anything that is not
    # plain HTTP(S) so the opener cannot be pointed at file:// or ftp://.
    if urlparse.urlsplit(url).scheme not in ("http", "https"):
        print_form(url=url, msg=generic_error)
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary of the CGI: show a generic message instead of
        # letting a traceback reach the client.
        print_form(url=url, msg=generic_error)
232
def pp_size(size):
    """Format a byte count for display using binary unit suffixes.

    Args:
        size: Number of bytes (or bytes per second).

    Returns:
        A string such as ``"1.50 KiB"``. Plain byte counts have an empty
        suffix (``"12.00 "``); values of 1024 GiB and more stay in GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop dividing at the largest suffix, otherwise the number would be
        # scaled to the next unit while still being labelled GiB.
        if size < 1024 or i == largest:
            break
        size = size / 1024.0
    return "%.2f %s" % (size, suffix)
240
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* while drawing a one-line progress display.

    Args:
        content_length: Number shown as the expected size in the display.
        infile: Object read in 32768-byte chunks until it returns no data.
        outfile: Object receiving every chunk through ``write``.

    The status line is redrawn on stdout whenever more than half a second
    has passed since the last redraw, and once more at the end, followed by
    a newline.
    """
    def report(timestamp, window_start, window_bytes, total_bytes):
        # Transfer rate over the time since the previous redraw.
        if timestamp != window_start:
            rate = window_bytes / (timestamp - window_start)
        else:
            rate = 0
        # Erase the current terminal line and return to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now, window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        window_bytes += len(chunk)
        total_bytes += len(chunk)

    # Final status, then the newline that ends the progress line.
    report(now, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
274
def main():
    """Command-line entry point: download the video for the URL in argv[1].

    The video is appended to a file named after the page title in the
    current directory, so an interrupted download is resumed. Dropped
    connections are retried from the current file position until the file
    is complete.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # Make the initial position explicit; it is the resume offset.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # resource, i.e. everything is already on disk.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length only covers the bytes from the offset
                    # on; add the offset to get the size of the whole file.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off: end the progress line and retry
                    # from the current position below.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
322
323
if __name__ == "__main__":
    # Cap the address space before doing any work.
    memory_limit = (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES)
    resource.setrlimit(resource.RLIMIT_AS, memory_limit)

    # SCRIPT_NAME in the environment selects CGI mode; otherwise run as a
    # command-line tool.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
334