]> code.delx.au - youtube-cgi/blob - youtube.cgi
initial import
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 import cookielib
4 import cgi
5 import itertools
6 import json
7 from lxml.html import document_fromstring, tostring
8 import os
9 import re
10 import resource
11 import shutil
12 import subprocess
13 import sys
14 import urllib
15 import urllib2
16 import urlparse
17
18
# Address-space limit (128 MiB) handed to resource.setrlimit at startup.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header attached to every request made through urlopen().
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension (without the leading dot).
# Streams whose MIME type is not listed here are ignored.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality label -> rank; a larger rank is preferred when picking a stream.
QUALITIES = {
    "large": 3,
    "medium": 2,
    "small": 1,
}
33
34
class VideoUnavailable(Exception):
    """Raised when a watch page does not yield a downloadable video.

    The first argument is a human-readable reason that the CGI front end
    shows to the user.
    """
37
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    Args:
        url: Text pre-filled into the URL input box. It is treated as
            untrusted and escaped before being placed in the attribute.
        msg: HTML fragment inserted verbatim below the page heading.
            Callers must escape any untrusted text they embed in it.

    Raises:
        KeyError: if HTTP_HOST or REQUEST_URI is missing from the
            environment.
    """
    # Function-scope import: the module itself does not import this helper.
    from xml.sax.saxutils import escape

    # REQUEST_URI includes the current query string. Drop it, because the
    # bookmarklet appends its own "?url=" to this address.
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The address ends up inside a single-quoted JavaScript string, so the
    # characters that could terminate or escape that string are
    # percent-encoded.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    attr_quotes = {'"': "&quot;"}
    values = {
        "{0}": msg,
        "{1}": escape(url, attr_quotes),
        "{2}": escape(script_url, attr_quotes),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute in a single pass so that text inserted for one placeholder
    # is never rescanned for another (e.g. a url containing "{2}").
    # str.format is not usable here: the CSS and JavaScript contain braces.
    page = re.sub(r"\{[012]\}", lambda m: values[m.group(0)], template)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n\n")
    sys.stdout.write(page + "\n")
72
# Shared HTTP state used by urlopen(): a single opener that keeps cookies in
# `cookiejar` across requests, and the URL of the previous request, which is
# sent as the Referer of the next one (empty until the first request).
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
referrer = ""
76
def urlopen(url):
    """Open *url* through the shared cookie-aware opener.

    The previously opened URL (if any) is sent as the Referer header, the
    module-level `referrer` is then updated to *url*, and USER_AGENT is sent
    as the User-Agent header.

    Args:
        url: Absolute http or https URL.

    Returns:
        The response object returned by the opener.

    Raises:
        urllib2.URLError: if the URL scheme is not http or https, or for
            failures reported by the opener itself.
    """
    global referrer

    # URLs reach this function from a CGI parameter and from scraped page
    # content. urllib2's default opener also handles file:// and ftp://, so
    # anything other than http(s) is refused before a request is built.
    scheme = urlparse.urlsplit(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise urllib2.URLError("unsupported URL scheme: %r" % scheme)

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url
    req.add_header("User-Agent", USER_AGENT)
    return urlopener.open(req)
85
def parse_url(url):
    """Fetch *url* and return its body parsed by lxml's document_fromstring.

    Args:
        url: Address passed unchanged to urlopen().

    Returns:
        The parsed HTML document.
    """
    f = urlopen(url)
    try:
        # Close the response even when reading or parsing fails.
        return document_fromstring(f.read())
    finally:
        f.close()
91
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Args:
        url: URL whose query string is extended.
        params: Mapping of parameter name to a value or list of values. A
            name already present in the URL is replaced, not duplicated.

    Returns:
        The rebuilt URL. Existing parameters are kept, including those with
        an empty value; their order in the query string is not guaranteed.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 of a urlsplit() result is the query component. The second
    # argument (keep_blank_values) stops parse_qs from discarding "key="
    # pairs, which would otherwise vanish from the rebuilt URL.
    query = urlparse.parse_qs(parts[3], True)
    query.update(params)
    # doseq=True expands each list of values into repeated "key=value" pairs.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
99
def convert_from_old_itag(player_config):
    """Add a "url" field to an old-style stream map, in place.

    In the old format each "itag" value carries the stream address after a
    "url=" marker. This rewrites
    player_config["args"]["url_encoded_fmt_stream_map"] so that it also has
    one "url" entry per "itag" entry, holding the text after that marker.

    Args:
        player_config: Player configuration dict; modified in place.

    Raises:
        KeyError: if the stream map has no "itag" field.
    """
    args = player_config["args"]
    fields = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    # Skip the four characters of the "url=" marker itself.
    fields["url"] = [
        value[value.find("url=") + 4:] for value in fields["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(fields, True)
107
def get_player_config(doc):
    """Extract the player configuration from a page's inline scripts.

    Each line of every <script> element is checked for two markers:

    * "yt.playerConfig =": the JSON between the first "=" and the last ";"
      on the line is returned as-is.
    * "'PLAYER_CONFIG': ": the JSON after the first ":" is parsed, passed
      through convert_from_old_itag(), and returned.

    Args:
        doc: Parsed document offering xpath().

    Returns:
        The decoded configuration dict, or None when no marker is found.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation; this assumes the PLAYER_CONFIG branch returns as soon as
    # it has parsed a line -- confirm against the original file.
    for node in doc.xpath("//script"):
        text = node.text
        if not text:
            continue
        for line in text.split("\n"):
            if "yt.playerConfig =" in line:
                start = line.find("=")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    return json.loads(line[start + 1:end])
            if "'PLAYER_CONFIG': " in line:
                colon = line.find(":")
                if colon >= 0:
                    config = json.loads(line[colon + 1:])
                    convert_from_old_itag(config)
                    return config
    return None
125
def get_best_video(player_config):
    """Pick the highest-ranked stream that uses a supported MIME type.

    Args:
        player_config: Player configuration dict whose
            ["args"]["url_encoded_fmt_stream_map"] holds the stream list.

    Returns:
        (url, extension), where extension includes the leading dot, or
        (None, None) when no stream has a MIME type listed in MIMETYPES.

    Raises:
        KeyError: if the stream map lacks "url", "type" or "quality".
    """
    fields = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
    streams = itertools.izip_longest(
        fields["url"],
        fields["type"],
        fields["quality"],
        fields.get("sig", []),
    )
    best_url = None
    best_quality = None
    best_extension = None
    for video_url, mimetype, quality, signature in streams:
        # izip_longest pads the shorter lists with None. An entry without a
        # URL or MIME type cannot be used, so skip it instead of failing on
        # None.split().
        if video_url is None or mimetype is None:
            continue
        mimetype = mimetype.split(";")[0]
        if mimetype not in MIMETYPES:
            continue
        # A missing or unrecognised quality label ranks below every known one.
        if quality is None:
            rank = -1
        else:
            rank = QUALITIES.get(quality.split(",")[0], -1)
        if best_quality is None or rank > best_quality:
            if signature:
                video_url = append_to_qs(video_url, {"signature": signature})
            best_url = video_url
            best_quality = rank
            best_extension = "." + MIMETYPES[mimetype]

    return best_url, best_extension
151
def get_video_url(doc):
    """Work out the download URL and a safe filename for a watch page.

    Args:
        doc: Parsed watch page offering xpath().

    Returns:
        (video_url, filename), or (None, None) when the player
        configuration lists no stream in a supported format. The filename
        is derived from the page title, reduced to a conservative set of
        ASCII characters, plus the stream's extension.

    Raises:
        VideoUnavailable: if the page carries an "unavailable-message"
            element or no player configuration can be found.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    # A page without a <title> used to raise IndexError here; fall back to a
    # fixed name instead.
    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else "video"
    title = re.sub(r"\s+", " ", title.strip())
    valid_chars = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
    stem = "".join(c for c in title.encode("ascii", "ignore") if c in valid_chars)
    # Avoid a nameless file such as ".mp4" when nothing in the title
    # survives the filter.
    filename = (stem or "video") + extension

    return video_url, filename
172
def cgimain():
    """Handle one CGI request.

    Without a "url" parameter the empty form is shown. Otherwise the watch
    page is fetched and the selected video is streamed to stdout as an
    attachment. Any failure re-displays the form with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url returns (None, None) when no stream is usable.
            raise VideoUnavailable("Could not find video URL")
        data = urlopen(video_url)
        try:
            httpinfo = data.info()
            # RFC 3875 requires a Content-Type whenever a body is returned;
            # forward the upstream type, or fall back to a generic one.
            content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
            content_length = httpinfo.getheader("Content-Length")
            sys.stdout.write("Content-Type: %s\r\n" % content_type)
            sys.stdout.write("Content-Disposition: attachment; filename=\"%s\"\r\n" % filename)
            # Omit the header rather than sending "Content-Length: None".
            if content_length is not None:
                sys.stdout.write("Content-Length: %s\r\n" % content_length)
            sys.stdout.write("\r\n")
            shutil.copyfileobj(data, sys.stdout)
        finally:
            data.close()
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: any other failure gets a generic message.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
    return
202
def main():
    """Command-line entry point: download the video named by sys.argv[1].

    The video is written to the current directory under the filename
    chosen by get_video_url(). Exits with status 1 when no URL argument is
    given or when the page offers no downloadable stream.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url returns (None, None) when no stream is usable.
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)

    data = urlopen(video_url)
    try:
        # Binary mode: the payload is video data, not text.
        with open(filename, "wb") as outfile:
            shutil.copyfileobj(data, outfile)
    finally:
        data.close()
216
217
if __name__ == "__main__":
    # Cap the process address space (soft and hard limit) before any work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # The presence of SCRIPT_NAME in the environment selects CGI mode;
    # otherwise the script runs as a command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        main()
224