code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25         "video/mp4": "mp4",
  26         "video/x-flv": "flv",
  27         "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31         "hd1080": 5,
  32         "hd720": 4,
  33         "large": 3,
  34         "medium": 2,
  35         "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40         pass
  41
  42 def print_form(url="", msg=""):
  43         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45         sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49         <title>delx.net.au - YouTube Scraper</title>
  50         <link rel="stylesheet" type="text/css" href="/style.css"/>
  51         <style type="text/css">
  52                 input[type="text"] {
  53                         width: 100%;
  54                 }
  55                 .error {
  56                         color: red;
  57                 }
  58         </style>
  59 </head>
  60 <body>
  61         <h1>delx.net.au - YouTube Scraper</h1>
  62         {0}
  63         <form action="" method="get">
  64         <p>This page will let you easily download YouTube videos to watch offline. It
  65         will automatically grab the highest quality version.</p>
  66         <div><input type="text" name="url" value="{1}"/></div>
  67         <div><input type="submit" value="Download!"/></div>
  68         </form>
  69         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70         to easily download videos. Right-click the link and add it to bookmarks,
  71         then when you're looking at a YouTube page select that bookmark from your
  72         browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# Shared HTTP state used by urlopen(): one cookie jar and one opener built
# around it are reused for every request.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the previous request; urlopen() sends it as the Referer header of
# the next request and then overwrites it.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82         global referrer
  83         req = urllib2.Request(url)
  84         if referrer:
  85                 req.add_header("Referer", referrer)
  86         referrer = url
  87
  88         req.add_header("User-Agent", USER_AGENT)
  89
  90         if offset:
  91                 req.add_header("Range", "bytes=%d-" % offset)
  92
  93         res = urlopener.open(req)
  94
  95         content_range = res.info().getheader("Content-Range")
  96         if content_range:
  97                 tokens = content_range.split()
  98                 assert tokens[0] == "bytes"
  99                 start = int(tokens[1].split("-")[0])
 100                 assert start == offset
 101         return res
 102
 103 def parse_url(url):
 104         f = urlopen(url)
 105         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 106         f.close()
 107         return doc
 108
 109 def append_to_qs(url, params):
 110         r = list(urlparse.urlsplit(url))
 111         qs = urlparse.parse_qs(r[3])
 112         qs.update(params)
 113         r[3] = urllib.urlencode(qs, True)
 114         url = urlparse.urlunsplit(r)
 115         return url
 116
 117 def convert_from_old_itag(player_config):
 118         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 119         url_data["url"] = []
 120         for itag_url in url_data["itag"]:
 121                 pos = itag_url.find("url=")
 122                 url_data["url"].append(itag_url[pos+4:])
 123         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 124
 125 def get_player_config(doc):
 126         player_config = None
 127         for script in doc.xpath("//script"):
 128                 if not script.text:
 129                         continue
 130                 for line in script.text.split("\n"):
 131                         if "yt.playerConfig =" in line:
 132                                 p1 = line.find("=")
 133                                 p2 = line.rfind(";")
 134                                 if p1 >= 0 and p2 > 0:
 135                                         return json.loads(line[p1+1:p2])
 136                         if "'PLAYER_CONFIG': " in line:
 137                                 p1 = line.find(":")
 138                                 if p1 >= 0:
 139                                         player_config = json.loads(line[p1+1:])
 140                                         convert_from_old_itag(player_config)
 141                                         return player_config
 142
 143 def get_best_video(player_config):
 144         url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 145
 146         best_url = None
 147         best_quality = None
 148         best_extension = None
 149         for url_data in url_data_list:
 150                 url_data = urlparse.parse_qs(url_data)
 151                 video_url = url_data["url"][0]
 152                 mimetype = url_data["type"][0].split(";")[0]
 153                 quality = url_data["quality"][0]
 154                 signature = url_data["sig"][0]
 155
 156                 if quality not in QUALITIES:
 157                         continue
 158                 if mimetype not in MIMETYPES:
 159                         continue
 160
 161                 extension = MIMETYPES[mimetype]
 162                 quality = QUALITIES.get(quality, -1)
 163                 video_url = append_to_qs(video_url, {"signature": signature})
 164
 165                 if best_quality is None or quality > best_quality:
 166                         best_url = video_url
 167                         best_quality = quality
 168                         best_extension = extension
 169
 170         return best_url, best_extension
 171
 172 def sanitize_filename(filename):
 173         return (
 174                 re.sub("\s+", " ", filename.strip())
 175                 .replace("\\", "-")
 176                 .replace("/", "-")
 177                 .replace("\0", " ")
 178         )
 179
 180 def get_video_url(doc):
 181         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 182         if unavailable:
 183                 raise VideoUnavailable(unavailable[0].strip())
 184
 185         player_config = get_player_config(doc)
 186         if not player_config:
 187                 raise VideoUnavailable("Could not find video URL")
 188
 189         video_url, extension = get_best_video(player_config)
 190         if not video_url:
 191                 return None, None
 192
 193         title = doc.xpath("/html/head/title/text()")[0]
 194         filename = sanitize_filename(title)
 195         filename += "." + extension
 196
 197         return video_url, filename
 198
 199 def write_video(filename, video_data):
 200         httpinfo = video_data.info()
 201         encoded_filename = urllib.quote(filename.encode("utf-8"))
 202         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 203         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 204         sys.stdout.write("\r\n")
 205         shutil.copyfileobj(video_data, sys.stdout)
 206         video_data.close()
 207
 208 def cgimain():
 209         args = cgi.parse()
 210         try:
 211                 url = args["url"][0]
 212         except:
 213                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 214                 return
 215
 216         try:
 217                 doc = parse_url(url)
 218                 video_url, filename = get_video_url(doc)
 219                 video_data = urlopen(video_url)
 220                 write_video(filename, video_data)
 221         except VideoUnavailable, e:
 222                 print_form(
 223                         url=url,
 224                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 225                 )
 226         except Exception, e:
 227                 print_form(
 228                         url=url,
 229                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 230                 )
 231                 return
 232
 233 def pp_size(size):
 234         suffixes = ["", "KiB", "MiB", "GiB"]
 235         for i, suffix in enumerate(suffixes):
 236                 if size < 1024:
 237                         break
 238                 size /= 1024
 239         return "%.2f %s" % (size, suffix)
 240
 241 def copy_with_progress(content_length, infile, outfile):
 242         def print_status():
 243                 rate = 0
 244                 if now != last_ts:
 245                         rate = last_bytes_read / (now - last_ts)
 246                 sys.stdout.write("\33[2K\r")
 247                 sys.stdout.write("%s / %s (%s/sec)" % (
 248                         pp_size(bytes_read),
 249                         pp_size(content_length),
 250                         pp_size(rate),
 251                 ))
 252                 sys.stdout.flush()
 253
 254         last_ts = 0
 255         last_bytes_read = 0
 256         bytes_read = 0
 257         while True:
 258                 now = time.time()
 259                 if now - last_ts > 0.5:
 260                         print_status()
 261                         last_ts = now
 262                         last_bytes_read = 0
 263
 264                 buf = infile.read(32768)
 265                 if not buf:
 266                         break
 267                 outfile.write(buf)
 268                 last_bytes_read += len(buf)
 269                 bytes_read += len(buf)
 270
 271         # Newline at the end
 272         print_status()
 273         print
 274
 275 def main():
 276         try:
 277                 url = sys.argv[1]
 278         except:
 279                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 280                 sys.exit(1)
 281
 282         doc = parse_url(url)
 283         video_url, filename = get_video_url(doc)
 284         print "Downloading", filename.encode("utf-8")
 285
 286         outfile = open(filename, "a")
 287         offset = outfile.tell()
 288         if offset > 0:
 289                 print "Resuming download from", pp_size(offset)
 290         total_size = None
 291
 292         while True:
 293                 try:
 294                         video_data = urlopen(video_url, offset)
 295                 except urllib2.HTTPError, e:
 296                         if e.code == 416:
 297                                 print "File is complete!"
 298                                 break
 299                         else:
 300                                 raise
 301
 302                 content_length = int(video_data.info().getheader("Content-Length"))
 303                 if total_size is None:
 304                         total_size = content_length
 305
 306                 try:
 307                         copy_with_progress(content_length, video_data, outfile)
 308                 except IOError, e:
 309                         print
 310
 311                 video_data.close()
 312                 if outfile.tell() != total_size:
 313                         old_offset = offset
 314                         offset = outfile.tell()
 315                         if old_offset == offset:
 316                                 time.sleep(1)
 317                         print "Restarting download from", pp_size(offset)
 318                 else:
 319                         break
 320
 321         outfile.close()
 322
 323
 324 if __name__ == "__main__":
 325         resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 326         if os.environ.has_key("SCRIPT_NAME"):
 327                 cgimain()
 328         else:
 329                 try:
 330                         main()
 331                 except KeyboardInterrupt:
 332                         print "\nExiting..."
 333                         sys.exit(1)
 334