From 50ddeee3bf353c6bfd4130a7ad6948277c386e4b Mon Sep 17 00:00:00 2001 From: James Bunton Date: Wed, 19 Sep 2012 14:53:00 +1000 Subject: [PATCH] proper unicode support for filenames --- youtube.cgi | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/youtube.cgi b/youtube.cgi index 2546a14..293db8a 100755 --- a/youtube.cgi +++ b/youtube.cgi @@ -4,7 +4,7 @@ import cookielib import cgi import itertools import json -from lxml.html import document_fromstring, tostring +from lxml import html import os import re import resource @@ -37,8 +37,8 @@ class VideoUnavailable(Exception): def print_form(url="", msg=""): script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"]) - print "Content-Type: application/xhtml+xml\r\n\r\n" - print """ + sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n") + sys.stdout.write(""" @@ -68,7 +68,7 @@ def print_form(url="", msg=""): browser's bookmarks menu to download the video straight away.

-""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url) +""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url)) cookiejar = cookielib.CookieJar() urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) @@ -85,7 +85,7 @@ def urlopen(url): def parse_url(url): f = urlopen(url) - doc = document_fromstring(f.read()) + doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True)) f.close() return doc @@ -138,7 +138,7 @@ def get_best_video(player_config): mimetype = mimetype.split(";")[0] if mimetype not in MIMETYPES: continue - extension = "." + MIMETYPES[mimetype] + extension = MIMETYPES[mimetype] quality = QUALITIES.get(quality.split(",")[0], -1) if best_quality is None or quality > best_quality: if signature: @@ -149,6 +149,14 @@ def get_best_video(player_config): return best_url, best_extension +def sanitize_filename(filename): + return ( + re.sub("\s+", " ", filename.strip()) + .replace("\\", "-") + .replace("/", "-") + .replace("\0", " ") + ) + def get_video_url(doc): unavailable = doc.xpath("//div[@id='unavailable-message']/text()") if unavailable: @@ -163,13 +171,20 @@ def get_video_url(doc): return None, None title = doc.xpath("/html/head/title/text()")[0] - title = re.sub("\s+", " ", title.strip()) - valid_chars = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") - filename = "".join(c for c in title.encode("ascii", "ignore") if c in valid_chars) - filename += extension + filename = sanitize_filename(title) + filename += "." + extension return video_url, filename +def write_video(filename, video_data): + httpinfo = video_data.info() + encoded_filename = urllib.quote(filename.encode("utf-8")) + sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename) + sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length")) + sys.stdout.write("\r\n") + shutil.copyfileobj(video_data, sys.stdout) + video_data.close() + def cgimain(): args = cgi.parse() try: @@ -181,13 +196,8 @@ def cgimain(): try: doc = parse_url(url) video_url, filename = get_video_url(doc) - data = urlopen(video_url) - httpinfo = data.info() - sys.stdout.write("Content-Disposition: attachment; filename=\"%s\"\r\n" % filename) - sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length")) - sys.stdout.write("\r\n") - shutil.copyfileobj(data, sys.stdout) - data.close() + video_data = urlopen(video_url) + write_video(filename, video_data) except VideoUnavailable, e: print_form( url=url, -- 2.39.2