# Filename-sanitising helper added to youtube/util.py by commit 5f4884dc
# ("Put vid title at end of download urls so downloads w/ that filename",
# James Taylor, 2020-10-22).
#
# The same commit touches two call sites that are only partially visible here:
#   * youtube/watch.py, get_watch_page(): replaces '/videoplayback' in every
#     format URL with '/videoplayback/name/' followed by
#     urllib.parse.quote(to_valid_filename(title)), plus '.' + ext when the
#     format has an extension.
#   * server.py, proxy_site(): for video requests whose URL contains
#     '/videoplayback/name/', cuts the URL at the last '/name/' before the
#     query string is appended.

import collections
import re

# https://stackoverflow.com/a/62888
# Order matters: ': ' has to be tried before the bare ':' so that "a: b"
# becomes "a - b" rather than "a- b".
replacement_map = collections.OrderedDict([
    ('<', '_'),
    ('>', '_'),
    (': ', ' - '),
    (':', '-'),
    ('"', "'"),
    ('/', '_'),
    ('\\', '_'),
    ('|', '-'),
    ('?', ''),
    ('*', '_'),
    ('\t', ' '),
])

# Device names Windows reserves (compared case-insensitively)
DOS_names = {
    'con', 'prn', 'aux', 'nul',
    'com0', 'com1', 'com2', 'com3', 'com4',
    'com5', 'com6', 'com7', 'com8', 'com9',
    'lpt0', 'lpt1', 'lpt2', 'lpt3', 'lpt4',
    'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
}

# Control characters other than tab. Tab is deliberately left out so the
# '\t' -> ' ' entry in replacement_map gets a chance to apply.
_CONTROL_CHARS = re.compile(r'[\x00-\x08\x0a-\x1f]')


def to_valid_filename(name):
    '''Changes the name so it's valid on Windows, Linux, and Mac

    name: arbitrary text (e.g. a video title) to use as a filename.

    Returns a non-empty string with no control or reserved characters, no
    trailing period/space, no leading hyphen/period/space, and whose part
    before the first period is not a reserved DOS device name.
    '''
    # See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
    # for Windows specs

    # Additional recommendations for Linux:
    # https://dwheeler.com/essays/fixing-unix-linux-filenames.html#standards

    # remove control characters (tab is handled by replacement_map below)
    name = _CONTROL_CHARS.sub('_', name)

    # reserved characters
    for reserved_char, replacement in replacement_map.items():
        name = name.replace(reserved_char, replacement)

    # A name made only of periods/spaces would be stripped to nothing by the
    # rstrip below, so keep its length by swapping in underscores instead
    if all(c in '. ' for c in name):
        name = '_' * len(name)

    # remove trailing periods and spaces
    name = name.rstrip('. ')

    # Check for reserved DOS names, such as nul, nul.txt or nul.tar.gz.
    # Windows treats a reserved name followed by any extension(s) as the
    # device itself, so the base is everything before the FIRST period.
    base, dot, rest = name.partition('.')
    if base.lower() in DOS_names:
        name = base + '_' + dot + rest

    # check for blank name
    if not name:
        name = '_'

    # check if name begins with a hyphen, period, or space
    if name.startswith(('-', '.', ' ')):
        name = '_' + name

    return name