From 75e8930958ea305a26a4652ef88639f7ad5db356 Mon Sep 17 00:00:00 2001
From: James Taylor <user234683@users.noreply.github.com>
Date: Mon, 19 Oct 2020 12:55:03 -0700
Subject: yt_data_extract: normalize thumbnail and author urls

for instance, urls that start with // become https://

An adjustment was required in comments.py because the url was mistakenly
left as a relative url in yt_data_extract, and comments.py was prepending
URL_ORIGIN as a workaround.

see #31
---
 youtube/yt_data_extract/common.py          | 17 +++++++++++------
 youtube/yt_data_extract/everything_else.py | 12 ++++++------
 2 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..9610479 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -90,15 +90,20 @@ def remove_redirect(url):
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
 def normalize_url(url):
+    '''Insert https, resolve relative paths for youtube.com, and put www. in front of youtube.com'''
     if url is None:
         return None
-    match = youtube_url_re.fullmatch(url)
+    match = norm_url_re.fullmatch(url)
     if match is None:
-        raise Exception()
+        raise Exception(url)
 
-    return 'https://www.youtube.com' + match.group(1)
+    domain = match.group(1) or 'www.youtube.com'
+    if domain == 'youtube.com':
+        domain = 'www.youtube.com'
+
+    return 'https://' + domain + match.group(2)
 
 def _recover_urls(runs):
     for run in runs:
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
         ))
         info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
     info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
-    info['thumbnail'] = multi_deep_get(item,
+    info['thumbnail'] = normalize_url(multi_deep_get(item,
         ['thumbnail', 'thumbnails', 0, 'url'],      # videos
         ['thumbnails', 0, 'thumbnails', 0, 'url'],  # playlists
         ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
-    )
+    ))
 
     info['badges'] = []
     for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index 5bb8709..d91dad5 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -49,10 +49,10 @@ def extract_channel_info(polymer_json, tab):
     if info['short_description'] and len(info['short_description']) > 730:
         info['short_description'] = info['short_description'][0:730] + '...'
     info['channel_name'] = metadata.get('title')
-    info['avatar'] = multi_deep_get(metadata,
+    info['avatar'] = normalize_url(multi_deep_get(metadata,
         ['avatar', 'thumbnails', 0, 'url'],
         ['thumbnail', 'thumbnails', 0, 'url'],
-    )
+    ))
     channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
     if channel_url:
         channel_id = get(channel_url.rstrip('/').split('/'), -1)
@@ -263,13 +263,13 @@ def extract_comments_info(polymer_json):
 
         # These 3 are sometimes absent, likely because the channel was deleted
         comment_info['author'] = extract_str(comment_renderer.get('authorText'))
-        comment_info['author_url'] = deep_get(comment_renderer,
-            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_url'] = normalize_url(deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
         comment_info['author_id'] = deep_get(comment_renderer,
             'authorEndpoint', 'browseEndpoint', 'browseId')
 
-        comment_info['author_avatar'] = deep_get(comment_renderer,
-            'authorThumbnail', 'thumbnails', 0, 'url')
+        comment_info['author_avatar'] = normalize_url(deep_get(
+            comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
         comment_info['id'] = comment_renderer.get('commentId')
         comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
         comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
-- 
cgit v1.2.3


From c9d0f685a43d95d653db56a00efe520e3a04d0d2 Mon Sep 17 00:00:00 2001
From: James Taylor <user234683@users.noreply.github.com>
Date: Mon, 19 Oct 2020 13:53:57 -0700
Subject: Use get_video_info to get video urls if player response missing

Fixes failure mode 1 in #22
---
 youtube/yt_data_extract/watch_extraction.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 340a367..f89cec1 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -447,7 +447,8 @@ def _extract_playability_error(info, player_response, error_prefix=''):
 
 SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
-    info = {'playability_error': None, 'error': None}
+    info = {'playability_error': None, 'error': None,
+        'player_response_missing': None}
 
     if isinstance(polymer_json, dict):
         top_level = polymer_json
@@ -477,6 +478,10 @@ def extract_watch_info(polymer_json):
     else:
         embedded_player_response = {}
 
+    # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+    info['player_response_missing'] = not (
+        player_response or embedded_player_response)
+
     # captions
     info['automatic_caption_languages'] = []
     info['manual_caption_languages'] = []
@@ -580,7 +585,8 @@ def get_caption_url(info, language, format, automatic=False, translation_languag
     return url
 
 def update_with_age_restricted_info(info, video_info_page):
-    ERROR_PREFIX = 'Error bypassing age-restriction: '
+    '''Inserts urls from 'player_response' in get_video_info page'''
+    ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
 
     video_info = urllib.parse.parse_qs(video_info_page)
     player_response = deep_get(video_info, 'player_response', 0)
-- 
cgit v1.2.3


From a27b575380378f1b490dcabb8cc67f05adee5daa Mon Sep 17 00:00:00 2001
From: zrose584 <57181548+zrose584@users.noreply.github.com>
Date: Wed, 21 Oct 2020 10:35:01 +0200
Subject: remove trailing whitespaces

---
 youtube/yt_data_extract/common.py           | 2 +-
 youtube/yt_data_extract/everything_else.py  | 2 +-
 youtube/yt_data_extract/watch_extraction.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 9610479..683b1c6 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -295,7 +295,7 @@ def extract_item_info(item, additional_info={}):
         info['duration'] = extract_str(item.get('lengthText'))
 
         # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page 
+        if 'index' in item: # url has wrong index on playlist page
             info['index'] = extract_int(item.get('index'))
         elif 'indexText' in item:
             # Current item in playlist has ▶ instead of the actual index, must
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index d91dad5..b4b612d 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -164,7 +164,7 @@ def extract_playlist_metadata(polymer_json):
     metadata['video_count'] = extract_int(header.get('numVideosText'))
     metadata['description'] = extract_str(header.get('descriptionText'), default='')
     metadata['author'] = extract_str(header.get('ownerText'))
-    metadata['author_id'] = multi_deep_get(header, 
+    metadata['author_id'] = multi_deep_get(header,
         ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
         ['ownerEndpoint', 'browseEndpoint', 'browseId'])
     if metadata['author_id']:
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index f89cec1..5e57c15 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -172,7 +172,7 @@ def _extract_watch_info_mobile(top_level):
         else:
             info['playlist'] = {}
             info['playlist']['title'] = playlist.get('title')
-            info['playlist']['author'] = extract_str(multi_get(playlist, 
+            info['playlist']['author'] = extract_str(multi_get(playlist,
                 'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
             author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
                 'navigationEndpoint', 'browseEndpoint', 'browseId')
-- 
cgit v1.2.3