about summary refs log tree commit diff stats
path: root/youtube/yt_data_extract
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- youtube/yt_data_extract/common.py 26
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 34
2 files changed, 58 insertions, 2 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 877444e..974d981 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -73,6 +73,15 @@ def conservative_update(obj, key, value):
if obj.get(key) is None:
obj[key] = value
+def concat_or_none(*strings):
+    '''Join the given strings in order; return None if any argument is None.
+
+    Called with no arguments, returns the empty string.'''
+    # str.join raises TypeError on None, so screen for it before joining.
+    if any(string is None for string in strings):
+        return None
+    return ''.join(strings)
+
def remove_redirect(url):
if url is None:
return None
@@ -268,6 +277,23 @@ def extract_item_info(item, additional_info={}):
info['approx_view_count'] = '0'
info['duration'] = extract_str(item.get('lengthText'))
+
+ # if it's an item in a playlist, get its index
+ if 'index' in item: # url has wrong index on playlist page
+ info['index'] = extract_int(item.get('index'))
+ elif 'indexText' in item:
+ # Current item in playlist has ▶ instead of the actual index, must
+ # dig into url
+ match = re.search(r'index=(\d+)', deep_get(item,
+ 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
+ 'url', default=''))
+ if match is None: # worth a try then
+ info['index'] = extract_int(item.get('indexText'))
+ else:
+ info['index'] = int(match.group(1))
+ else:
+ info['index'] = None
+
elif primary_type in ('playlist', 'radio'):
info['id'] = item.get('playlistId')
info['video_count'] = extract_int(item.get('videoCount'))
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index bc02313..0b30c91 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -2,7 +2,7 @@ from .common import (get, multi_get, deep_get, multi_deep_get,
liberal_update, conservative_update, remove_redirect, normalize_url,
extract_str, extract_formatted_text, extract_int, extract_approx_int,
extract_date, check_missing_keys, extract_item_info, extract_items,
- extract_response)
+ extract_response, concat_or_none)
import json
import urllib.parse
@@ -160,7 +160,37 @@ def _extract_watch_info_mobile(top_level):
response = top_level.get('response', {})
- # video info from metadata renderers
+ # this renderer has the stuff visible on the page
+ # check for playlist
+ items, _ = extract_items(response,
+ item_types={'singleColumnWatchNextResults'})
+ if items:
+ watch_next_results = items[0]['singleColumnWatchNextResults']
+ playlist = deep_get(watch_next_results, 'playlist', 'playlist')
+ if playlist is None:
+ info['playlist'] = None
+ else:
+ info['playlist'] = {}
+ info['playlist']['title'] = playlist.get('title')
+ info['playlist']['author'] = extract_str(multi_get(playlist,
+ 'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
+ author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
+ 'navigationEndpoint', 'browseEndpoint', 'browseId')
+ info['playlist']['author_id'] = author_id
+ if author_id:
+ info['playlist']['author_url'] = concat_or_none(
+ 'https://www.youtube.com/channel/', author_id)
+ info['playlist']['id'] = playlist.get('playlistId')
+ info['playlist']['url'] = concat_or_none(
+ 'https://www.youtube.com/playlist?list=',
+ info['playlist']['id'])
+ info['playlist']['video_count'] = playlist.get('totalVideos')
+ info['playlist']['current_index'] = playlist.get('currentIndex')
+ info['playlist']['items'] = [
+ extract_item_info(i) for i in playlist.get('contents', ())]
+
+ # Holds the visible video info. It is inside singleColumnWatchNextResults
+ # but use our convenience function instead
items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
if items:
video_info = items[0]['slimVideoMetadataRenderer']