diff options
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- | youtube/yt_data_extract/common.py | 7 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 95 |
2 files changed, 83 insertions, 19 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index e7b31b7..7903db5 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -185,7 +185,7 @@ def extract_int(string, default=None, whole_word=True): return default def extract_approx_int(string): - '''e.g. "15.1M" from "15.1M subscribers"''' + '''e.g. "15.1M" from "15.1M subscribers" or '4,353' from 4353''' if not isinstance(string, str): string = extract_str(string) if not string: @@ -193,7 +193,10 @@ def extract_approx_int(string): match = re.search(r'\b(\d+(?:\.\d+)?[KMBTkmbt]?)\b', string.replace(',', '')) if match is None: return None - return match.group(1) + result = match.group(1) + if re.fullmatch(r'\d+', result): + result = '{:,}'.format(int(result)) + return result MONTH_ABBREVIATIONS = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} def extract_date(date_text): diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 7740c67..0f64649 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -85,23 +85,84 @@ def extract_channel_info(polymer_json, tab, continuation=False): if tab in ('search', 'playlists'): info['is_last_page'] = (ctoken is None) elif tab == 'about': - items, _ = extract_items(response, item_types={'channelAboutFullMetadataRenderer'}) - if not items: - info['error'] = 'Could not find channelAboutFullMetadataRenderer' - return info - channel_metadata = items[0]['channelAboutFullMetadataRenderer'] - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = remove_redirect(deep_get(link_json, 'navigationEndpoint', 'urlEndpoint', 'url')) - if not (url.startswith('http://') or url.startswith('https://')): - url = 'http://' + url - text = extract_str(link_json.get('title')) - info['links'].append( (text, url) ) - - info['date_joined'] = extract_date(channel_metadata.get('joinedDateText')) - info['view_count'] = extract_int(channel_metadata.get('viewCountText')) - info['description'] = extract_str(channel_metadata.get('description'), default='') + # Latest type + items, _ = extract_items(response, item_types={'aboutChannelRenderer'}) + if items: + a_metadata = deep_get(items, 0, 'aboutChannelRenderer', + 'metadata', 'aboutChannelViewModel') + if not a_metadata: + info['error'] = 'Could not find aboutChannelViewModel' + return info + + info['links'] = [] + for link_outer in a_metadata.get('links', ()): + link = link_outer.get('channelExternalLinkViewModel') or {} + link_content = extract_str(deep_get(link, 'link', 'content')) + for run in deep_get(link, 'link', 'commandRuns') or (): + url = remove_redirect(deep_get(run, 'onTap', + 'innertubeCommand', 'urlEndpoint', 'url')) + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + if link_content is None or (link_content in url): + break + else: # didn't break + url = link_content + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + text = extract_str(deep_get(link, 'title', 'content')) + info['links'].append( (text, url) ) + + info['date_joined'] = extract_date( + a_metadata.get('joinedDateText') + ) + info['view_count'] = extract_int(a_metadata.get('viewCountText')) + info['approx_view_count'] = extract_approx_int( + a_metadata.get('viewCountText') + ) + info['description'] = extract_str( + a_metadata.get('description'), default='' + ) + info['approx_video_count'] = extract_approx_int( + a_metadata.get('videoCountText') + ) + info['approx_subscriber_count'] = extract_approx_int( + a_metadata.get('subscriberCountText') + ) + info['country'] = extract_str(a_metadata.get('country')) + info['canonical_url'] = extract_str( + a_metadata.get('canonicalChannelUrl') + ) + + # Old type + else: + items, _ = extract_items(response, + item_types={'channelAboutFullMetadataRenderer'}) + if not items: + info['error'] = 'Could not find aboutChannelRenderer or channelAboutFullMetadataRenderer' + return info + a_metadata = items[0]['channelAboutFullMetadataRenderer'] + + info['links'] = [] + for link_json in a_metadata.get('primaryLinks', ()): + url = remove_redirect(deep_get(link_json, 'navigationEndpoint', + 'urlEndpoint', 'url')) + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + text = extract_str(link_json.get('title')) + info['links'].append( (text, url) ) + + info['date_joined'] = extract_date(a_metadata.get('joinedDateText')) + info['view_count'] = extract_int(a_metadata.get('viewCountText')) + info['description'] = extract_str(a_metadata.get( + 'description'), default='') + + info['approx_video_count'] = None + info['approx_subscriber_count'] = None + info['country'] = None + info['canonical_url'] = None else: raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) |