Extraction: Move channel extraction to yt_data_extract

author: James Taylor <user234683@users.noreply.github.com> 2019-09-08 17:28:11 -0700
committer: James Taylor <user234683@users.noreply.github.com> 2019-09-08 17:28:11 -0700
commit: bd343ed71f628e0f1dd1eb3f45fb4e04887f223f (patch)
tree: 72f9de0764b920b5d464290da68244f5ee8caa6d /youtube
parent: fb1a3531c59f5d9cee406295bbe006730695c249 (diff)
download: yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.tar.lz
yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.tar.xz
yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.zip
3 files changed, 124 insertions, 121 deletions
diff --git a/youtube/channel.py b/youtube/channel.py
index 79b7c9b..16d0a3f 100644
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -137,124 +137,6 @@ def get_channel_search_json(channel_id, query, page):
 
     return polymer_json
 
-def extract_info(polymer_json, tab):
-    response = polymer_json[1]['response']
-    try:
-        microformat = response['microformat']['microformatDataRenderer']
-
-    # channel doesn't exist or was terminated
-    # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
-    except KeyError:
-        if 'alerts' in response and len(response['alerts']) > 0:
-            result = ''
-            for alert in response['alerts']:
-                result += alert['alertRenderer']['text']['simpleText'] + '\n'
-            flask.abort(200, result)
-        elif 'errors' in response['responseContext']:
-            for error in response['responseContext']['errors']['error']:
-                if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
-                    flask.abort(404, 'This channel does not exist')
-        raise
-
-
-    info = {}
-    info['current_tab'] = tab
-
-
-    # stuff from microformat (info given by youtube for every page on channel)
-    info['short_description'] = microformat['description']
-    info['channel_name'] = microformat['title']
-    info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url']
-    channel_url = microformat['urlCanonical'].rstrip('/')
-    channel_id = channel_url[channel_url.rfind('/')+1:]
-    info['channel_id'] = channel_id
-    info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id
-
-    info['items'] = []
-
-    # empty channel
-    if 'contents' not in response and 'continuationContents' not in response:
-        return info
-
-
-    # find the tab with content
-    # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg
-    # TODO: maybe use the 'selected' attribute for this?
-    if 'continuationContents' not in response:
-        tab_renderer = None
-        tab_content = None
-        for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']:
-            try:
-                tab_renderer = tab_json['tabRenderer']
-            except KeyError:
-                tab_renderer = tab_json['expandableTabRenderer']
-            try:
-                tab_content = tab_renderer['content']
-                break
-            except KeyError:
-                pass
-        else:   # didn't break
-            raise Exception("No tabs found with content")
-        assert tab == tab_renderer['title'].lower()
-
-
-    # extract tab-specific info
-    if tab in ('videos', 'playlists', 'search'):    # find the list of items
-        if 'continuationContents' in response:
-            try:
-                items = response['continuationContents']['gridContinuation']['items']
-            except KeyError:
-                items = response['continuationContents']['sectionListContinuation']['contents']     # for search
-        else:
-            contents = tab_content['sectionListRenderer']['contents']
-            if 'itemSectionRenderer' in contents[0]:
-                item_section = contents[0]['itemSectionRenderer']['contents'][0]
-                try:
-                    items = item_section['gridRenderer']['items']
-                except KeyError:
-                    if "messageRenderer" in item_section:
-                        items = []
-                    else:
-                        raise Exception('gridRenderer missing but messageRenderer not found')
-            else:
-                items = contents    # for search
-
-        additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
-        info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items]
-
-    elif tab == 'about':
-        channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']
-
-
-        info['links'] = []
-        for link_json in channel_metadata.get('primaryLinks', ()):
-            url = link_json['navigationEndpoint']['urlEndpoint']['url']
-            if url.startswith('/redirect'):     # youtube puts these on external links to do tracking
-                query_string = url[url.find('?')+1: ]
-                url = urllib.parse.parse_qs(query_string)['q'][0]
-
-            text = yt_data_extract.get_plain_text(link_json['title'])
-
-            info['links'].append( (text, url) )
-
-
-        info['stats'] = []
-        for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'):
-            try:
-                stat = channel_metadata[stat_name]
-            except KeyError:
-                continue
-            info['stats'].append(yt_data_extract.get_plain_text(stat))
-
-        if 'description' in channel_metadata:
-            info['description'] = yt_data_extract.get_text(channel_metadata['description'])
-        else:
-            info['description'] = ''
-
-    else:
-        raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
-
-    return info
 
 def post_process_channel_info(info):
     info['avatar'] = util.prefix_url(info['avatar'])
@@ -303,7 +185,7 @@ def get_channel_page(channel_id, tab='videos'):
         flask.abort(404, 'Unknown channel tab: ' + tab)
 
 
-    info = extract_info(json.loads(polymer_json), tab)
+    info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
     post_process_channel_info(info)
     if tab in ('videos', 'search'):
         info['number_of_videos'] = number_of_videos
@@ -343,7 +225,7 @@ def get_channel_page_general_url(base_url, tab, request):
         flask.abort(404, 'Unknown channel tab: ' + tab)
 
 
-    info = extract_info(json.loads(polymer_json), tab)
+    info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
     post_process_channel_info(info)
     if tab in ('videos', 'search'):
         info['number_of_videos'] = 1000
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index 56bdf93..175622f 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -455,7 +455,7 @@ def _get_upstream_videos(channel_id):
         print('Failed to read atoma feed for ' + channel_status_name)
         traceback.print_exc()
 
-    videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
+    videos = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')['items']
     for i, video_item in enumerate(videos):
         if 'description' not in video_item:
             video_item['description'] = ''
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 663edc4..c666ede 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -3,6 +3,7 @@ from youtube import util
 import html
 import json
 import re
+import urllib
 
 # videos (all of type str):
 
@@ -279,3 +280,123 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
     return item
 
 
+def extract_channel_info(polymer_json, tab):
+    response = polymer_json[1]['response']
+    try:
+        microformat = response['microformat']['microformatDataRenderer']
+
+    # channel doesn't exist or was terminated
+    # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
+    except KeyError:
+        if 'alerts' in response and len(response['alerts']) > 0:
+            result = ''
+            for alert in response['alerts']:
+                result += alert['alertRenderer']['text']['simpleText'] + '\n'
+            flask.abort(200, result)
+        elif 'errors' in response['responseContext']:
+            for error in response['responseContext']['errors']['error']:
+                if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
+                    flask.abort(404, 'This channel does not exist')
+        raise
+
+
+    info = {}
+    info['current_tab'] = tab
+
+
+    # stuff from microformat (info given by youtube for every page on channel)
+    info['short_description'] = microformat['description']
+    info['channel_name'] = microformat['title']
+    info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url']
+    channel_url = microformat['urlCanonical'].rstrip('/')
+    channel_id = channel_url[channel_url.rfind('/')+1:]
+    info['channel_id'] = channel_id
+    info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id
+
+    info['items'] = []
+
+    # empty channel
+    if 'contents' not in response and 'continuationContents' not in response:
+        return info
+
+
+    # find the tab with content
+    # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg
+    # TODO: maybe use the 'selected' attribute for this?
+    if 'continuationContents' not in response:
+        tab_renderer = None
+        tab_content = None
+        for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']:
+            try:
+                tab_renderer = tab_json['tabRenderer']
+            except KeyError:
+                tab_renderer = tab_json['expandableTabRenderer']
+            try:
+                tab_content = tab_renderer['content']
+                break
+            except KeyError:
+                pass
+        else:   # didn't break
+            raise Exception("No tabs found with content")
+        assert tab == tab_renderer['title'].lower()
+
+
+    # extract tab-specific info
+    if tab in ('videos', 'playlists', 'search'):    # find the list of items
+        if 'continuationContents' in response:
+            try:
+                items = response['continuationContents']['gridContinuation']['items']
+            except KeyError:
+                items = response['continuationContents']['sectionListContinuation']['contents']     # for search
+        else:
+            contents = tab_content['sectionListRenderer']['contents']
+            if 'itemSectionRenderer' in contents[0]:
+                item_section = contents[0]['itemSectionRenderer']['contents'][0]
+                try:
+                    items = item_section['gridRenderer']['items']
+                except KeyError:
+                    if "messageRenderer" in item_section:
+                        items = []
+                    else:
+                        raise Exception('gridRenderer missing but messageRenderer not found')
+            else:
+                items = contents    # for search
+
+        additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
+        info['items'] = [renderer_info(renderer, additional_info) for renderer in items]
+
+    elif tab == 'about':
+        channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']
+
+
+        info['links'] = []
+        for link_json in channel_metadata.get('primaryLinks', ()):
+            url = link_json['navigationEndpoint']['urlEndpoint']['url']
+            if url.startswith('/redirect'):     # youtube puts these on external links to do tracking
+                query_string = url[url.find('?')+1: ]
+                url = urllib.parse.parse_qs(query_string)['q'][0]
+
+            text = get_plain_text(link_json['title'])
+
+            info['links'].append( (text, url) )
+
+
+        info['stats'] = []
+        for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'):
+            try:
+                stat = channel_metadata[stat_name]
+            except KeyError:
+                continue
+            info['stats'].append(get_plain_text(stat))
+
+        if 'description' in channel_metadata:
+            info['description'] = get_text(channel_metadata['description'])
+        else:
+            info['description'] = ''
+
+    else:
+        raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
+
+    return info
+
+
author	James Taylor <user234683@users.noreply.github.com>	2019-09-08 17:28:11 -0700
committer	James Taylor <user234683@users.noreply.github.com>	2019-09-08 17:28:11 -0700
commit	bd343ed71f628e0f1dd1eb3f45fb4e04887f223f (patch)
tree	72f9de0764b920b5d464290da68244f5ee8caa6d /youtube
parent	fb1a3531c59f5d9cee406295bbe006730695c249 (diff)
download	yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.tar.lz yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.tar.xz yt-local-bd343ed71f628e0f1dd1eb3f45fb4e04887f223f.zip