From 3dee7ea0d1156642d02f504b9229676b287ddf0a Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 7 Aug 2021 17:05:58 -0700 Subject: Switch to new comments api now that old one is being disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit watch_comment api periodically gives the error "Top level comments mweb servlet is turned down." The continuation items for the new api are in a different arrangement in the json, so changes were necessary to the extract_items function. Signed-off-by: Jesús --- youtube/comments.py | 39 +++++++++++++++--------------- youtube/util.py | 13 ++++++++++ youtube/yt_data_extract/common.py | 39 ++++++++++++++++++++++++------ youtube/yt_data_extract/everything_else.py | 6 ++--- 4 files changed, 66 insertions(+), 31 deletions(-) (limited to 'youtube') diff --git a/youtube/comments.py b/youtube/comments.py index b23c079..54baf61 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -47,25 +47,23 @@ def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''): return base64.urlsafe_b64encode(result).decode('ascii') -mobile_headers = { - 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.5', - 'X-YouTube-Client-Name': '2', - 'X-YouTube-Client-Version': '2.20180823', -} - - def request_comments(ctoken, replies=False): - base_url = 'https://m.youtube.com/watch_comment?' - if replies: - base_url += 'action_get_comment_replies=1&ctoken=' - else: - base_url += 'action_get_comments=1&ctoken=' - url = base_url + ctoken.replace("=", "%3D") + "&pbj=1" + url = 'https://m.youtube.com/youtubei/v1/next' + url += '?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + data = json.dumps({ + 'context': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'MWEB', + 'clientVersion': '2.20210804.02.00', + }, + }, + 'continuation': ctoken.replace('=', '%3D'), + }) content = util.fetch_url( - url, headers=mobile_headers, + url, headers=util.mobile_xhr_headers + util.json_header, data=data, report_text='Retrieved comments', debug_name='request_comments') content = content.decode('utf-8') @@ -178,10 +176,9 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): ('Direct link', this_sort_url) ] + ctoken = make_comment_ctoken(video_id, sort, offset, lc) comments_info.update(yt_data_extract.extract_comments_info( - request_comments( - make_comment_ctoken(video_id, sort, offset, lc, secret_key) - ) + request_comments(ctoken), ctoken=ctoken )) post_process_comments_info(comments_info) @@ -212,7 +209,9 @@ def get_comments_page(): ctoken = request.args.get('ctoken', '') replies = request.args.get('replies', '0') == '1' - comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies)) + comments_info = yt_data_extract.extract_comments_info( + request_comments(ctoken, replies), ctoken=ctoken + ) post_process_comments_info(comments_info) if not replies: diff --git a/youtube/util.py b/youtube/util.py index daec2df..462d371 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -387,6 +387,19 @@ mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) mobile_ua = (('User-Agent', mobile_user_agent),) desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' desktop_ua = (('User-Agent', desktop_user_agent),) +json_header = (('Content-Type', 'application/json'),) +desktop_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20180830'), +) + desktop_ua +mobile_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20180830'), +) + mobile_ua class RateLimitedQueue(gevent.queue.Queue): diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index e87808b..74ac1d6 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -478,6 +478,22 @@ def extract_items_from_renderer(renderer, item_types=_item_types): renderer = None + +def extract_items_from_renderer_list(renderers, item_types=_item_types): + '''Same as extract_items_from_renderer, but provide a list of renderers''' + items = [] + ctoken = None + for renderer in renderers: + new_items, new_ctoken = extract_items_from_renderer( + renderer, + item_types=item_types) + items += new_items + # prioritize ctoken associated with items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken + return items, ctoken + + def extract_items(response, item_types=_item_types, search_engagement_panels=False): '''return items, ctoken''' @@ -495,6 +511,15 @@ def extract_items(response, item_types=_item_types, item_types=item_types) if items: break + elif 'onResponseReceivedEndpoints' in response: + for endpoint in response.get('onResponseReceivedEndpoints', []): + items, ctoken = extract_items_from_renderer_list( + deep_get(endpoint, 'appendContinuationItemsAction', + 'continuationItems', default=[]), + item_types=item_types, + ) + if items: + break elif 'contents' in response: renderer = get(response, 'contents', {}) items, ctoken = extract_items_from_renderer( @@ -502,11 +527,11 @@ def extract_items(response, item_types=_item_types, item_types=item_types) if search_engagement_panels and 'engagementPanels' in response: - for engagement_renderer in response['engagementPanels']: - additional_items, cont = extract_items_from_renderer( - engagement_renderer, - item_types=item_types) - items += additional_items - if cont and not ctoken: - ctoken = cont + new_items, new_ctoken = extract_items_from_renderer_list( + response['engagementPanels'], item_types=item_types + ) + items += new_items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken + return items, ctoken diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 7275975..ba3cd77 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -222,15 +222,13 @@ def _ctoken_metadata(ctoken): result['sort'] = 0 return result -def extract_comments_info(polymer_json): +def extract_comments_info(polymer_json, ctoken=None): response, err = extract_response(polymer_json) if err: return {'error': err} info = {'error': None} - url = multi_deep_get(polymer_json, [1, 'url'], ['url']) - if url: - ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] + if ctoken: metadata = _ctoken_metadata(ctoken) else: metadata = {} -- cgit v1.2.3