about summary refs log tree commit diff stats
path: root/youtube
diff options
context:
space:
mode:
Diffstat (limited to 'youtube')
-rw-r--r-- youtube/comments.py 99
-rw-r--r-- youtube/util.py 9
-rw-r--r-- youtube/yt_data_extract.py 113
3 files changed, 114 insertions, 107 deletions
diff --git a/youtube/comments.py b/youtube/comments.py
index 3b1ef86..250a95f 100644
--- a/youtube/comments.py
+++ b/youtube/comments.py
@@ -48,24 +48,6 @@ def comment_replies_ctoken(video_id, comment_id, max_results=500):
result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params)
return base64.urlsafe_b64encode(result).decode('ascii')
-def ctoken_metadata(ctoken):
- result = dict()
- params = proto.parse(proto.b64_to_bytes(ctoken))
- result['video_id'] = proto.parse(params[2])[2].decode('ascii')
-
- offset_information = proto.parse(params[6])
- result['offset'] = offset_information.get(5, 0)
-
- result['is_replies'] = False
- if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
- result['is_replies'] = True
- result['sort'] = None
- else:
- try:
- result['sort'] = proto.parse(offset_information[4])[6]
- except KeyError:
- result['sort'] = 0
- return result
mobile_headers = {
@@ -91,7 +73,9 @@ def request_comments(ctoken, replies=False):
print("got <!DOCTYPE>, retrying")
continue
break
- return content
+
+ polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8')))
+ return polymer_json
def single_comment_ctoken(video_id, comment_id):
@@ -102,77 +86,6 @@ def single_comment_ctoken(video_id, comment_id):
-def parse_comments_polymer(content):
- try:
- video_title = ''
- content = json.loads(util.uppercase_escape(content.decode('utf-8')))
- url = content[1]['url']
- ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
- metadata = ctoken_metadata(ctoken)
-
- try:
- comments_raw = content[1]['response']['continuationContents']['commentSectionContinuation']['items']
- except KeyError:
- comments_raw = content[1]['response']['continuationContents']['commentRepliesContinuation']['contents']
-
- ctoken = util.default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')
-
- comments = []
- for comment_json in comments_raw:
- number_of_replies = 0
- try:
- comment_thread = comment_json['commentThreadRenderer']
- except KeyError:
- comment_renderer = comment_json['commentRenderer']
- else:
- if 'commentTargetTitle' in comment_thread:
- video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
-
- if 'replies' in comment_thread:
- view_replies_text = yt_data_extract.get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText'])
- view_replies_text = view_replies_text.replace(',', '')
- match = re.search(r'(\d+)', view_replies_text)
- if match is None:
- number_of_replies = 1
- else:
- number_of_replies = int(match.group(1))
- comment_renderer = comment_thread['comment']['commentRenderer']
-
- comment = {
- 'author_id': comment_renderer.get('authorId', ''),
- 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
- 'likes': comment_renderer['likeCount'],
- 'published': yt_data_extract.get_plain_text(comment_renderer['publishedTimeText']),
- 'text': comment_renderer['contentText'].get('runs', ''),
- 'number_of_replies': number_of_replies,
- 'comment_id': comment_renderer['commentId'],
- }
-
- if 'authorText' in comment_renderer: # deleted channels have no name or channel link
- comment['author'] = yt_data_extract.get_plain_text(comment_renderer['authorText'])
- comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
- comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
- else:
- comment['author'] = ''
- comment['author_url'] = ''
- comment['author_channel_id'] = ''
-
- comments.append(comment)
- except Exception as e:
- print('Error parsing comments: ' + str(e))
- comments = ()
- ctoken = ''
-
- return {
- 'ctoken': ctoken,
- 'comments': comments,
- 'video_title': video_title,
- 'video_id': metadata['video_id'],
- 'offset': metadata['offset'],
- 'is_replies': metadata['is_replies'],
- 'sort': metadata['sort'],
- }
-
def post_process_comments_info(comments_info):
for comment in comments_info['comments']:
comment['author_url'] = util.URL_ORIGIN + comment['author_url']
@@ -207,7 +120,7 @@ def post_process_comments_info(comments_info):
comment['likes_text'] = str(comment['likes']) + ' likes'
comments_info['include_avatars'] = settings.enable_comment_avatars
- if comments_info['ctoken'] != '':
+ if comments_info['ctoken']:
comments_info['more_comments_url'] = util.URL_ORIGIN + '/comments?ctoken=' + comments_info['ctoken']
comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1)
@@ -222,7 +135,7 @@ def post_process_comments_info(comments_info):
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
if settings.comments_mode:
- comments_info = parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
+ comments_info = yt_data_extract.parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
post_process_comments_info(comments_info)
post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
@@ -247,7 +160,7 @@ def get_comments_page():
ctoken = comment_replies_ctoken(video_id, parent_id)
replies = True
- comments_info = parse_comments_polymer(request_comments(ctoken, replies))
+ comments_info = yt_data_extract.parse_comments_polymer(request_comments(ctoken, replies))
post_process_comments_info(comments_info)
if not replies:
diff --git a/youtube/util.py b/youtube/util.py
index a81ae83..5b63e2a 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -277,15 +277,6 @@ def video_id(url):
url_parts = urllib.parse.urlparse(url)
return urllib.parse.parse_qs(url_parts.query)['v'][0]
-def default_multi_get(object, *keys, default):
- ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
- try:
- for key in keys:
- object = object[key]
- return object
- except (IndexError, KeyError):
- return default
-
# default, sddefault, mqdefault, hqdefault, hq720
def get_thumbnail_url(video_id):
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 440cc0d..551b663 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -1,4 +1,4 @@
-from youtube import util
+from youtube import util, proto
import html
import json
@@ -59,10 +59,14 @@ def format_text_runs(runs):
return result
-
-
-
-
+def default_multi_get(object, *keys, default):
+ ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
+ try:
+ for key in keys:
+ object = object[key]
+ return object
+ except (IndexError, KeyError):
+ return default
def get_url(node):
@@ -501,3 +505,102 @@ def extract_playlist_info(polymer_json):
return info
+def ctoken_metadata(ctoken):
+ result = dict()
+ params = proto.parse(proto.b64_to_bytes(ctoken))
+ result['video_id'] = proto.parse(params[2])[2].decode('ascii')
+
+ offset_information = proto.parse(params[6])
+ result['offset'] = offset_information.get(5, 0)
+
+ result['is_replies'] = False
+ if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
+ result['is_replies'] = True
+ result['sort'] = None
+ else:
+ try:
+ result['sort'] = proto.parse(offset_information[4])[6]
+ except KeyError:
+ result['sort'] = 0
+ return result
+
+def parse_comments_polymer(polymer_json):
+ try:
+ video_title = ''
+ response, err = get_response(polymer_json)
+ if err:
+ raise Exception(err)
+
+ try:
+ url = polymer_json[1]['url']
+ except (TypeError, IndexError, KeyError):
+ url = polymer_json['url']
+
+ ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
+ metadata = ctoken_metadata(ctoken)
+
+ try:
+ comments_raw = response['continuationContents']['commentSectionContinuation']['items']
+ except KeyError:
+ comments_raw = response['continuationContents']['commentRepliesContinuation']['contents']
+
+ ctoken = default_multi_get(response, 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')
+
+ comments = []
+ for comment_json in comments_raw:
+ number_of_replies = 0
+ try:
+ comment_thread = comment_json['commentThreadRenderer']
+ except KeyError:
+ comment_renderer = comment_json['commentRenderer']
+ else:
+ if 'commentTargetTitle' in comment_thread:
+ video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
+
+ if 'replies' in comment_thread:
+ view_replies_text = get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText'])
+ view_replies_text = view_replies_text.replace(',', '')
+ match = re.search(r'(\d+)', view_replies_text)
+ if match is None:
+ number_of_replies = 1
+ else:
+ number_of_replies = int(match.group(1))
+ comment_renderer = comment_thread['comment']['commentRenderer']
+
+ comment = {
+ 'author_id': comment_renderer.get('authorId', ''),
+ 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
+ 'likes': comment_renderer['likeCount'],
+ 'published': get_plain_text(comment_renderer['publishedTimeText']),
+ 'text': comment_renderer['contentText'].get('runs', ''),
+ 'number_of_replies': number_of_replies,
+ 'comment_id': comment_renderer['commentId'],
+ }
+
+ if 'authorText' in comment_renderer: # deleted channels have no name or channel link
+ comment['author'] = get_plain_text(comment_renderer['authorText'])
+ comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
+ comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
+ else:
+ comment['author'] = ''
+ comment['author_url'] = ''
+ comment['author_channel_id'] = ''
+
+ comments.append(comment)
+ except Exception as e:
+ print('Error parsing comments: ' + str(e))
+ comments = ()
+ ctoken = ''
+
+ return {
+ 'ctoken': ctoken,
+ 'comments': comments,
+ 'video_title': video_title,
+ 'video_id': metadata['video_id'],
+ 'offset': metadata['offset'],
+ 'is_replies': metadata['is_replies'],
+ 'sort': metadata['sort'],
+ }
+
+
+