about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jesus E <heckyel@riseup.net> 2023-06-17 16:10:59 -0400
committer Jesus E <heckyel@riseup.net> 2023-06-17 16:10:59 -0400
commit a4299dc91766496637065c6bd0c08dcd41a6cec1 (patch)
tree f58f41e3b46a9396f1920ade93623b3728f1113b
parent e6fd9b40f4c5a789dfa542a3c441eda89695ff7a (diff)
download yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.lz
yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.xz
yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.zip
Merge short and video parsing
-rw-r--r-- youtube/yt_data_extract/common.py 68
1 files changed, 25 insertions, 43 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index f04ff64..032c1d0 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -251,7 +251,7 @@ def extract_item_info(item, additional_info={}):
info['type'] = 'video'
elif type_parts[0] == 'reel': # shorts
info['type'] = 'video'
- primary_type = 'short'
+ primary_type = 'video'
elif primary_type in ('playlist', 'radio', 'show'):
info['type'] = 'playlist'
info['playlist_type'] = primary_type
@@ -323,54 +323,36 @@ def extract_item_info(item, additional_info={}):
# handle case where it is "No views"
if not info['approx_view_count']:
- if ('No views' in extract_str(item.get('viewCountText', ''))):
- info['view_count'] = 0
- info['approx_view_count'] = '0'
-
- info['duration'] = extract_str(item.get('lengthText'))
-
- # if it's an item in a playlist, get its index
- if 'index' in item: # url has wrong index on playlist page
- info['index'] = extract_int(item.get('index'))
- elif 'indexText' in item:
- # Current item in playlist has ▶ instead of the actual index, must
- # dig into url
- match = re.search(r'index=(\d+)', deep_get(item,
- 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
- 'url', default=''))
- if match is None: # worth a try then
- info['index'] = extract_int(item.get('indexText'))
- else:
- info['index'] = int(match.group(1))
- else:
- info['index'] = None
-
- elif primary_type == 'short':
- info['id'] = item.get('videoId')
- if not info['id']:
- info['id'] = deep_get(item,'navigationEndpoint',
- 'reelWatchEndpoint', 'videoId')
- info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
-
- # handle case where it is "No views"
- if not info['approx_view_count']:
if ('No views' in item.get('shortViewCountText', '')
or 'no views' in accessibility_label.lower()):
info['view_count'] = 0
info['approx_view_count'] = '0'
- # dig into accessibility data to get duration for shorts
- accessibility_label = multi_deep_get(item,
- ['accessibility', 'accessibilityData', 'label'],
- default='')
+ info['duration'] = extract_str(item.get('lengthText'))
- duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
- accessibility_label)
- if duration:
- if duration.group(2) == 'minute':
- info['duration'] = '1:00'
- else:
- info['duration'] = '0:' + duration.group(1).zfill(2)
+ if info['duration'] is None: # shorts
+ if not info['id']:
+ info['id'] = deep_get(item,'navigationEndpoint',
+ 'reelWatchEndpoint', 'videoId')
+ info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
+
+ # handle case where it is "No views"
+ if not info['approx_view_count']:
+ if ('No views' in extract_str(item.get('viewCountText', ''))):
+ info['view_count'] = 0
+ info['approx_view_count'] = '0'
+
+ # dig into accessibility data to get duration for shorts
+ accessibility_label = multi_deep_get(item,
+ ['accessibility', 'accessibilityData', 'label'],
+ default='')
+ duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
+ accessibility_label)
+ if duration:
+ if duration.group(2) == 'minute':
+ info['duration'] = '1:00'
+ else:
+ info['duration'] = '0:' + duration.group(1).zfill(2)
# if it's an item in a playlist, get its index
if 'index' in item: # url has wrong index on playlist page