aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jesus E <heckyel@riseup.net>, 2023-06-17 16:14:02 -0400
committer: Jesus E <heckyel@riseup.net>, 2023-06-17 16:14:02 -0400
commit: d7f934b7b21b40caf00efdcc8b47ad16c341dd44 (patch)
tree: 3279163bc142a1ecdc4c1d5a6bad1165a3bad831
parent: a4299dc91766496637065c6bd0c08dcd41a6cec1 (diff)
download: yt-local-d7f934b7b21b40caf00efdcc8b47ad16c341dd44.tar.lz
yt-local-d7f934b7b21b40caf00efdcc8b47ad16c341dd44.tar.xz
yt-local-d7f934b7b21b40caf00efdcc8b47ad16c341dd44.zip
Merge short and video parsing even further
Use multi_get and multi_deep_get for tag differences. Replace the duration check with conservative_update.
-rw-r--r-- youtube/yt_data_extract/common.py 53
1 files changed, 24 insertions, 29 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 032c1d0..81fe0a5 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -298,10 +298,11 @@ def extract_item_info(item, additional_info={}):
info['time_published'] = timestamp.group(1)
if primary_type == 'video':
- info['id'] = item.get('videoId')
- if not info['id']:
- info['id'] = deep_get(item,'navigationEndpoint', 'watchEndpoint',
- 'videoId')
+ info['id'] = multi_deep_get(item,
+ ['videoId'],
+ ['navigationEndpoint', 'watchEndpoint', 'videoId'],
+ ['navigationEndpoint', 'reelWatchEndpoint', 'videoId'], # shorts
+ )
info['view_count'] = extract_int(item.get('viewCountText'))
# dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
@@ -319,40 +320,34 @@ def extract_item_info(item, additional_info={}):
if info['view_count']:
info['approx_view_count'] = '{:,}'.format(info['view_count'])
else:
- info['approx_view_count'] = extract_approx_int(item.get('shortViewCountText'))
+ info['approx_view_count'] = extract_approx_int(multi_get(item,
+ 'shortViewCountText',
+ 'viewCountText') # shorts
+ )
# handle case where it is "No views"
if not info['approx_view_count']:
if ('No views' in item.get('shortViewCountText', '')
- or 'no views' in accessibility_label.lower()):
+ or 'no views' in accessibility_label.lower()
+ or 'No views' in extract_str(item.get('viewCountText', '')) # shorts
+ ):
info['view_count'] = 0
info['approx_view_count'] = '0'
info['duration'] = extract_str(item.get('lengthText'))
- if info['duration'] is None: # shorts
- if not info['id']:
- info['id'] = deep_get(item,'navigationEndpoint',
- 'reelWatchEndpoint', 'videoId')
- info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
-
- # handle case where it is "No views"
- if not info['approx_view_count']:
- if ('No views' in extract_str(item.get('viewCountText', ''))):
- info['view_count'] = 0
- info['approx_view_count'] = '0'
-
- # dig into accessibility data to get duration for shorts
- accessibility_label = multi_deep_get(item,
- ['accessibility', 'accessibilityData', 'label'],
- default='')
- duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
- accessibility_label)
- if duration:
- if duration.group(2) == 'minute':
- info['duration'] = '1:00'
- else:
- info['duration'] = '0:' + duration.group(1).zfill(2)
+ # dig into accessibility data to get duration for shorts
+ accessibility_label = deep_get(item,
+ 'accessibility', 'accessibilityData', 'label',
+ default='')
+ duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
+ accessibility_label)
+ if duration:
+ if duration.group(2) == 'minute':
+ conservative_update(info, 'duration', '1:00')
+ else:
+ conservative_update(info,
+ 'duration', '0:' + duration.group(1).zfill(2))
# if it's an item in a playlist, get its index
if 'index' in item: # url has wrong index on playlist page