Merge short and video parsing

author: Jesus E <heckyel@riseup.net> 2023-06-17 16:10:59 -0400
committer: Jesus E <heckyel@riseup.net> 2023-06-17 16:10:59 -0400
commit: a4299dc91766496637065c6bd0c08dcd41a6cec1 (patch)
tree: f58f41e3b46a9396f1920ade93623b3728f1113b
parent: e6fd9b40f4c5a789dfa542a3c441eda89695ff7a (diff)
download: yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.lz
yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.xz
yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.zip
1 files changed, 25 insertions, 43 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index f04ff64..032c1d0 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -251,7 +251,7 @@ def extract_item_info(item, additional_info={}):
         info['type'] = 'video'
     elif type_parts[0] == 'reel': # shorts
         info['type'] = 'video'
-        primary_type = 'short'
+        primary_type = 'video'
     elif primary_type in ('playlist', 'radio', 'show'):
         info['type'] = 'playlist'
         info['playlist_type'] = primary_type
@@ -323,54 +323,36 @@ def extract_item_info(item, additional_info={}):
 
         # handle case where it is "No views"
         if not info['approx_view_count']:
-            if ('No views' in extract_str(item.get('viewCountText', ''))):
-                info['view_count'] = 0
-                info['approx_view_count'] = '0'
-
-        info['duration'] = extract_str(item.get('lengthText'))
-
-        # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page
-            info['index'] = extract_int(item.get('index'))
-        elif 'indexText' in item:
-            # Current item in playlist has ▶ instead of the actual index, must
-            # dig into url
-            match = re.search(r'index=(\d+)', deep_get(item,
-                'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
-                'url', default=''))
-            if match is None:   # worth a try then
-                info['index'] = extract_int(item.get('indexText'))
-            else:
-                info['index'] = int(match.group(1))
-        else:
-            info['index'] = None
-
-    elif primary_type == 'short':
-        info['id'] = item.get('videoId')
-        if not info['id']:
-            info['id'] = deep_get(item,'navigationEndpoint',
-                                  'reelWatchEndpoint', 'videoId')
-        info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
-
-        # handle case where it is "No views"
-        if not info['approx_view_count']:
             if ('No views' in item.get('shortViewCountText', '')
                     or 'no views' in accessibility_label.lower()):
                 info['view_count'] = 0
                 info['approx_view_count'] = '0'
 
-        # dig into accessibility data to get duration for shorts
-        accessibility_label = multi_deep_get(item,
-            ['accessibility', 'accessibilityData', 'label'],
-            default='')
+        info['duration'] = extract_str(item.get('lengthText'))
 
-        duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
-                             accessibility_label)
-        if duration:
-            if duration.group(2) == 'minute':
-                info['duration'] = '1:00'
-            else:
-                info['duration'] = '0:' + duration.group(1).zfill(2)
+        if info['duration'] is None: # shorts
+            if not info['id']:
+                info['id'] = deep_get(item,'navigationEndpoint',
+                                    'reelWatchEndpoint', 'videoId')
+            info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
+
+            # handle case where it is "No views"
+            if not info['approx_view_count']:
+                if ('No views' in extract_str(item.get('viewCountText', ''))):
+                    info['view_count'] = 0
+                    info['approx_view_count'] = '0'
+
+            # dig into accessibility data to get duration for shorts
+            accessibility_label = multi_deep_get(item,
+                ['accessibility', 'accessibilityData', 'label'],
+                default='')
+            duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
+                                accessibility_label)
+            if duration:
+                if duration.group(2) == 'minute':
+                    info['duration'] = '1:00'
+                else:
+                    info['duration'] = '0:' + duration.group(1).zfill(2)
 
         # if it's an item in a playlist, get its index
         if 'index' in item: # url has wrong index on playlist page
author	Jesus E <heckyel@riseup.net>	2023-06-17 16:10:59 -0400
committer	Jesus E <heckyel@riseup.net>	2023-06-17 16:10:59 -0400
commit	a4299dc91766496637065c6bd0c08dcd41a6cec1 (patch)
tree	f58f41e3b46a9396f1920ade93623b3728f1113b
parent	e6fd9b40f4c5a789dfa542a3c441eda89695ff7a (diff)
download	yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.lz yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.tar.xz yt-local-a4299dc91766496637065c6bd0c08dcd41a6cec1.zip