From 3200d66d880d72ba2c4e687840d31c9c98c66f6a Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 24 Dec 2019 13:07:12 -0800 Subject: Fix extract_approx_int not working for non-approx ints, make extract_int more robust For example, "354 subscribers" wasn't being extracted correctly by extract_approx_int. Make extract_approx_int and extract_int only extract integers that stand alone as whole words. So e.g. 342 will not be extracted from internetuser342 --- youtube/yt_data_extract/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 06f0e95..4af76c2 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -135,7 +135,7 @@ def extract_int(string, default=None): string = extract_str(string) if not string: return default - match = re.search(r'(\d+)', string.replace(',', '')) + match = re.search(r'\b(\d+)\b', string.replace(',', '')) if match is None: return default try: @@ -149,7 +149,7 @@ def extract_approx_int(string): string = extract_str(string) if not string: return None - match = re.search(r'(\d+(?:\.\d+)?[KMBTkmbt])', string.replace(',', '')) + match = re.search(r'\b(\d+(?:\.\d+)?[KMBTkmbt]?)\b', string.replace(',', '')) if match is None: return None return match.group(1) -- cgit v1.2.3