diff options
-rw-r--r-- | README.md | 27 | ||||
-rw-r--r-- | test/test_utils.py | 54 | ||||
-rw-r--r-- | yt_dlp/options.py | 15 | ||||
-rw-r--r-- | yt_dlp/utils.py | 27 |
4 files changed, 81 insertions, 42 deletions
@@ -340,19 +340,22 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t COUNT views --match-filter FILTER Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a - number or a quoted string using the - operators defined in "Filtering formats". - You can also simply specify a field to - match if the field is present and "!field" - to check if the field is not present. - Multiple filters can be checked using "&". - For example, to only match videos that are - not live, has a like count more than 100, a - dislike count less than 50 (or the dislike + number or a string using the operators + defined in "Filtering formats". You can + also simply specify a field to match if the + field is present and "!field" to check if + the field is not present. In addition, + Python style regular expression matching + can be done using "~=", and multiple + filters can be checked with "&". Use a "\" + to escape "&" or quotes if needed. Eg: + --match-filter "!is_live & like_count>?100 + & description~=\'(?i)\bcats \& dogs\b\'" + matches only videos that are not live, has + a like count more than 100 (or the like field is not available), and also has a - description that contains "python", use - --match-filter "!is_live & like_count>100 & - dislike_count<?50 & description*='python'" + description that contains the phrase "cats + & dogs" (ignoring case) --no-match-filter Do not use generic video filter (default) --no-playlist Download only the video, if the URL refers to a video and a playlist diff --git a/test/test_utils.py b/test/test_utils.py index 5ac5dedc9..aef59e491 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1207,11 +1207,26 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') '9999 51') def test_match_str(self): + # Unary self.assertFalse(match_str('xy', {'x': 1200})) self.assertTrue(match_str('!xy', {'x': 1200})) self.assertTrue(match_str('x', {'x': 1200})) self.assertFalse(match_str('!x', {'x': 1200})) self.assertTrue(match_str('x', {'x': 0})) + self.assertTrue(match_str('is_live', {'is_live': True})) + self.assertFalse(match_str('is_live', {'is_live': False})) + self.assertFalse(match_str('is_live', {'is_live': None})) + self.assertFalse(match_str('is_live', {})) + self.assertFalse(match_str('!is_live', {'is_live': True})) + self.assertTrue(match_str('!is_live', {'is_live': False})) + self.assertTrue(match_str('!is_live', {'is_live': None})) + self.assertTrue(match_str('!is_live', {})) + self.assertTrue(match_str('title', {'title': 'abc'})) + self.assertTrue(match_str('title', {'title': ''})) + self.assertFalse(match_str('!title', {'title': 'abc'})) + self.assertFalse(match_str('!title', {'title': ''})) + + # Numeric self.assertFalse(match_str('x>0', {'x': 0})) self.assertFalse(match_str('x>0', {})) self.assertTrue(match_str('x>?0', {})) @@ -1219,6 +1234,8 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str('x>2K', {'x': 1200})) self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) + + # String self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) @@ -1234,6 +1251,8 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'})) self.assertTrue(match_str('y$=42', {'y': 'foobar42'})) self.assertFalse(match_str('y$=43', {'y': 'foobar42'})) + + # And self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 50 & description', {'like_count': 90, 'description': 'foo'})) @@ -1246,18 +1265,29 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 50 & description', {'like_count': 190, 'dislike_count': 10})) - self.assertTrue(match_str('is_live', {'is_live': True})) - self.assertFalse(match_str('is_live', {'is_live': False})) - self.assertFalse(match_str('is_live', {'is_live': None})) - self.assertFalse(match_str('is_live', {})) - self.assertFalse(match_str('!is_live', {'is_live': True})) - self.assertTrue(match_str('!is_live', {'is_live': False})) - self.assertTrue(match_str('!is_live', {'is_live': None})) - self.assertTrue(match_str('!is_live', {})) - self.assertTrue(match_str('title', {'title': 'abc'})) - self.assertTrue(match_str('title', {'title': ''})) - self.assertFalse(match_str('!title', {'title': 'abc'})) - self.assertFalse(match_str('!title', {'title': ''})) + + # Regex + self.assertTrue(match_str(r'x~=\bbar', {'x': 'foo bar'})) + self.assertFalse(match_str(r'x~=\bbar.+', {'x': 'foo bar'})) + self.assertFalse(match_str(r'x~=^FOO', {'x': 'foo bar'})) + self.assertTrue(match_str(r'x~=(?i)^FOO', {'x': 'foo bar'})) + + # Quotes + self.assertTrue(match_str(r'x^="foo"', {'x': 'foo "bar"'})) + self.assertFalse(match_str(r'x^="foo "', {'x': 'foo "bar"'})) + self.assertFalse(match_str(r'x$="bar"', {'x': 'foo "bar"'})) + self.assertTrue(match_str(r'x$=" \"bar\""', {'x': 'foo "bar"'})) + + # Escaping & + self.assertFalse(match_str(r'x=foo & bar', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x=foo \& bar', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x=foo \& bar & x^=foo', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x="foo \& bar" & x^=foo', {'x': 'foo & bar'})) + + # Example from docs + self.assertTrue( + r'!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'', + {'description': 'Raining Cats & Dogs'}) def test_parse_dfxp_time_expr(self): self.assertEqual(parse_dfxp_time_expr(None), None) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index fba231382..b5ddbeaff 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -378,13 +378,14 @@ def parseOpts(overrideArguments=None): 'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a ' 'number or a string using the operators defined in "Filtering formats". ' 'You can also simply specify a field to match if the field is present ' - 'and "!field" to check if the field is not present. ' - 'Multiple filters can be checked using "&". ' - 'For example, to only match videos that are not live, ' - 'has a like count more than 100, a dislike count less than 50 ' - '(or the dislike field is not available), and also has a description ' - 'that contains "python", use --match-filter "!is_live & ' - 'like_count>100 & dislike_count<?50 & description*=\'python\'"')) + 'and "!field" to check if the field is not present. In addition, ' + 'Python style regular expression matching can be done using "~=", ' + 'and multiple filters can be checked with "&". ' + 'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter ' + r'"!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'" ' + 'matches only videos that are not live, has a like count more than 100 ' + '(or the like field is not available), and also has a description ' + 'that contains the phrase "cats & dogs" (ignoring case)')) selection.add_option( '--no-match-filter', metavar='FILTER', dest='match_filter', action='store_const', const=None, diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d06b18e00..b04fbd22c 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -4664,23 +4664,28 @@ def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): def _match_one(filter_part, dct): # TODO: Generalize code with YoutubeDL._build_format_filter + STRING_OPERATORS = { + '*=': operator.contains, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '~=': lambda attr, value: re.search(value, attr), + } COMPARISON_OPERATORS = { + **STRING_OPERATORS, + '<=': operator.le, # "<=" must be defined above "<" '<': operator.lt, - '<=': operator.le, - '>': operator.gt, '>=': operator.ge, + '>': operator.gt, '=': operator.eq, - '*=': operator.contains, - '^=': lambda attr, value: attr.startswith(value), - '$=': lambda attr, value: attr.endswith(value), } + operator_rex = re.compile(r'''(?x)\s* (?P<key>[a-z_]+) \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?: (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| - (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| - (?P<strval>(?![0-9.])[a-z0-9A-Z]*) + (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)| + (?P<strval>.+?) ) \s*$ ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) @@ -4705,9 +4710,8 @@ def _match_one(filter_part, dct): if quote is not None: comparison_value = comparison_value.replace(r'\%s' % quote, quote) else: - if m.group('op') in ('*=', '^=', '$='): - raise ValueError( - 'Operator %s only supports string values!' % m.group('op')) + if m.group('op') in STRING_OPERATORS: + raise ValueError('Operator %s only supports string values!' % m.group('op')) try: comparison_value = int(m.group('intval')) except ValueError: @@ -4743,7 +4747,8 @@ def match_str(filter_str, dct): """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ return all( - _match_one(filter_part, dct) for filter_part in filter_str.split('&')) + _match_one(filter_part.replace(r'\&', '&'), dct) + for filter_part in re.split(r'(?<!\\)&', filter_str)) def match_filter_func(filter_str): |