about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Zenon Mousmoulas <zmousm@users.noreply.github.com> 2022-01-09 20:14:56 +0200
committer: GitHub <noreply@github.com> 2022-01-09 23:44:56 +0530
commit: 0254f1627487c137abd201dea230247de6cb7f87 (patch)
tree: 7bc4fa474d192290a657944d377d067abbfe933b
parent: a70b71e85ad61c5f7c85ef920d72f949fb767d53 (diff)
download: hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.lz
hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.xz
hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.zip
[utils] Improve `get_elements_text_and_html_by_attribute` regex (#2280)
Authored by: zmousm, pukkandan
-rw-r--r-- test/test_utils.py 6
-rw-r--r-- yt_dlp/utils.py 25
2 files changed, 15 insertions, 16 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index c3ec798dc..2c8f2c03e 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1659,10 +1659,10 @@ Line 1
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(
- get_elements_text_and_html_by_attribute('class', 'foo bar', html),
+ list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
- self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
- self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
GET_ELEMENT_BY_TAG_TEST_STRING = '''
random text lorem ipsum</p>
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 826ab5d29..9a66de9f5 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
attribute in the passed HTML document
"""
+ value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
+
value = re.escape(value) if escape_value else value
- retlist = []
- for m in re.finditer(r'''(?xs)
+ partial_element_re = r'''(?x)
<(?P<tag>[a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
- \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
- (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
- \s*>
- ''' % {'attribute': re.escape(attribute), 'value': value}, html):
- content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
+ (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+ \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
+ ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
- retlist.append((
- unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
- whole,
- ))
+ for m in re.finditer(partial_element_re, html):
+ content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
- return retlist
+ yield (
+ unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
+ whole
+ )
class HTMLBreakOnClosingTagParser(compat_HTMLParser):