[utils] Improve `get_elements_text_and_html_by_attribute` regex (#2280)

Authored by: zmousm, pukkandan
author: Zenon Mousmoulas <zmousm@users.noreply.github.com> 2022-01-09 20:14:56 +0200
committer: GitHub <noreply@github.com> 2022-01-09 23:44:56 +0530
commit: 0254f1627487c137abd201dea230247de6cb7f87 (patch)
tree: 7bc4fa474d192290a657944d377d067abbfe933b /yt_dlp/utils.py
parent: a70b71e85ad61c5f7c85ef920d72f949fb767d53 (diff)
download: hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.lz
hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.xz
hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.zip
1 file changed, 12 insertions, 13 deletions
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 826ab5d29..9a66de9f5 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
     attribute in the passed HTML document
     """
 
+    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
+
     value = re.escape(value) if escape_value else value
 
-    retlist = []
-    for m in re.finditer(r'''(?xs)
+    partial_element_re = r'''(?x)
         <(?P<tag>[a-zA-Z0-9:._-]+)
-         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
-         \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
-         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
-        \s*>
-    ''' % {'attribute': re.escape(attribute), 'value': value}, html):
-        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
+         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
+        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 
-        retlist.append((
-            unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
-            whole,
-        ))
+    for m in re.finditer(partial_element_re, html):
+        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 
-    return retlist
+        yield (
+            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
+            whole
+        )
 
 
 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
author	Zenon Mousmoulas <zmousm@users.noreply.github.com>	2022-01-09 20:14:56 +0200
committer	GitHub <noreply@github.com>	2022-01-09 23:44:56 +0530
commit	0254f1627487c137abd201dea230247de6cb7f87 (patch)
tree	7bc4fa474d192290a657944d377d067abbfe933b /yt_dlp/utils.py
parent	a70b71e85ad61c5f7c85ef920d72f949fb767d53 (diff)
download	hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.lz hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.tar.xz hypervideo-pre-0254f1627487c137abd201dea230247de6cb7f87.zip