From 6f32a0b5b70fe0f8b14c2946b40840b795044662 Mon Sep 17 00:00:00 2001
From: Zenon Mousmoulas
Date: Wed, 5 Jan 2022 20:37:49 +0200
Subject: [utils] Improve parsing for nested HTML elements (#2129)
and add functions to return the HTML of elements
Authored by: zmousm
---
test/test_utils.py | 107 +++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 92 insertions(+), 15 deletions(-)
(limited to 'test/test_utils.py')
diff --git a/test/test_utils.py b/test/test_utils.py
index 1a9f71947..c3ec798dc 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -44,6 +44,12 @@ from yt_dlp.utils import (
get_element_by_attribute,
get_elements_by_class,
get_elements_by_attribute,
+ get_element_html_by_class,
+ get_element_html_by_attribute,
+ get_elements_html_by_class,
+ get_elements_html_by_attribute,
+ get_elements_text_and_html_by_attribute,
+ get_element_text_and_html_by_tag,
InAdvancePagedList,
int_or_none,
intlist_to_bytes,
@@ -118,6 +124,7 @@ from yt_dlp.compat import (
compat_chr,
compat_etree_fromstring,
compat_getenv,
+ compat_HTMLParseError,
compat_os_name,
compat_setenv,
)
@@ -1575,46 +1582,116 @@ Line 1
self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646)
+ GET_ELEMENT_BY_CLASS_TEST_STRING = '''
+ nice
+ '''
+
def test_get_element_by_class(self):
- html = '''
- nice
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
+ def test_get_element_html_by_class(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+ self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+ GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+ foo
+ '''
+
def test_get_element_by_attribute(self):
- html = '''
- nice
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
- html = '''
- foo
- '''
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+ def test_get_element_html_by_attribute(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+ self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
+ self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
+
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+
+ GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
+ nicealso nice
+ '''
+ GET_ELEMENTS_BY_CLASS_RES = ['nice', 'also nice']
+
def test_get_elements_by_class(self):
- html = '''
- nicealso nice
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), [])
+ def test_get_elements_html_by_class(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
+
def test_get_elements_by_attribute(self):
- html = '''
- nicealso nice
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+ def test_get_elements_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+
+ def test_get_elements_text_and_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(
+ get_elements_text_and_html_by_attribute('class', 'foo bar', html),
+ list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
+ self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
+
+ GET_ELEMENT_BY_TAG_TEST_STRING = '''
+ random text lorem ipsum
+
+ this should be returned
+
this should also be returned
+
+ this should also be returned
+
+ closing tag above should not trick, so this should also be returned
+
+ but this text should not be returned
+ '''
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+ def test_get_element_text_and_html_by_tag(self):
+ html = self.GET_ELEMENT_BY_TAG_TEST_STRING
+
+ self.assertEqual(
+ get_element_text_and_html_by_tag('div', html),
+ (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('span', html),
+ (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
+ self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
--
cgit v1.2.3