diff options
Diffstat (limited to 'hypervideo_dl/extractor/motherless.py')
-rw-r--r-- | hypervideo_dl/extractor/motherless.py | 31 |
1 files changed, 15 insertions, 16 deletions
diff --git a/hypervideo_dl/extractor/motherless.py b/hypervideo_dl/extractor/motherless.py index 111c7c5..c24ef9b 100644 --- a/hypervideo_dl/extractor/motherless.py +++ b/hypervideo_dl/extractor/motherless.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import datetime import re @@ -71,7 +69,7 @@ class MotherlessIE(InfoExtractor): 'title': 'a/ Hot Teens', 'categories': list, 'upload_date': '20210104', - 'uploader_id': 'yonbiw', + 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, @@ -125,11 +123,12 @@ class MotherlessIE(InfoExtractor): kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - comment_count = webpage.count('class="media-comment-contents"') + comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( - (r'"media-meta-member">\s+<a href="/m/([^"]+)"', - r'<span\b[^>]+\bclass="username">([^<]+)</span>'), + (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id', fatal=False) + categories = self._html_search_meta('keywords', webpage, default=None) if categories: categories = [cat.strip() for cat in categories.split(',')] @@ -219,23 +218,23 @@ class MotherlessGroupIE(InfoExtractor): r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) description = self._html_search_meta( 'description', webpage, fatal=False) - page_count = self._int(self._search_regex( - r'(\d+)</(?:a|span)><(?:a|span)[^>]+rel="next">', - webpage, 'page_count', default=0), 'page_count') + page_count = str_to_int(self._search_regex( + r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', + webpage, 'page_count', default=0)) if not page_count: message = self._search_regex( - r'class="error-page"[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*', + r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', webpage, 'error_msg', default=None) or 'This group has no videos.' self.report_warning(message, group_id) + page_count = 1 PAGE_SIZE = 80 def _get_page(idx): - if not page_count: - return - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) + if idx > 0: + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) for entry in self._extract_entries(webpage, url): yield entry |