diff options
-rw-r--r-- | yt_dlp/extractor/_extractors.py | 5 | ||||
-rw-r--r-- | yt_dlp/extractor/common.py | 56 | ||||
-rw-r--r-- | yt_dlp/extractor/generic.py | 219 | ||||
-rw-r--r-- | yt_dlp/extractor/peekvids.py | 190 | ||||
-rw-r--r-- | yt_dlp/extractor/thisvid.py | 226 |
5 files changed, 529 insertions, 167 deletions
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 352de83ca..83e732189 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1872,6 +1872,11 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ( + ThisVidIE, + ThisVidMemberIE, + ThisVidPlaylistIE, +) from .threespeak import ( ThreeSpeakIE, ThreeSpeakUserIE, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f48b97a6b..21d5c39fd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1396,10 +1396,16 @@ class InfoExtractor: # And then there are the jokers who advertise that they use RTA, but actually don't. AGE_LIMIT_MARKERS = [ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + r'>[^<]*you acknowledge you are at least (\d+) years old', + r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b', ] - if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): - return 18 - return 0 + + age_limit = 0 + for marker in AGE_LIMIT_MARKERS: + mobj = re.search(marker, html) + if mobj: + age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18))) + return age_limit def _media_rating_search(self, html): # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ @@ -3216,7 +3222,7 @@ class InfoExtractor: def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -3237,19 +3243,20 @@ class InfoExtractor: def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - entries = [] + if not isinstance(jwplayer_data, dict): + return entries - # JWPlayer backward compatibility: single playlist item + playlist_items = jwplayer_data.get('playlist') + # JWPlayer backward compatibility: single playlist item/flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if not isinstance(playlist_items, list): + playlist_items = (playlist_items or jwplayer_data, ) - for video_data in jwplayer_data['playlist']: + for video_data in playlist_items: + if not isinstance(video_data, dict): + continue # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: @@ -3287,6 +3294,13 @@ class InfoExtractor: 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... + 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -3304,7 +3318,7 @@ class InfoExtractor: def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -3313,14 +3327,14 @@ class InfoExtractor: base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -3335,13 +3349,12 @@ class InfoExtractor: 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), @@ -3349,6 +3362,7 @@ class InfoExtractor: 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'filesize': int_or_none(source.get('filesize')), 'ext': ext, + 'format_id': format_id } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ffc279023..14d492f07 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -32,6 +32,7 @@ from ..utils import ( unified_timestamp, unsmuggle_url, url_or_none, + urljoin, variadic, xpath_attr, xpath_text, @@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'description': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Untested major version'], }, { # KVS Player 'url': 'https://www.kvs-demo.com/embed/105/', @@ -1880,35 +1883,12 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July / Embed Player', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, }, { - # KVS Player - 'url': 'https://thisvid.com/videos/french-boy-pantsed/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player - 'url': 'https://thisvid.com/embed/2400174/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player 'url': 'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { @@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://youix.com/embed/18485', @@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Ленинград - ЗОЖ', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', 'md5': '94166bdb26b4cb1fb9214319a629fc51', 'info_dict': { 'id': '21217', - 'display_id': '40-nochey-40-nights-2016', + 'display_id': '40-nochey-2016', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', - } + }, }, { # KVS Player (for sites that serve kt_player.js via non-https urls) @@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor): 'id': '389508', 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', 'ext': 'mp4', - 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', - 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', - } + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, }, { # Reddit-hosted video that will redirect and be processed by RedditIE @@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor): 'direct': True, 'age_limit': 0, } - } + }, + { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg', + 'height': 720, + 'age_limit': 18, + }, + }, ] def report_following_redirect(self, new_url): @@ -2235,43 +2229,87 @@ class GenericIE(InfoExtractor): 'entries': entries, } - def _kvs_getrealurl(self, video_url, license_code): + @classmethod + def _kvs_get_real_url(cls, video_url, license_code): if not video_url.startswith('function/0/'): return video_url # not obfuscated - url_path, _, url_query = video_url.partition('?') - urlparts = url_path.split('/')[2:] - license = self._kvs_getlicensetoken(license_code) - newmagic = urlparts[5][:32] + parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) + license = cls._kvs_get_license_token(license_code) + urlparts = parsed.path.split('/') - for o in range(len(newmagic) - 1, -1, -1): - new = '' - l = (o + sum(int(n) for n in license[o:])) % 32 + HASH_LENGTH = 32 + hash = urlparts[3][:HASH_LENGTH] + indices = list(range(HASH_LENGTH)) - for i in range(0, len(newmagic)): - if i == o: - new += newmagic[l] - elif i == l: - new += newmagic[o] - else: - new += newmagic[i] - newmagic = new + # Swap indices of hash according to the destination calculated from the license token + accum = 0 + for src in reversed(range(HASH_LENGTH)): + accum += license[src] + dest = (src + accum) % HASH_LENGTH + indices[src], indices[dest] = indices[dest], indices[src] + + urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:] + return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) - urlparts[5] = newmagic + urlparts[5][32:] - return '/'.join(urlparts) + '?' + url_query + @staticmethod + def _kvs_get_license_token(license): + license = license.replace('$', '') + license_values = [int(char) for char in license] - def _kvs_getlicensetoken(self, license): - modlicense = license.replace('$', '').replace('0', '1') - center = int(len(modlicense) / 2) + modlicense = license.replace('0', '1') + center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) + modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1] + + return [ + (license_values[index + offset] + current) % 10 + for index, current in enumerate(map(int, modlicense)) + for offset in range(4) + ] + + def _extract_kvs(self, url, webpage, video_id): + flashvars = self._search_json( + r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)', + webpage, 'flashvars', video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' + r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', + webpage, 'display_id', fatal=False) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(f'{key}_text', key) + formats.append({ + 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])), + 'format_id': format_id, + 'ext': 'mp4', + **(parse_resolution(format_id) or parse_resolution(flashvars[key])), + 'http_headers': {'Referer': url}, + }) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 - modlicense = str(4 * abs(fronthalf - backhalf)) - retval = '' - for o in range(0, center + 1): - for i in range(1, 5): - retval += str((int(license[o + i]) + int(modlicense[o])) % 10) - return retval + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } def _real_extract(self, url): if url.startswith('//'): @@ -2580,6 +2618,17 @@ class GenericIE(InfoExtractor): self.report_detected('video.js embed') return [{'formats': formats, 'subtitles': subtitles}] + # Look for generic KVS player (before json-ld bc of some urls that break otherwise) + found = self._search_regex(( + r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>', + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) + if found: + self.report_detected('KWS Player') + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning(f'Untested major version ({found}) in player engine - download may fail.') + return [self._extract_kvs(url, webpage, video_id)] + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): @@ -2623,52 +2672,6 @@ class GenericIE(InfoExtractor): if found: self.report_detected('JW Player embed') if not found: - # Look for generic KVS player - found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) - if found: - self.report_detected('KWS Player') - if found.group('maj_ver') not in ['4', '5']: - self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver')) - flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) - flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) - - # extract the part after the last / as the display_id from the - # canonical URL. - display_id = self._search_regex( - r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' - r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', - webpage, 'display_id', fatal=False - ) - title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') - - thumbnail = flashvars['preview_url'] - if thumbnail.startswith('//'): - protocol, _, _ = url.partition('/') - thumbnail = protocol + thumbnail - - url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys())) - formats = [] - for key in url_keys: - if '/get_file/' not in flashvars[key]: - continue - format_id = flashvars.get(f'{key}_text', key) - formats.append({ - 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), - 'format_id': format_id, - 'ext': 'mp4', - **(parse_resolution(format_id) or parse_resolution(flashvars[key])) - }) - if not formats[-1].get('height'): - formats[-1]['quality'] = 1 - - return [{ - 'id': flashvars['video_id'], - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - }] - if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if found: diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index 2d9b9a742..d1fc058b9 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -1,71 +1,128 @@ +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) + + +class PeekVidsBaseIE(InfoExtractor): + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).group('domain', 'id') + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}', + video_id=video_id, expected=True) + + title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + f'https://www.{domain}/v-alt/{video_id}', video_id, + note='Downloading list of source files') + + formats = [] + for k, v in srcs.items(): + f_url = url_or_none(v) + if not f_url: + continue + + height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None) + if not height: + continue + + formats.append({ + 'url': f_url, + 'format_id': height, + 'height': int_or_none(height), + }) + + if not formats: + formats = [{'url': url} for url in srcs.values()] + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) -class PeekVidsIE(InfoExtractor): + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = (get_element_by_class('detail-video-block', webpage) + or get_element_by_class('detail-block', webpage) or '') + info['description'] = self._html_search_regex( + rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)', + detail, 'description', default=None) or None + info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url) + + def cat_tags(name, html): + l = self._html_search_regex( + rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>', + html, name, default='') + return list(filter(None, re.split(r'\s+', l))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PeekVidsIE(PeekVidsBaseIE): _VALID_URL = r'''(?x) - https?://(?:www\.)?peekvids\.com/ + https?://(?:www\.)?(?P<domain>peekvids\.com)/ (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) (?P<id>[^/?&#]*) ''' _TESTS = [{ 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', - 'md5': 'a00940646c428e232407e3e62f0e8ef5', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', 'info_dict': { - 'id': 'BSyLMbN0YCd', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1642579329, 'upload_date': '20220119', 'duration': 416, 'view_count': int, 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, }, }] - _DOMAIN = 'www.peekvids.com' - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID') - srcs = self._download_json( - f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id, - note='Downloading list of source files') - formats = [{ - 'url': url, - 'ext': 'mp4', - 'format_id': name[8:], - } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')] - if not formats: - formats = [{'url': url} for url in srcs.values()] - info = self._search_json_ld(webpage, video_id, expected_type='VideoObject') - info.update({ - 'id': video_id, - 'age_limit': 18, - 'formats': formats, - }) - return info - - -class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)' +class PlayVidsIE(PeekVidsBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>playvids\.com)/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)' _TESTS = [{ 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', - 'md5': 'cd7dfd8a2e815a45402369c76e3c1825', + 'md5': '2f12e50213dd65f142175da633c4564c', 'info_dict': { - 'id': 'U3pBrYhsjXM', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1978030', + 'display_id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1640435839, 'upload_date': '20211225', 'duration': 416, 'view_count': int, 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, }, }, { 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', @@ -73,5 +130,62 @@ class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE }, { 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', + 'md5': 'e783986e596cafbf46411a174ab42ba6', + 'info_dict': { + 'id': '762385', + 'display_id': 'bKmGLe3IwjZ', + 'ext': 'mp4', + 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', + 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', + 'timestamp': 1516958544, + 'upload_date': '20180126', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 480, + 'uploader': 'Brazzers', + 'age_limit': 18, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/v/47iUho33toY', + 'md5': 'b056b5049d34b648c1e86497cf4febce', + 'info_dict': { + 'id': '700621', + 'display_id': '47iUho33toY', + 'ext': 'mp4', + 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', + 'description': None, + 'timestamp': 1507052209, + 'upload_date': '20171003', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 332, + 'uploader': 'Cacerenele', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', + 'md5': 'efa09be9f031314b7b7e3bc6510cd0df', + 'info_dict': { + 'id': '1523518', + 'display_id': 'z3_7iwWCmqt', + 'ext': 'mp4', + 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', + 'description': None, + 'timestamp': 1607470323, + 'upload_date': '20201208', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 593, + 'uploader': 'yorours', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, }] - _DOMAIN = 'www.playvids.com' diff --git a/yt_dlp/extractor/thisvid.py b/yt_dlp/extractor/thisvid.py new file mode 100644 index 000000000..9d3368ed7 --- /dev/null +++ b/yt_dlp/extractor/thisvid.py @@ -0,0 +1,226 @@ +import itertools +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + url_or_none, + urljoin, +) + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'description': 'md5:372353bb995883d1b65fddf507489acd', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'display_id': 'sitting-on-ball-tight-jeans', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/3533241/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'display_id': 'sitting-on-ball-tight-jeans', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type') + webpage = self._download_webpage(url, main_id) + + title = self._html_search_regex( + r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>', + webpage, 'title') + + if type_ == 'embed': + # look for more metadata + video_alt_url = url_or_none(self._search_regex( + rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''', + webpage, 'video_alt_url', default=None)) + if video_alt_url and video_alt_url != url: + webpage = self._download_webpage( + video_alt_url, main_id, + note='Redirecting embed to main page', fatal=False) or webpage + + video_holder = get_element_by_class('video-holder', webpage) or '' + if '>This video is a private video' in video_holder: + self.raise_login_required( + (clean_html(video_holder) or 'Private video').partition('\n')[0]) + + uploader = self._html_search_regex( + r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''', + webpage, 'uploader', default='') + uploader = re.split(r'''/["'][^>]*>\s*''', uploader) + if len(uploader) == 2: + # id must be non-empty, uploader could be '' + uploader_id, uploader = uploader + uploader = uploader or None + else: + uploader_id = uploader = None + + return self.url_result( + url, ie='Generic', url_transparent=True, + title=title, + age_limit=18, + uploader=uploader, + uploader_id=uploader_id) + + +class ThisVidPlaylistBaseIE(InfoExtractor): + _PLAYLIST_URL_RE = None + + @classmethod + def _find_urls(cls, html): + for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html): + yield m.group('url') + + def _generate_playlist_entries(self, url, playlist_id, html=None): + page_url = url + for page in itertools.count(1): + if not html: + html = self._download_webpage( + page_url, playlist_id, note=f'Downloading page {page}', + fatal=False) or '' + + yield from self._find_urls(html) + + next_page = get_element_by_class('pagination-next', html) or '' + if next_page: + # member list page + next_page = urljoin(url, self._search_regex( + r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''', + next_page, 'next page link', group='url', default=None)) + + # in case a member page should have pagination-next with empty link, not just `else:` + if next_page is None: + # playlist page + parsed_url = urllib.parse.urlparse(page_url) + base_path, _, num = parsed_url.path.rpartition('/') + num = int_or_none(num) + if num is None: + base_path, num = parsed_url.path.rstrip('/'), 1 + parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}') + next_page = urllib.parse.urlunparse(parsed_url) + if page_url == next_page: + next_page = None + + if not next_page: + return + page_url, html = next_page, None + + def _make_playlist_result(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = re.split( + r'(?i)\s*\|\s*ThisVid\.com\s*$', + self._og_search_title(webpage, default=None) + or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None + + return self.playlist_from_matches( + self._generate_playlist_entries(url, playlist_id, webpage), + playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE) + + +class ThisVidMemberIE(ThisVidPlaylistBaseIE): + _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://thisvid.com/members/2140501/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Profile', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://thisvid.com/members/2140501/favourite_videos/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Favourite Videos', + }, + 'playlist_mincount': 15, + }, { + 'url': 'https://thisvid.com/members/636468/public_videos/', + 'info_dict': { + 'id': '636468', + 'title': 'Happymouth\'s Public Videos', + }, + 'playlist_mincount': 196, + }] + _PLAYLIST_URL_RE = ThisVidIE._VALID_URL + + def _real_extract(self, url): + return self._make_playlist_result(url) + + +class ThisVidPlaylistIE(ThisVidPlaylistBaseIE): + _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '6615', + 'title': 'Underwear Stuff', + }, + 'playlist_mincount': 200, + }, { + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '1072387', + 'ext': 'mp4', + 'title': 'Big Italian Booty 28', + 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2', + 'uploader_id': '367912', + 'uploader': 'Jcmusclefun', + 'age_limit': 18, + 'display_id': 'big-italian-booty-28', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg', + }, + 'params': { + 'noplaylist': True, + }, + }] + _PLAYLIST_URL_RE = _VALID_URL + + def _generate_playlist_entries(self, url, playlist_id, html=None): + for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html): + video_id = re.match(self._VALID_URL, wrapped_url).group('video_id') + yield urljoin(url, f'/videos/{video_id}/') + + def _real_extract(self, url): + playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id') + + if not self._yes_playlist(playlist_id, video_id): + redirect_url = urljoin(url, f'/videos/{video_id}/') + return self.url_result(redirect_url, ThisVidIE) + + result = self._make_playlist_result(url) + + # Fix duplicated title (`the title - the title` => `the title`) + title = result['title'] + t_len = len(title) + if t_len > 5 and t_len % 2 != 0: + t_len = t_len // 2 + if title[t_len] == '-': + first, second = map(str.strip, (title[:t_len], title[t_len + 1:])) + if first and first == second: + result['title'] = first + + return result |