diff options
Diffstat (limited to 'hypervideo_dl/extractor/tnaflix.py')
-rw-r--r-- | hypervideo_dl/extractor/tnaflix.py | 205 |
1 files changed, 107 insertions, 98 deletions
diff --git a/hypervideo_dl/extractor/tnaflix.py b/hypervideo_dl/extractor/tnaflix.py index d7617f7..4482c84 100644 --- a/hypervideo_dl/extractor/tnaflix.py +++ b/hypervideo_dl/extractor/tnaflix.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -11,6 +9,7 @@ from ..utils import ( parse_duration, str_to_int, unescapeHTML, + url_basename, xpath_text, ) @@ -22,8 +21,6 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"', r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] - _HOST = 'tna' - _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -74,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) - video_id = mobj.group('id') + video_id, host = mobj.group('id', 'host') for display_id_key in ('display_id', 'display_id_2'): if display_id_key in mobj.groupdict(): display_id = mobj.group(display_id_key) @@ -85,98 +82,109 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + # check for MovieFap-style config cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, group='url'), 'http:') + query = {} + # check for TNAFlix-style config if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) - - cfg_xml = self._download_xml( - cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands, headers={'Referer': url}) - - formats = [] - - def extract_video_url(vl): - # Any URL modification now results in HTTP Error 403: Forbidden - return unescapeHTML(vl.text) - - video_link = cfg_xml.find('./videoLink') - if video_link is not None: - formats.append({ - 'url': extract_video_url(video_link), - 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), - }) - - for item in cfg_xml.findall('./quality/item'): - video_link = item.find('./videoLink') - if video_link is None: - continue - res = item.find('res') - format_id = None if res is None else res.text - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), - 'format_id': format_id, - 'height': height, + if inputs.get('vkey') and inputs.get('nkey'): + cfg_url = f'https://www.{host}.com/cdn/cdn.php' + query.update({ + 'file': inputs['vkey'], + 'key': inputs['nkey'], + 'VID': video_id, + 'premium': '1', + 'vip': '1', + 'alpha': '', + }) + + formats, json_ld = [], {} + + # TNAFlix and MovieFap extraction + if cfg_url: + cfg_xml = self._download_xml( + cfg_url, display_id, 'Downloading metadata', + transform_source=fix_xml_ampersands, headers={'Referer': url}, query=query) + + def extract_video_url(vl): + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) + + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + + for item in cfg_xml.findall('./quality/item'): + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), + 'format_id': format_id, + 'height': height, + }) + + thumbnails = self._extract_thumbnails(cfg_xml) or [] + thumbnails.append({ + 'url': self._proto_relative_url(xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') }) - self._sort_formats(formats) - - thumbnail = self._proto_relative_url( - xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') - thumbnails = self._extract_thumbnails(cfg_xml) - - title = None - if self._TITLE_REGEX: - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title', default=None) - if not title: - title = self._og_search_title(webpage) - - age_limit = self._rta_search(webpage) or 18 - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) + # check for EMPFlix-style JSON and extract + else: + player = self._download_json( + f'http://www.{host}.com/ajax/video-player/{video_id}', video_id, + headers={'Referer': url}).get('html', '') + for mobj in re.finditer(r'<source src="(?P<src>[^"]+)"', player): + video_url = mobj.group('src') + height = self._search_regex(r'-(\d+)p\.', url_basename(video_url), 'height', default=None) + formats.append({ + 'url': self._proto_relative_url(video_url, 'http:'), + 'ext': url_basename(video_url).split('.')[-1], + 'height': int_or_none(height), + 'format_id': f'{height}p' if height else url_basename(video_url).split('.')[0], + }) + thumbnail = self._proto_relative_url(self._search_regex( + r'data-poster="([^"]+)"', player, 'thumbnail', default=None), 'http:') + thumbnails = [{'url': thumbnail}] if thumbnail else None + json_ld = self._search_json_ld(webpage, display_id, default={}) def extract_field(pattern, name): return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - description = extract_field(self._DESCRIPTION_REGEX, 'description') - uploader = extract_field(self._UPLOADER_REGEX, 'uploader') - view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) - comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) - average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) - - categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] - return { 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': (extract_field(self._TITLE_REGEX, 'title') + or self._og_search_title(webpage, default=None) + or json_ld.get('title')), + 'description': extract_field(self._DESCRIPTION_REGEX, 'description') or json_ld.get('description'), 'thumbnails': thumbnails, - 'duration': duration, - 'age_limit': age_limit, - 'uploader': uploader, - 'view_count': view_count, - 'comment_count': comment_count, - 'average_rating': average_rating, - 'categories': categories, + 'duration': parse_duration( + self._html_search_meta('duration', webpage, 'duration', default=None)) or json_ld.get('duration'), + 'age_limit': self._rta_search(webpage) or 18, + 'uploader': extract_field(self._UPLOADER_REGEX, 'uploader') or json_ld.get('uploader'), + 'view_count': str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')), + 'comment_count': str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')), + 'average_rating': float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')), + 'categories': list(map(str.strip, (extract_field(self._CATEGORIES_REGEX, 'categories') or '').split(','))), 'formats': formats, } class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' - - _TITLE_REGEX = r'<title>([^<]+)</title>' + _VALID_URL = r'https?://player\.(?P<host>tnaflix|empflix)\.com/video/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1'] _TESTS = [{ 'url': 'https://player.tnaflix.com/video/6538', @@ -184,23 +192,26 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): 'id': '6538', 'display_id': '6538', 'ext': 'mp4', - 'title': 'Educational xxx video', + 'title': 'Educational xxx video (G Spot)', + 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, + 'duration': 164, + 'uploader': 'bobwhite39', + 'categories': list, }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://player.empflix.com/video/33051', + 'url': 'http://player.empflix.com/video/33051', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', - webpage)] + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, host = mobj.group('id', 'host') + return self.url_result(f'http://www.{host}.com/category/{video_id}/video{video_id}') class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): @@ -210,7 +221,7 @@ class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): class TNAFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?P<host>tnaflix)\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>' @@ -226,17 +237,17 @@ class TNAFlixIE(TNAEMPFlixBaseIE): 'thumbnail': r're:https?://.*\.jpg$', 'duration': 91, 'age_limit': 18, - 'categories': ['Porn Stars'], + 'categories': list, } }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': '0f5d4d490dbfd117b8607054248a07c0', + 'md5': 'add5a9fa7f4da53d3e9d0845ac58f20c', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', 'ext': 'mp4', - 'title': 'Educational xxx video', + 'title': 'Educational xxx video (G Spot)', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': r're:https?://.*\.jpg$', 'duration': 164, @@ -251,14 +262,11 @@ class TNAFlixIE(TNAEMPFlixBaseIE): class EMPFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' - - _HOST = 'emp' - _VKEY_SUFFIX = '-1' + _VALID_URL = r'https?://(?:www\.)?(?P<host>empflix)\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'bc30d48b91a7179448a0bda465114676', + 'url': 'http://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'md5': 'd761c7b26601bd14476cd9512f2654fc', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', @@ -268,20 +276,20 @@ class EMPFlixIE(TNAEMPFlixBaseIE): 'thumbnail': r're:https?://.*\.jpg$', 'duration': 83, 'age_limit': 18, - 'uploader': 'cwbike', - 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], + 'uploader': None, + 'categories': list, } }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, }, { - 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', 'only_matching': True, }] class MovieFapIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?P<host>moviefap)\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' @@ -323,5 +331,6 @@ class MovieFapIE(TNAFlixNetworkBaseIE): 'comment_count': int, 'average_rating': float, 'categories': ['Amateur', 'Teen'], - } + }, + 'skip': 'This video does not exist', }] |