diff options
author | mzbaulhaque <11481344+mzbaulhaque@users.noreply.github.com> | 2021-08-15 23:32:48 +0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-08-15 23:02:48 +0530 |
commit | df2a5633daf17d32e4d8aa437f2f39d9ce454b6b (patch) | |
tree | 9fd3d040d7bab585715af39ef68506ee2de12aa5 | |
parent | 7a6742b5f9e7b8d0c92fb6b1f56e7e486873c169 (diff) | |
download | hypervideo-pre-df2a5633daf17d32e4d8aa437f2f39d9ce454b6b.tar.lz hypervideo-pre-df2a5633daf17d32e4d8aa437f2f39d9ce454b6b.tar.xz hypervideo-pre-df2a5633daf17d32e4d8aa437f2f39d9ce454b6b.zip |
[pornhub] Separate and fix playlist extractor (#700)
Closes #680
Authored by: mzbaulhaque
-rw-r--r-- | yt_dlp/extractor/extractors.py | 1 | ||||
-rw-r--r-- | yt_dlp/extractor/pornhub.py | 71 |
2 files changed, 59 insertions, 13 deletions
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 5c58e2ba4..955a44a90 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -1044,6 +1044,7 @@ from .pornhd import PornHdIE
 from .pornhub import (
     PornHubIE,
     PornHubUserIE,
+    PornHubPlaylistIE,
     PornHubPagedVideoListIE,
     PornHubUserVideosUploadIE,
 )
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index c525505d1..c2b20ecfd 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 import functools
 import itertools
+import math
 import operator
 import re
 
@@ -638,7 +639,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
 
 
 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
+    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -732,18 +733,6 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
         'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
         'only_matching': True,
     }, {
-        'url': 'https://www.pornhub.com/playlist/44121572',
-        'info_dict': {
-            'id': 'playlist/44121572',
-        },
-        'playlist_mincount': 132,
-    }, {
-        'url': 'https://www.pornhub.com/playlist/4667351',
-        'only_matching': True,
-    }, {
-        'url': 'https://de.pornhub.com/playlist/4667351',
-        'only_matching': True,
-    }, {
         'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
         'only_matching': True,
     }]
@@ -770,3 +759,59 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
         'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
         'only_matching': True,
     }]
+
+
+class PornHubPlaylistIE(PornHubPlaylistBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
+    _TESTS = [{
+        'url': 'https://www.pornhub.com/playlist/44121572',
+        'info_dict': {
+            'id': '44121572',
+        },
+        'playlist_count': 77,
+    }, {
+        'url': 'https://www.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.pornhub.com/playlist/4667351?page=2',
+        'only_matching': True,
+    }]
+
+    def _entries(self, url, host, item_id):
+        webpage = self._download_webpage(url, item_id, 'Downloading page 1')
+        playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
+        video_count = int_or_none(
+            self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
+        token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
+        page_count = math.ceil((video_count - 36) / 40.) + 1
+        page_entries = self._extract_entries(webpage, host)
+
+        def download_page(page_num):
+            note = 'Downloading page {}'.format(page_num)
+            page_url = 'https://www.{}/playlist/viewChunked'.format(host)
+            return self._download_webpage(page_url, item_id, note, query={
+                'id': playlist_id,
+                'page': page_num,
+                'token': token,
+            })
+
+        for page_num in range(1, page_count + 1):
+            if page_num > 1:
+                webpage = download_page(page_num)
+                page_entries = self._extract_entries(webpage, host)
+            if not page_entries:
+                break
+            for e in page_entries:
+                yield e
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        item_id = mobj.group('id')
+
+        self._login(host)
+
+        return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)