about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: lkho <llhmtc@gmail.com>, 2020-08-29 15:04:16 +0800
committer: lkho <llhmtc@gmail.com>, 2020-08-29 15:33:57 +0800
commit: de4144a4aedd6ab9f24ffa1a777bce99e019468e (patch)
tree: 61e354e2f400dd2ef31c5c98850cbeb9c94eed00
parent: 503406d4bc838b51c9b1adf0d3fd4a9efda26d30 (diff)
download: hypervideo-pre-de4144a4aedd6ab9f24ffa1a777bce99e019468e.tar.lz
hypervideo-pre-de4144a4aedd6ab9f24ffa1a777bce99e019468e.tar.xz
hypervideo-pre-de4144a4aedd6ab9f24ffa1a777bce99e019468e.zip
[duboku] add playlist extractor
-rw-r--r-- youtube_dl/extractor/duboku.py 93
-rw-r--r-- youtube_dl/extractor/extractors.py 5
2 files changed, 97 insertions, 1 deletions
diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py
index 3e4cf8d5b..4db81a665 100644
--- a/youtube_dl/extractor/duboku.py
+++ b/youtube_dl/extractor/duboku.py
@@ -4,10 +4,49 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import *
+def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+ """Return the content of the tag with the specified attribute in the passed HTML document"""
+
+ if tag is None:
+ tag = '[a-zA-Z0-9:._-]+'
+ if attribute is None:
+ attribute = ''
+ else:
+ attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
+ if value is None:
+ value = ''
+ else:
+ value = re.escape(value) if escape_value else value
+ value = '=[\'"]?(?P<value>%s)[\'"]?' % value
+
+ retlist = []
+ for m in re.finditer(r'''(?xs)
+ <(?P<tag>%s)
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ %s%s
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ \s*>
+ (?P<content>.*?)
+ </\1>
+ ''' % (tag, attribute, value), html):
+ retlist.append(m)
+
+ return retlist
+
+
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of _get_elements_by_tag_and_attrib, or None.

    All arguments are forwarded unchanged to _get_elements_by_tag_and_attrib.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
+
+
class DubokuIE(InfoExtractor):
+ IE_NAME = 'duboku'
+ IE_DESC = 'www.duboku.co'
+
_VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9\-]+)\.html.*'
_TESTS = [{
'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
@@ -90,3 +129,57 @@ class DubokuIE(InfoExtractor):
'episode_id': episode_id,
'formats': formats,
}
+
+
class DubokuPlaylistIE(InfoExtractor):
    """Extract every episode of a duboku.co series page as a playlist.

    The series page may contain several elements whose id matches
    ``playlist<N>``. A URL fragment (e.g. ``#playlist2``) selects one of
    them; without a fragment the first one found in the page is used.
    """

    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'

    def _real_extract(self, url):
        """Return a playlist result for the series page at *url*.

        Raises ExtractorError when the URL does not match _VALID_URL, when
        the playlist named by the URL fragment is missing or empty, or when
        no playlist can be extracted at all.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title: <h1 class="title">, then the keywords meta tag,
        # then the <title> element

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        # Remember the first id explicitly: a plain dict does not preserve
        # insertion order on every supported interpreter.
        first_playlist_id = None
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            if first_playlist_id is None:
                first_playlist_id = playlist_id
            playlists[playlist_id] = [
                {
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                }
                for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)
            ]

        # select the specified playlist if url fragment exists; a lookup
        # with None (no playlist found at all) falls through to the error
        playlist = playlists.get(fragment if fragment else first_playlist_id)
        if not playlist:
            raise ExtractorError(
                ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist')

        # return url results; urljoin keeps absolute hrefs intact and
        # resolves site-relative ones against the site root
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://www.duboku.co', x['href']),
                video_title=x.get('title'))
            for x in playlist], series_id, title)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e6c008b6f..407701717 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -282,7 +282,10 @@ from .drtv import (
)
from .dtube import DTubeIE
from .dvtv import DVTVIE
-from .duboku import DubokuIE
+from .duboku import (
+ DubokuIE,
+ DubokuPlaylistIE
+)
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE