about | summary | refs | log | tree | commit | diff | stats
path: root/hypervideo_dl/extractor/bbc.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/bbc.py')
-rw-r--r-- hypervideo_dl/extractor/bbc.py 89
1 files changed, 61 insertions, 28 deletions
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 4e2dcd7..29ad7de 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -11,6 +11,7 @@ from ..compat import (
compat_etree_Element,
compat_HTTPError,
compat_str,
+ compat_urllib_error,
compat_urlparse,
)
from ..utils import (
@@ -38,7 +39,7 @@ from ..utils import (
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
+ _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
@@ -263,11 +264,7 @@ class BBCCoUkIE(InfoExtractor):
'only_matching': True,
}]
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading signin page')
@@ -293,9 +290,6 @@ class BBCCoUkIE(InfoExtractor):
'Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
- def _real_initialize(self):
- self._login()
-
class MediaSelectionError(Exception):
def __init__(self, id):
self.id = id
@@ -394,9 +388,17 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_mpd_formats(
href, programme_id, mpd_id=format_id, fatal=False))
elif transfer_format == 'hls':
- formats.extend(self._extract_m3u8_formats(
- href, programme_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=format_id, fatal=False))
+ # TODO: let expected_status be passed into _extract_xxx_formats() instead
+ try:
+ fmts = self._extract_m3u8_formats(
+ href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False)
+ except ExtractorError as e:
+ if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+ and e.exc_info[1].code in (403, 404)):
+ raise
+ fmts = []
+ formats.extend(fmts)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
@@ -451,9 +453,10 @@ class BBCCoUkIE(InfoExtractor):
playlist = self._download_json(
'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
playlist_id, 'Downloading playlist JSON')
+ formats = []
+ subtitles = {}
- version = playlist.get('defaultAvailableVersion')
- if version:
+ for version in playlist.get('allAvailableVersions', []):
smp_config = version['smpConfig']
title = smp_config['title']
description = smp_config['summary']
@@ -463,8 +466,17 @@ class BBCCoUkIE(InfoExtractor):
continue
programme_id = item.get('vpid')
duration = int_or_none(item.get('duration'))
- formats, subtitles = self._download_media_selector(programme_id)
- return programme_id, title, description, duration, formats, subtitles
+ version_formats, version_subtitles = self._download_media_selector(programme_id)
+ types = version['types']
+ for f in version_formats:
+ f['format_note'] = ', '.join(types)
+ if any('AudioDescribed' in x for x in types):
+ f['language_preference'] = -10
+ formats += version_formats
+ for tag, subformats in (version_subtitles or {}).items():
+ subtitles.setdefault(tag, []).extend(subformats)
+
+ return programme_id, title, description, duration, formats, subtitles
except ExtractorError as ee:
if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
raise
@@ -775,20 +787,32 @@ class BBCIE(BBCCoUkIE):
'upload_date': '20150725',
},
}, {
+ # video with window.__INITIAL_DATA__ and value as JSON string
+ 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
+ 'info_dict': {
+ 'id': 'p0b71qth',
+ 'ext': 'mp4',
+ 'title': 'Why France is making this woman a national hero',
+ 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1638230731,
+ 'upload_date': '20211130',
+ },
+ }, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
}, {
+ # bbcthreeConfig
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
'ext': 'mp4',
- 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
- 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
+ 'title': 'Things Not To Say to people that live on council estates',
+ 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
+ 'duration': 360,
+ 'thumbnail': r're:https?://.+/.+\.jpg',
},
- 'params': {
- 'skip_download': True,
- }
}, {
# window.__PRELOADED_STATE__
'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@@ -882,9 +906,8 @@ class BBCIE(BBCCoUkIE):
playlist_title = json_ld_info.get('title')
if not playlist_title:
- playlist_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ playlist_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'playlist title', default=None))
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
@@ -1161,9 +1184,16 @@ class BBCIE(BBCCoUkIE):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
- initial_data = self._parse_json(self._search_regex(
- r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
- 'preload state', default='{}'), playlist_id, fatal=False)
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
+ 'quoted preload state', default=None)
+ if initial_data is None:
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
+ 'preload state', default={})
+ else:
+ initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
+ initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
def parse_media(media):
if not media:
@@ -1204,7 +1234,10 @@ class BBCIE(BBCCoUkIE):
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
- for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+ for block in (try_get(resp,
+ (lambda x: x['data']['blocks'],
+ lambda x: x['data']['content']['model']['blocks'],),
+ list) or []):
if block.get('type') != 'media':
continue
parse_media(block.get('model'))