about summary refs log tree commit diff stats
path: root/hypervideo_dl/extractor/cspan.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/cspan.py')
-rw-r--r-- hypervideo_dl/extractor/cspan.py 52
1 files changed, 49 insertions, 3 deletions
diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py
index 2e01aff..f51159b 100644
--- a/hypervideo_dl/extractor/cspan.py
+++ b/hypervideo_dl/extractor/cspan.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTMLParseError
from ..utils import (
determine_ext,
ExtractorError,
@@ -11,14 +12,16 @@ from ..utils import (
get_element_by_attribute,
get_element_by_class,
int_or_none,
+ join_nonempty,
js_to_json,
merge_dicts,
parse_iso8601,
+ parse_qs,
smuggle_url,
str_to_int,
unescapeHTML,
)
-from .senateisvp import SenateISVPIE
+from .senategov import SenateISVPIE
from .ustream import UstreamIE
@@ -126,8 +129,12 @@ class CSpanIE(InfoExtractor):
ext = 'vtt'
subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={})
- title = get_element_by_class('video-page-title', webpage) or \
- self._og_search_title(webpage)
+ try:
+ title = get_element_by_class('video-page-title', webpage)
+ except compat_HTMLParseError:
+ title = None
+ if title is None:
+ title = self._og_search_title(webpage)
description = get_element_by_attribute('itemprop', 'description', webpage) or \
self._html_search_meta(['og:description', 'description'], webpage)
return merge_dicts(info, ld_info, {
@@ -242,3 +249,42 @@ class CSpanIE(InfoExtractor):
'title': title,
'id': 'c' + video_id if video_type == 'clip' else video_id,
}
+
+
+class CSpanCongressIE(InfoExtractor):  # Extractor for c-span.org "/congress/" pages: reads the page's "jwsetup" player config and returns the parsed media info.
+ _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'  # http or https, optional "www.", path must begin with /congress/
+ _TESTS = [{
+ 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',
+ 'info_dict': {
+ 'id': 'house_2017-12-13',  # "<chamber>_<date>", the shape _real_extract builds from the URL query
+ 'title': 'Congressional Chronicle - Members of Congress, Hearings and More',
+ 'description': 'md5:54c264b7a8f219937987610243305a84',
+ 'thumbnail': r're:https://ximage.c-spanvideo.org/.+',
+ 'ext': 'mp4'
+ }
+ }]
+
+ def _real_extract(self, url):  # Build the info dict for one /congress/ URL: parsed player data plus title, description and http_headers.
+ query = parse_qs(url)
+ video_date = query.get('date', [None])[0]  # None when the URL carries no "date" parameter
+ video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')  # chamber falls back to 'senate'; presumably join_nonempty drops a missing date - confirm
+ webpage = self._download_webpage(url, video_id)
+ if not video_date:  # no date in the URL: try to recover one from the page itself
+ jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)  # NOTE(review): the "." in "jwsetup.clipprogdate" is unescaped, so it matches any character
+ if jwp_date:
+ video_id = f'{video_id}_{jwp_date.group("date")}'  # append the YYYY-MM-DD date found in the page
+ jwplayer_data = self._parse_json(  # the "jwsetup = {...};" object literal from the page, run through js_to_json before parsing
+ self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
+ video_id, transform_source=js_to_json)
+
+ title = (self._og_search_title(webpage, default=None)  # first choice; a falsy result falls through to the "or" lookup below
+ or self._html_extract_title(webpage, 'video title'))
+ description = (self._og_search_description(webpage, default=None)  # same two-step fallback as title, ending in the "description" meta lookup
+ or self._html_search_meta('description', webpage, 'description', default=None))
+
+ return {
+ **self._parse_jwplayer_data(jwplayer_data, video_id, False),  # unpacked first, so the keys listed below override same-named keys from the player data
+ 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),  # text before the first "|", whitespace runs collapsed; NOTE(review): raises AttributeError if title is None - confirm _html_extract_title cannot return None
+ 'description': description,
+ 'http_headers': {'Referer': 'https://www.c-span.org/'},  # fixed Referer value returned with the info dict
+ }