diff options
Diffstat (limited to 'hypervideo_dl/extractor/cbc.py')
| -rw-r--r-- | hypervideo_dl/extractor/cbc.py | 182 | 
1 files changed, 140 insertions, 42 deletions
| diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py index 2429521..4892419 100644 --- a/hypervideo_dl/extractor/cbc.py +++ b/hypervideo_dl/extractor/cbc.py @@ -2,17 +2,22 @@  from __future__ import unicode_literals  import re +import json +import base64 +import time  from .common import InfoExtractor  from ..compat import (      compat_str,  )  from ..utils import ( +    int_or_none, +    join_nonempty,      js_to_json, -    smuggle_url, -    try_get,      orderedSet, +    smuggle_url,      strip_or_none, +    try_get,      ExtractorError,  ) @@ -122,9 +127,9 @@ class CBCIE(InfoExtractor):      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        title = self._og_search_title(webpage, default=None) or self._html_search_meta( -            'twitter:title', webpage, 'title', default=None) or self._html_search_regex( -                r'<title>([^<]+)</title>', webpage, 'title', fatal=False) +        title = (self._og_search_title(webpage, default=None) +                 or self._html_search_meta('twitter:title', webpage, 'title', default=None) +                 or self._html_extract_title(webpage))          entries = [              self._extract_player_init(player_init, display_id)              for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -244,37 +249,129 @@ class CBCGemIE(InfoExtractor):          'params': {'format': 'bv'},          'skip': 'Geo-restricted to Canada',      }] -    _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + +    _GEO_COUNTRIES = ['CA'] +    _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' +    _NETRC_MACHINE = 'cbcgem' +    _claims_token = None + +    def _new_claims_token(self, email, password): +        data = json.dumps({ +            'email': email, +            'password': password, +        }).encode() +        headers = {'content-type': 'application/json'} +        query = {'apikey': self._TOKEN_API_KEY} +        resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', +                                   None, data=data, headers=headers, query=query) +        access_token = resp['access_token'] + +        query = { +            'access_token': access_token, +            'apikey': self._TOKEN_API_KEY, +            'jwtapp': 'jwt', +        } +        resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', +                                   None, headers=headers, query=query) +        sig = resp['signature'] + +        data = json.dumps({'jwt': sig}).encode() +        headers = {'content-type': 'application/json', 'ott-device-type': 'web'} +        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', +                                   None, data=data, headers=headers) +        cbc_access_token = resp['accessToken'] + +        headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} +        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', +                                   None, headers=headers) +        return resp['claimsToken'] + +    def _get_claims_token_expiry(self): +        # Token is a JWT +        # JWT is decoded here and 'exp' field is extracted +        # It is a Unix timestamp for when the token expires +        b64_data = self._claims_token.split('.')[1] +        data = base64.urlsafe_b64decode(b64_data + "==") +        return json.loads(data)['exp'] + +    def claims_token_expired(self): +        exp = self._get_claims_token_expiry() +        if exp - time.time() < 10: +            # It will expire in less than 10 seconds, or has already expired +            return True +        return False + +    def claims_token_valid(self): +        return self._claims_token is not None and not self.claims_token_expired() + +    def _get_claims_token(self, email, password): +        if not self.claims_token_valid(): +            self._claims_token = self._new_claims_token(email, password) +            self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) +        return self._claims_token + +    def _real_initialize(self): +        if self.claims_token_valid(): +            return +        self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + +    def _find_secret_formats(self, formats, video_id): +        """ Find a valid video url and convert it to the secret variant """ +        base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) +        if not base_format: +            return + +        base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) +        url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + +        secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) +        if not secret_xml: +            return + +        for child in secret_xml: +            if child.attrib.get('Type') != 'video': +                continue +            for video_quality in child: +                bitrate = int_or_none(video_quality.attrib.get('Bitrate')) +                if not bitrate or 'Index' not in video_quality.attrib: +                    continue +                height = int_or_none(video_quality.attrib.get('MaxHeight')) + +                yield { +                    **base_format, +                    'format_id': join_nonempty('sec', height), +                    # Note: \g<1> is necessary instead of \1 since bitrate is a number +                    'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), +                    'width': int_or_none(video_quality.attrib.get('MaxWidth')), +                    'tbr': bitrate / 1000.0, +                    'height': height, +                }      def _real_extract(self, url):          video_id = self._match_id(url) -        video_info = self._download_json(self._API_BASE + video_id, video_id) - -        last_error = None -        attempt = -1 -        retries = self.get_param('extractor_retries', 15) -        while attempt < retries: -            attempt += 1 -            if last_error: -                self.report_warning('%s. Retrying ...' % last_error) -            m3u8_info = self._download_json( -                video_info['playSession']['url'], video_id, -                note='Downloading JSON metadata%s' % f' (attempt {attempt})') -            m3u8_url = m3u8_info.get('url') -            if m3u8_url: -                break -            elif m3u8_info.get('errorCode') == 1: -                self.raise_geo_restricted(countries=['CA']) -            else: -                last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' -                # 35 means media unavailable, but retries work -                if m3u8_info.get('errorCode') != 35 or attempt >= retries: -                    raise ExtractorError(last_error) +        video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + +        email, password = self._get_login_info() +        if email and password: +            claims_token = self._get_claims_token(email, password) +            headers = {'x-claims-token': claims_token} +        else: +            headers = {} +        m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) +        m3u8_url = m3u8_info.get('url') + +        if m3u8_info.get('errorCode') == 1: +            self.raise_geo_restricted(countries=['CA']) +        elif m3u8_info.get('errorCode') == 35: +            self.raise_login_required(method='password') +        elif m3u8_info.get('errorCode') != 0: +            raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')          formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')          self._remove_duplicate_formats(formats) +        formats.extend(self._find_secret_formats(formats, video_id)) -        for i, format in enumerate(formats): +        for format in formats:              if format.get('vcodec') == 'none':                  if format.get('ext') is None:                      format['ext'] = 'm4a' @@ -328,7 +425,8 @@ class CBCGemPlaylistIE(InfoExtractor):          show = match.group('show')          show_info = self._download_json(self._API_BASE + show, season_id)          season = int(match.group('season')) -        season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + +        season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)          if season_info is None:              raise ExtractorError(f'Couldn\'t find season {season} of {show}') @@ -377,7 +475,7 @@ class CBCGemPlaylistIE(InfoExtractor):  class CBCGemLiveIE(InfoExtractor):      IE_NAME = 'gem.cbc.ca:live' -    _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' +    _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'      _TEST = {          'url': 'https://gem.cbc.ca/live/920604739687',          'info_dict': { @@ -396,21 +494,21 @@ class CBCGemLiveIE(InfoExtractor):      # It's unclear where the chars at the end come from, but they appear to be      # constant. Might need updating in the future. -    _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' +    # There are two URLs, some livestreams are in one, and some +    # in the other. The JSON schema is the same for both. +    _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']      def _real_extract(self, url):          video_id = self._match_id(url) -        live_info = self._download_json(self._API, video_id)['entries'] -        video_info = None -        for stream in live_info: -            if stream.get('guid') == video_id: -                video_info = stream - -        if video_info is None: -            raise ExtractorError( -                'Couldn\'t find video metadata, maybe this livestream is now offline', -                expected=True) +        for api_url in self._API_URLS: +            video_info = next(( +                stream for stream in self._download_json(api_url, video_id)['entries'] +                if stream.get('guid') == video_id), None) +            if video_info: +                break +        else: +            raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)          return {              '_type': 'url_transparent', | 
