diff options
author | James Taylor <user234683@users.noreply.github.com> | 2019-10-17 19:58:13 -0700 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2019-10-17 19:58:13 -0700 |
commit | 4c07546e7a5e5882abdda896009b744e947df1c4 (patch) | |
tree | 25870ecb94999df109895840810609e1d2167d96 /youtube_dl/extractor | |
parent | 9abb83fdbc05294f186daeefff8c85cfda06b7d2 (diff) | |
download | yt-local-4c07546e7a5e5882abdda896009b744e947df1c4.tar.lz yt-local-4c07546e7a5e5882abdda896009b744e947df1c4.tar.xz yt-local-4c07546e7a5e5882abdda896009b744e947df1c4.zip |
Extraction: Replace youtube-dl with custom-built watch page extraction
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 46 | ||||
-rw-r--r-- | youtube_dl/extractor/adobepass.py | 1567 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 2862 | ||||
-rw-r--r-- | youtube_dl/extractor/commonmistakes.py | 50 | ||||
-rw-r--r-- | youtube_dl/extractor/commonprotocols.py | 60 | ||||
-rw-r--r-- | youtube_dl/extractor/extractors.py | 31 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 3335 | ||||
-rw-r--r-- | youtube_dl/extractor/openload.py | 379 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 3264 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube_unmodified_reference.py | 3192 |
10 files changed, 0 insertions, 14786 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py deleted file mode 100644 index d5a4418..0000000 --- a/youtube_dl/extractor/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True -except ImportError: - _LAZY_LOADER = False - from .extractors import * - - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - #_ALL_CLASSES.append(GenericIE) - - -def gen_extractor_classes(): - """ Return a list of supported extractors. - The order does matter; the first extractor matched is the one handling the URL. - """ - return _ALL_CLASSES - - -def gen_extractors(): - """ Return a list of an instance of every supported extractor. - The order does matter; the first extractor matched is the one handling the URL. - """ - return [klass() for klass in gen_extractor_classes()] - - -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. - """ - - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) - - -def get_info_extractor(ie_name): - """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py deleted file mode 100644 index b83b51e..0000000 --- a/youtube_dl/extractor/adobepass.py +++ /dev/null @@ -1,1567 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time -import xml.etree.ElementTree as etree - -from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, -) -from ..utils import ( - unescapeHTML, - urlencode_postdata, - unified_timestamp, - ExtractorError, - NO_DEFAULT, -) - - -MSO_INFO = { - 'DTV': { - 'name': 'DIRECTV', - 'username_field': 'username', - 'password_field': 'password', - }, - 'ATTOTT': { - 'name': 'DIRECTV NOW', - 'username_field': 'email', - 'password_field': 'loginpassword', - }, - 'Rogers': { - 'name': 'Rogers', - 'username_field': 'UserName', - 'password_field': 'UserPassword', - }, - 'Comcast_SSO': { - 'name': 'Comcast XFINITY', - 'username_field': 'user', - 'password_field': 'passwd', - }, - 'TWC': { - 'name': 'Time Warner Cable | Spectrum', - 'username_field': 'Ecom_User_ID', - 'password_field': 'Ecom_Password', - }, - 'Brighthouse': { - 'name': 'Bright House Networks | Spectrum', - 'username_field': 'j_username', - 'password_field': 'j_password', - }, - 'Charter_Direct': { - 'name': 'Charter Spectrum', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'Verizon': { - 'name': 'Verizon FiOS', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'thr030': { - 'name': '3 Rivers Communications' - }, - 'com140': { - 'name': 'Access Montana' - }, - 'acecommunications': { - 'name': 'AcenTek' - }, - 'acm010': { - 'name': 'Acme Communications' - }, - 'ada020': { - 'name': 'Adams Cable Service' - }, - 'alb020': { - 'name': 'Albany Mutual Telephone' - }, - 'algona': { - 'name': 'Algona Municipal Utilities' - }, - 'allwest': { - 'name': 'All West Communications' - }, - 'all025': { - 'name': 'Allen\'s Communications' - }, - 'spl010': { - 'name': 'Alliance Communications' - }, - 'all070': { - 'name': 'ALLO Communications' - }, - 'alpine': { - 'name': 'Alpine Communications' - }, - 'hun015': { - 'name': 'American Broadband' - }, - 'nwc010': { - 'name': 'American Broadband Missouri' - }, - 'com130-02': { - 'name': 'American Community Networks' - }, - 'com130-01': { - 'name': 'American Warrior Networks' - }, - 'tom020': { - 'name': 'Amherst Telephone/Tomorrow Valley' - }, - 'tvc020': { - 'name': 'Andycable' - }, - 'arkwest': { - 'name': 'Arkwest Communications' - }, - 'art030': { - 'name': 'Arthur Mutual Telephone Company' - }, - 'arvig': { - 'name': 'Arvig' - }, - 'nttcash010': { - 'name': 'Ashland Home Net' - }, - 'astound': { - 'name': 'Astound (now Wave)' - }, - 'dix030': { - 'name': 'ATC Broadband' - }, - 'ara010': { - 'name': 'ATC Communications' - }, - 'she030-02': { - 'name': 'Ayersville Communications' - }, - 'baldwin': { - 'name': 'Baldwin Lightstream' - }, - 'bal040': { - 'name': 'Ballard TV' - }, - 'cit025': { - 'name': 'Bardstown Cable TV' - }, - 'bay030': { - 'name': 'Bay Country Communications' - }, - 'tel095': { - 'name': 'Beaver Creek Cooperative Telephone' - }, - 'bea020': { - 'name': 'Beaver Valley Cable' - }, - 'bee010': { - 'name': 'Bee Line Cable' - }, - 'wir030': { - 'name': 'Beehive Broadband' - }, - 'bra020': { - 'name': 'BELD' - }, - 'bel020': { - 'name': 'Bellevue Municipal Cable' - }, - 'vol040-01': { - 'name': 'Ben Lomand Connect / BLTV' - }, - 'bev010': { - 'name': 'BEVCOMM' - }, - 'big020': { - 'name': 'Big Sandy Broadband' - }, - 'ble020': { - 'name': 'Bledsoe Telephone Cooperative' - }, - 'bvt010': { - 'name': 'Blue Valley Tele-Communications' - }, - 'bra050': { - 'name': 'Brandenburg Telephone Co.' - }, - 'bte010': { - 'name': 'Bristol Tennessee Essential Services' - }, - 'annearundel': { - 'name': 'Broadstripe' - }, - 'btc010': { - 'name': 'BTC Communications' - }, - 'btc040': { - 'name': 'BTC Vision - Nahunta' - }, - 'bul010': { - 'name': 'Bulloch Telephone Cooperative' - }, - 'but010': { - 'name': 'Butler-Bremer Communications' - }, - 'tel160-csp': { - 'name': 'C Spire SNAP' - }, - 'csicable': { - 'name': 'Cable Services Inc.' - }, - 'cableamerica': { - 'name': 'CableAmerica' - }, - 'cab038': { - 'name': 'CableSouth Media 3' - }, - 'weh010-camtel': { - 'name': 'Cam-Tel Company' - }, - 'car030': { - 'name': 'Cameron Communications' - }, - 'canbytel': { - 'name': 'Canby Telcom' - }, - 'crt020': { - 'name': 'CapRock Tv' - }, - 'car050': { - 'name': 'Carnegie Cable' - }, - 'cas': { - 'name': 'CAS Cable' - }, - 'casscomm': { - 'name': 'CASSCOMM' - }, - 'mid180-02': { - 'name': 'Catalina Broadband Solutions' - }, - 'cccomm': { - 'name': 'CC Communications' - }, - 'nttccde010': { - 'name': 'CDE Lightband' - }, - 'cfunet': { - 'name': 'Cedar Falls Utilities' - }, - 'dem010-01': { - 'name': 'Celect-Bloomer Telephone Area' - }, - 'dem010-02': { - 'name': 'Celect-Bruce Telephone Area' - }, - 'dem010-03': { - 'name': 'Celect-Citizens Connected Area' - }, - 'dem010-04': { - 'name': 'Celect-Elmwood/Spring Valley Area' - }, - 'dem010-06': { - 'name': 'Celect-Mosaic Telecom' - }, - 'dem010-05': { - 'name': 'Celect-West WI Telephone Area' - }, - 'net010-02': { - 'name': 'Cellcom/Nsight Telservices' - }, - 'cen100': { - 'name': 'CentraCom' - }, - 'nttccst010': { - 'name': 'Central Scott / CSTV' - }, - 'cha035': { - 'name': 'Chaparral CableVision' - }, - 'cha050': { - 'name': 'Chariton Valley Communication Corporation, Inc.' - }, - 'cha060': { - 'name': 'Chatmoss Cablevision' - }, - 'nttcche010': { - 'name': 'Cherokee Communications' - }, - 'che050': { - 'name': 'Chesapeake Bay Communications' - }, - 'cimtel': { - 'name': 'Cim-Tel Cable, LLC.' - }, - 'cit180': { - 'name': 'Citizens Cablevision - Floyd, VA' - }, - 'cit210': { - 'name': 'Citizens Cablevision, Inc.' - }, - 'cit040': { - 'name': 'Citizens Fiber' - }, - 'cit250': { - 'name': 'Citizens Mutual' - }, - 'war040': { - 'name': 'Citizens Telephone Corporation' - }, - 'wat025': { - 'name': 'City Of Monroe' - }, - 'wadsworth': { - 'name': 'CityLink' - }, - 'nor100': { - 'name': 'CL Tel' - }, - 'cla010': { - 'name': 'Clarence Telephone and Cedar Communications' - }, - 'ser060': { - 'name': 'Clear Choice Communications' - }, - 'tac020': { - 'name': 'Click! Cable TV' - }, - 'war020': { - 'name': 'CLICK1.NET' - }, - 'cml010': { - 'name': 'CML Telephone Cooperative Association' - }, - 'cns': { - 'name': 'CNS' - }, - 'com160': { - 'name': 'Co-Mo Connect' - }, - 'coa020': { - 'name': 'Coast Communications' - }, - 'coa030': { - 'name': 'Coaxial Cable TV' - }, - 'mid055': { - 'name': 'Cobalt TV (Mid-State Community TV)' - }, - 'col070': { - 'name': 'Columbia Power & Water Systems' - }, - 'col080': { - 'name': 'Columbus Telephone' - }, - 'nor105': { - 'name': 'Communications 1 Cablevision, Inc.' - }, - 'com150': { - 'name': 'Community Cable & Broadband' - }, - 'com020': { - 'name': 'Community Communications Company' - }, - 'coy010': { - 'name': 'commZoom' - }, - 'com025': { - 'name': 'Complete Communication Services' - }, - 'cat020': { - 'name': 'Comporium' - }, - 'com071': { - 'name': 'ComSouth Telesys' - }, - 'consolidatedcable': { - 'name': 'Consolidated' - }, - 'conwaycorp': { - 'name': 'Conway Corporation' - }, - 'coo050': { - 'name': 'Coon Valley Telecommunications Inc' - }, - 'coo080': { - 'name': 'Cooperative Telephone Company' - }, - 'cpt010': { - 'name': 'CP-TEL' - }, - 'cra010': { - 'name': 'Craw-Kan Telephone' - }, - 'crestview': { - 'name': 'Crestview Cable Communications' - }, - 'cross': { - 'name': 'Cross TV' - }, - 'cro030': { - 'name': 'Crosslake Communications' - }, - 'ctc040': { - 'name': 'CTC - Brainerd MN' - }, - 'phe030': { - 'name': 'CTV-Beam - East Alabama' - }, - 'cun010': { - 'name': 'Cunningham Telephone & Cable' - }, - 'dpc010': { - 'name': 'D & P Communications' - }, - 'dak030': { - 'name': 'Dakota Central Telecommunications' - }, - 'nttcdel010': { - 'name': 'Delcambre Telephone LLC' - }, - 'tel160-del': { - 'name': 'Delta Telephone Company' - }, - 'sal040': { - 'name': 'DiamondNet' - }, - 'ind060-dc': { - 'name': 'Direct Communications' - }, - 'doy010': { - 'name': 'Doylestown Cable TV' - }, - 'dic010': { - 'name': 'DRN' - }, - 'dtc020': { - 'name': 'DTC' - }, - 'dtc010': { - 'name': 'DTC Cable (Delhi)' - }, - 'dum010': { - 'name': 'Dumont Telephone Company' - }, - 'dun010': { - 'name': 'Dunkerton Telephone Cooperative' - }, - 'cci010': { - 'name': 'Duo County Telecom' - }, - 'eagle': { - 'name': 'Eagle Communications' - }, - 'weh010-east': { - 'name': 'East Arkansas Cable TV' - }, - 'eatel': { - 'name': 'EATEL Video, LLC' - }, - 'ell010': { - 'name': 'ECTA' - }, - 'emerytelcom': { - 'name': 'Emery Telcom Video LLC' - }, - 'nor200': { - 'name': 'Empire Access' - }, - 'endeavor': { - 'name': 'Endeavor Communications' - }, - 'sun045': { - 'name': 'Enhanced Telecommunications Corporation' - }, - 'mid030': { - 'name': 'enTouch' - }, - 'epb020': { - 'name': 'EPB Smartnet' - }, - 'jea010': { - 'name': 'EPlus Broadband' - }, - 'com065': { - 'name': 'ETC' - }, - 'ete010': { - 'name': 'Etex Communications' - }, - 'fbc-tele': { - 'name': 'F&B Communications' - }, - 'fal010': { - 'name': 'Falcon Broadband' - }, - 'fam010': { - 'name': 'FamilyView CableVision' - }, - 'far020': { - 'name': 'Farmers Mutual Telephone Company' - }, - 'fay010': { - 'name': 'Fayetteville Public Utilities' - }, - 'sal060': { - 'name': 'fibrant' - }, - 'fid010': { - 'name': 'Fidelity Communications' - }, - 'for030': { - 'name': 'FJ Communications' - }, - 'fli020': { - 'name': 'Flint River Communications' - }, - 'far030': { - 'name': 'FMT - Jesup' - }, - 'foo010': { - 'name': 'Foothills Communications' - }, - 'for080': { - 'name': 'Forsyth CableNet' - }, - 'fbcomm': { - 'name': 'Frankfort Plant Board' - }, - 'tel160-fra': { - 'name': 'Franklin Telephone Company' - }, - 'nttcftc010': { - 'name': 'FTC' - }, - 'fullchannel': { - 'name': 'Full Channel, Inc.' - }, - 'gar040': { - 'name': 'Gardonville Cooperative Telephone Association' - }, - 'gbt010': { - 'name': 'GBT Communications, Inc.' - }, - 'tec010': { - 'name': 'Genuine Telecom' - }, - 'clr010': { - 'name': 'Giant Communications' - }, - 'gla010': { - 'name': 'Glasgow EPB' - }, - 'gle010': { - 'name': 'Glenwood Telecommunications' - }, - 'gra060': { - 'name': 'GLW Broadband Inc.' - }, - 'goldenwest': { - 'name': 'Golden West Cablevision' - }, - 'vis030': { - 'name': 'Grantsburg Telcom' - }, - 'gpcom': { - 'name': 'Great Plains Communications' - }, - 'gri010': { - 'name': 'Gridley Cable Inc' - }, - 'hbc010': { - 'name': 'H&B Cable Services' - }, - 'hae010': { - 'name': 'Haefele TV Inc.' - }, - 'htc010': { - 'name': 'Halstad Telephone Company' - }, - 'har005': { - 'name': 'Harlan Municipal Utilities' - }, - 'har020': { - 'name': 'Hart Communications' - }, - 'ced010': { - 'name': 'Hartelco TV' - }, - 'hea040': { - 'name': 'Heart of Iowa Communications Cooperative' - }, - 'htc020': { - 'name': 'Hickory Telephone Company' - }, - 'nttchig010': { - 'name': 'Highland Communication Services' - }, - 'hig030': { - 'name': 'Highland Media' - }, - 'spc010': { - 'name': 'Hilliary Communications' - }, - 'hin020': { - 'name': 'Hinton CATV Co.' - }, - 'hometel': { - 'name': 'HomeTel Entertainment, Inc.' - }, - 'hoodcanal': { - 'name': 'Hood Canal Communications' - }, - 'weh010-hope': { - 'name': 'Hope - Prescott Cable TV' - }, - 'horizoncable': { - 'name': 'Horizon Cable TV, Inc.' - }, - 'hor040': { - 'name': 'Horizon Chillicothe Telephone' - }, - 'htc030': { - 'name': 'HTC Communications Co. - IL' - }, - 'htccomm': { - 'name': 'HTC Communications, Inc. - IA' - }, - 'wal005': { - 'name': 'Huxley Communications' - }, - 'imon': { - 'name': 'ImOn Communications' - }, - 'ind040': { - 'name': 'Independence Telecommunications' - }, - 'rrc010': { - 'name': 'Inland Networks' - }, - 'stc020': { - 'name': 'Innovative Cable TV St Croix' - }, - 'car100': { - 'name': 'Innovative Cable TV St Thomas-St John' - }, - 'icc010': { - 'name': 'Inside Connect Cable' - }, - 'int100': { - 'name': 'Integra Telecom' - }, - 'int050': { - 'name': 'Interstate Telecommunications Coop' - }, - 'irv010': { - 'name': 'Irvine Cable' - }, - 'k2c010': { - 'name': 'K2 Communications' - }, - 'kal010': { - 'name': 'Kalida Telephone Company, Inc.' - }, - 'kal030': { - 'name': 'Kalona Cooperative Telephone Company' - }, - 'kmt010': { - 'name': 'KMTelecom' - }, - 'kpu010': { - 'name': 'KPU Telecommunications' - }, - 'kuh010': { - 'name': 'Kuhn Communications, Inc.' - }, - 'lak130': { - 'name': 'Lakeland Communications' - }, - 'lan010': { - 'name': 'Langco' - }, - 'lau020': { - 'name': 'Laurel Highland Total Communications, Inc.' - }, - 'leh010': { - 'name': 'Lehigh Valley Cooperative Telephone' - }, - 'bra010': { - 'name': 'Limestone Cable/Bracken Cable' - }, - 'loc020': { - 'name': 'LISCO' - }, - 'lit020': { - 'name': 'Litestream' - }, - 'tel140': { - 'name': 'LivCom' - }, - 'loc010': { - 'name': 'LocalTel Communications' - }, - 'weh010-longview': { - 'name': 'Longview - Kilgore Cable TV' - }, - 'lon030': { - 'name': 'Lonsdale Video Ventures, LLC' - }, - 'lns010': { - 'name': 'Lost Nation-Elwood Telephone Co.' - }, - 'nttclpc010': { - 'name': 'LPC Connect' - }, - 'lumos': { - 'name': 'Lumos Networks' - }, - 'madison': { - 'name': 'Madison Communications' - }, - 'mad030': { - 'name': 'Madison County Cable Inc.' - }, - 'nttcmah010': { - 'name': 'Mahaska Communication Group' - }, - 'mar010': { - 'name': 'Marne & Elk Horn Telephone Company' - }, - 'mcc040': { - 'name': 'McClure Telephone Co.' - }, - 'mctv': { - 'name': 'MCTV' - }, - 'merrimac': { - 'name': 'Merrimac Communications Ltd.' - }, - 'metronet': { - 'name': 'Metronet' - }, - 'mhtc': { - 'name': 'MHTC' - }, - 'midhudson': { - 'name': 'Mid-Hudson Cable' - }, - 'midrivers': { - 'name': 'Mid-Rivers Communications' - }, - 'mid045': { - 'name': 'Midstate Communications' - }, - 'mil080': { - 'name': 'Milford Communications' - }, - 'min030': { - 'name': 'MINET' - }, - 'nttcmin010': { - 'name': 'Minford TV' - }, - 'san040-02': { - 'name': 'Mitchell Telecom' - }, - 'mlg010': { - 'name': 'MLGC' - }, - 'mon060': { - 'name': 'Mon-Cre TVE' - }, - 'mou110': { - 'name': 'Mountain Telephone' - }, - 'mou050': { - 'name': 'Mountain Village Cable' - }, - 'mtacomm': { - 'name': 'MTA Communications, LLC' - }, - 'mtc010': { - 'name': 'MTC Cable' - }, - 'med040': { - 'name': 'MTC Technologies' - }, - 'man060': { - 'name': 'MTCC' - }, - 'mtc030': { - 'name': 'MTCO Communications' - }, - 'mul050': { - 'name': 'Mulberry Telecommunications' - }, - 'mur010': { - 'name': 'Murray Electric System' - }, - 'musfiber': { - 'name': 'MUS FiberNET' - }, - 'mpw': { - 'name': 'Muscatine Power & Water' - }, - 'nttcsli010': { - 'name': 'myEVTV.com' - }, - 'nor115': { - 'name': 'NCC' - }, - 'nor260': { - 'name': 'NDTC' - }, - 'nctc': { - 'name': 'Nebraska Central Telecom, Inc.' - }, - 'nel020': { - 'name': 'Nelsonville TV Cable' - }, - 'nem010': { - 'name': 'Nemont' - }, - 'new075': { - 'name': 'New Hope Telephone Cooperative' - }, - 'nor240': { - 'name': 'NICP' - }, - 'cic010': { - 'name': 'NineStar Connect' - }, - 'nktelco': { - 'name': 'NKTelco' - }, - 'nortex': { - 'name': 'Nortex Communications' - }, - 'nor140': { - 'name': 'North Central Telephone Cooperative' - }, - 'nor030': { - 'name': 'Northland Communications' - }, - 'nor075': { - 'name': 'Northwest Communications' - }, - 'nor125': { - 'name': 'Norwood Light Broadband' - }, - 'net010': { - 'name': 'Nsight Telservices' - }, - 'dur010': { - 'name': 'Ntec' - }, - 'nts010': { - 'name': 'NTS Communications' - }, - 'new045': { - 'name': 'NU-Telecom' - }, - 'nulink': { - 'name': 'NuLink' - }, - 'jam030': { - 'name': 'NVC' - }, - 'far035': { - 'name': 'OmniTel Communications' - }, - 'onesource': { - 'name': 'OneSource Communications' - }, - 'cit230': { - 'name': 'Opelika Power Services' - }, - 'daltonutilities': { - 'name': 'OptiLink' - }, - 'mid140': { - 'name': 'OPTURA' - }, - 'ote010': { - 'name': 'OTEC Communication Company' - }, - 'cci020': { - 'name': 'Packerland Broadband' - }, - 'pan010': { - 'name': 'Panora Telco/Guthrie Center Communications' - }, - 'otter': { - 'name': 'Park Region Telephone & Otter Tail Telcom' - }, - 'mid050': { - 'name': 'Partner Communications Cooperative' - }, - 'fib010': { - 'name': 'Pathway' - }, - 'paulbunyan': { - 'name': 'Paul Bunyan Communications' - }, - 'pem020': { - 'name': 'Pembroke Telephone Company' - }, - 'mck010': { - 'name': 'Peoples Rural Telephone Cooperative' - }, - 'pul010': { - 'name': 'PES Energize' - }, - 'phi010': { - 'name': 'Philippi Communications System' - }, - 'phonoscope': { - 'name': 'Phonoscope Cable' - }, - 'pin070': { - 'name': 'Pine Belt Communications, Inc.' - }, - 'weh010-pine': { - 'name': 'Pine Bluff Cable TV' - }, - 'pin060': { - 'name': 'Pineland Telephone Cooperative' - }, - 'cam010': { - 'name': 'Pinpoint Communications' - }, - 'pio060': { - 'name': 'Pioneer Broadband' - }, - 'pioncomm': { - 'name': 'Pioneer Communications' - }, - 'pioneer': { - 'name': 'Pioneer DTV' - }, - 'pla020': { - 'name': 'Plant TiftNet, Inc.' - }, - 'par010': { - 'name': 'PLWC' - }, - 'pro035': { - 'name': 'PMT' - }, - 'vik011': { - 'name': 'Polar Cablevision' - }, - 'pottawatomie': { - 'name': 'Pottawatomie Telephone Co.' - }, - 'premiercomm': { - 'name': 'Premier Communications' - }, - 'psc010': { - 'name': 'PSC' - }, - 'pan020': { - 'name': 'PTCI' - }, - 'qco010': { - 'name': 'QCOL' - }, - 'qua010': { - 'name': 'Quality Cablevision' - }, - 'rad010': { - 'name': 'Radcliffe Telephone Company' - }, - 'car040': { - 'name': 'Rainbow Communications' - }, - 'rai030': { - 'name': 'Rainier Connect' - }, - 'ral010': { - 'name': 'Ralls Technologies' - }, - 'rct010': { - 'name': 'RC Technologies' - }, - 'red040': { - 'name': 'Red River Communications' - }, - 'ree010': { - 'name': 'Reedsburg Utility Commission' - }, - 'mol010': { - 'name': 'Reliance Connects- Oregon' - }, - 'res020': { - 'name': 'Reserve Telecommunications' - }, - 'weh010-resort': { - 'name': 'Resort TV Cable' - }, - 'rld010': { - 'name': 'Richland Grant Telephone Cooperative, Inc.' - }, - 'riv030': { - 'name': 'River Valley Telecommunications Coop' - }, - 'rockportcable': { - 'name': 'Rock Port Cablevision' - }, - 'rsf010': { - 'name': 'RS Fiber' - }, - 'rtc': { - 'name': 'RTC Communication Corp' - }, - 'res040': { - 'name': 'RTC-Reservation Telephone Coop.' - }, - 'rte010': { - 'name': 'RTEC Communications' - }, - 'stc010': { - 'name': 'S&T' - }, - 'san020': { - 'name': 'San Bruno Cable TV' - }, - 'san040-01': { - 'name': 'Santel' - }, - 'sav010': { - 'name': 'SCI Broadband-Savage Communications Inc.' - }, - 'sco050': { - 'name': 'Scottsboro Electric Power Board' - }, - 'scr010': { - 'name': 'Scranton Telephone Company' - }, - 'selco': { - 'name': 'SELCO' - }, - 'she010': { - 'name': 'Shentel' - }, - 'she030': { - 'name': 'Sherwood Mutual Telephone Association, Inc.' - }, - 'ind060-ssc': { - 'name': 'Silver Star Communications' - }, - 'sjoberg': { - 'name': 'Sjoberg\'s Inc.' - }, - 'sou025': { - 'name': 'SKT' - }, - 'sky050': { - 'name': 'SkyBest TV' - }, - 'nttcsmi010': { - 'name': 'Smithville Communications' - }, - 'woo010': { - 'name': 'Solarus' - }, - 'sou075': { - 'name': 'South Central Rural Telephone Cooperative' - }, - 'sou065': { - 'name': 'South Holt Cablevision, Inc.' - }, - 'sou035': { - 'name': 'South Slope Cooperative Communications' - }, - 'spa020': { - 'name': 'Spanish Fork Community Network' - }, - 'spe010': { - 'name': 'Spencer Municipal Utilities' - }, - 'spi005': { - 'name': 'Spillway Communications, Inc.' - }, - 'srt010': { - 'name': 'SRT' - }, - 'cccsmc010': { - 'name': 'St. Maarten Cable TV' - }, - 'sta025': { - 'name': 'Star Communications' - }, - 'sco020': { - 'name': 'STE' - }, - 'uin010': { - 'name': 'STRATA Networks' - }, - 'sum010': { - 'name': 'Sumner Cable TV' - }, - 'pie010': { - 'name': 'Surry TV/PCSI TV' - }, - 'swa010': { - 'name': 'Swayzee Communications' - }, - 'sweetwater': { - 'name': 'Sweetwater Cable Television Co' - }, - 'weh010-talequah': { - 'name': 'Tahlequah Cable TV' - }, - 'tct': { - 'name': 'TCT' - }, - 'tel050': { - 'name': 'Tele-Media Company' - }, - 'com050': { - 'name': 'The Community Agency' - }, - 'thr020': { - 'name': 'Three River' - }, - 'cab140': { - 'name': 'Town & Country Technologies' - }, - 'tra010': { - 'name': 'Trans-Video' - }, - 'tre010': { - 'name': 'Trenton TV Cable Company' - }, - 'tcc': { - 'name': 'Tri County Communications Cooperative' - }, - 'tri025': { - 'name': 'TriCounty Telecom' - }, - 'tri110': { - 'name': 'TrioTel Communications, Inc.' - }, - 'tro010': { - 'name': 'Troy Cablevision, Inc.' - }, - 'tsc': { - 'name': 'TSC' - }, - 'cit220': { - 'name': 'Tullahoma Utilities Board' - }, - 'tvc030': { - 'name': 'TV Cable of Rensselaer' - }, - 'tvc015': { - 'name': 'TVC Cable' - }, - 'cab180': { - 'name': 'TVision' - }, - 'twi040': { - 'name': 'Twin Lakes' - }, - 'tvtinc': { - 'name': 'Twin Valley' - }, - 'uis010': { - 'name': 'Union Telephone Company' - }, - 'uni110': { - 'name': 'United Communications - TN' - }, - 'uni120': { - 'name': 'United Services' - }, - 'uss020': { - 'name': 'US Sonet' - }, - 'cab060': { - 'name': 'USA Communications' - }, - 'she005': { - 'name': 'USA Communications/Shellsburg, IA' - }, - 'val040': { - 'name': 'Valley TeleCom Group' - }, - 'val025': { - 'name': 'Valley Telecommunications' - }, - 'val030': { - 'name': 'Valparaiso Broadband' - }, - 'cla050': { - 'name': 'Vast Broadband' - }, - 'sul015': { - 'name': 'Venture Communications Cooperative, Inc.' - }, - 'ver025': { - 'name': 'Vernon Communications Co-op' - }, - 'weh010-vicksburg': { - 'name': 'Vicksburg Video' - }, - 'vis070': { - 'name': 'Vision Communications' - }, - 'volcanotel': { - 'name': 'Volcano Vision, Inc.' - }, - 'vol040-02': { - 'name': 'VolFirst / BLTV' - }, - 'ver070': { - 'name': 'VTel' - }, - 'nttcvtx010': { - 'name': 'VTX1' - }, - 'bci010-02': { - 'name': 'Vyve Broadband' - }, - 'wab020': { - 'name': 'Wabash Mutual Telephone' - }, - 'waitsfield': { - 'name': 'Waitsfield Cable' - }, - 'wal010': { - 'name': 'Walnut Communications' - }, - 'wavebroadband': { - 'name': 'Wave' - }, - 'wav030': { - 'name': 'Waverly Communications Utility' - }, - 'wbi010': { - 'name': 'WBI' - }, - 'web020': { - 'name': 'Webster-Calhoun Cooperative Telephone Association' - }, - 'wes005': { - 'name': 'West Alabama TV Cable' - }, - 'carolinata': { - 'name': 'West Carolina Communications' - }, - 'wct010': { - 'name': 'West Central Telephone Association' - }, - 'wes110': { - 'name': 'West River Cooperative Telephone Company' - }, - 'ani030': { - 'name': 'WesTel Systems' - }, - 'westianet': { - 'name': 'Western Iowa Networks' - }, - 'nttcwhi010': { - 'name': 'Whidbey Telecom' - }, - 'weh010-white': { - 'name': 'White County Cable TV' - }, - 'wes130': { - 'name': 'Wiatel' - }, - 'wik010': { - 'name': 'Wiktel' - }, - 'wil070': { - 'name': 'Wilkes Communications, Inc./RiverStreet Networks' - }, - 'wil015': { - 'name': 'Wilson Communications' - }, - 'win010': { - 'name': 'Windomnet/SMBS' - }, - 'win090': { - 'name': 'Windstream Cable TV' - }, - 'wcta': { - 'name': 'Winnebago Cooperative Telecom Association' - }, - 'wtc010': { - 'name': 'WTC' - }, - 'wil040': { - 'name': 'WTC Communications, Inc.' - }, - 'wya010': { - 'name': 'Wyandotte Cable' - }, - 'hin020-02': { - 'name': 'X-Stream Services' - }, - 'xit010': { - 'name': 'XIT Communications' - }, - 'yel010': { - 'name': 'Yelcot Communications' - }, - 'mid180-01': { - 'name': 'yondoo' - }, - 'cou060': { - 'name': 'Zito Media' - }, -} - - -class AdobePassIE(InfoExtractor): - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' - _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' - _MVPD_CACHE = 'ap-mvpd' - - _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' - - def _download_webpage_handle(self, *args, **kwargs): - headers = kwargs.get('headers', {}) - headers.update(self.geo_verification_headers()) - kwargs['headers'] = headers - return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - @staticmethod - def _get_mvpd_resource(provider_id, title, guid, rating): - channel = etree.Element('channel') - channel_title = etree.SubElement(channel, 'title') - channel_title.text = provider_id - item = etree.SubElement(channel, 'item') - resource_title = etree.SubElement(item, 'title') - resource_title.text = title - resource_guid = etree.SubElement(item, 'guid') - resource_guid.text = guid - resource_rating = etree.SubElement(item, 'media:rating') - resource_rating.attrib = {'scheme': 'urn:v-chip'} - resource_rating.text = rating - return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' - - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) - - def is_expired(token, date_ele): - token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) - return token_expires and token_expires <= int(time.time()) - - def post_form(form_page_res, note, data={}): - form_page, urlh = form_page_res - post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') - if not re.match(r'https?://', post_url): - post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) - form_data = self._hidden_inputs(form_page) - form_data.update(data) - return self._download_webpage_handle( - post_url, video_id, note, data=urlencode_postdata(form_data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - def raise_mvpd_required(): - raise ExtractorError( - 'This video is only available for users of participating TV providers. ' - 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' - 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) - - def extract_redirect_url(html, url=None, fatal=False): - # TODO: eliminate code duplication with generic extractor and move - # redirection code into _download_webpage_handle - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - redirect_url = self._search_regex( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - html, 'meta refresh redirect', - default=NO_DEFAULT if fatal else None, fatal=fatal) - if not redirect_url: - return None - if url: - redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url)) - return redirect_url - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': self._USER_AGENT, - 'User-Agent': self._USER_AGENT, - } - - guid = xml_text(resource, 'guid') if '<' in resource else resource - count = 0 - while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token and is_expired(authn_token, 'simpleTokenExpires'): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = self._downloader.params.get('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] - - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - - if mso_id == 'Comcast_SSO': - # Comcast page flow varies by video site and whether you - # are on Comcast's network. - provider_redirect_page, urlh = provider_redirect_page_res - if 'automatically signing you in' in provider_redirect_page: - oauth_redirect_url = self._html_search_regex( - r'window\.location\s*=\s*[\'"]([^\'"]+)', - provider_redirect_page, 'oauth redirect') - self._download_webpage( - oauth_redirect_url, video_id, 'Confirming auto login') - else: - if '<form name="signin"' in provider_redirect_page: - provider_login_page_res = provider_redirect_page_res - elif 'http-equiv="refresh"' in provider_redirect_page: - oauth_redirect_url = extract_redirect_url( - provider_redirect_page, fatal=True) - provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, video_id, - self._DOWNLOADING_LOGIN_PAGE) - else: - provider_login_page_res = post_form( - provider_redirect_page_res, - self._DOWNLOADING_LOGIN_PAGE) - - mvpd_confirm_page_res = post_form( - provider_login_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - mvpd_confirm_page, urlh = mvpd_confirm_page_res - if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page: - post_form(mvpd_confirm_page_res, 'Confirming Login') - elif mso_id == 'Verizon': - # In general, if you're connecting from a Verizon-assigned IP, - # you will not actually pass your credentials. - provider_redirect_page, urlh = provider_redirect_page_res - if 'Please wait ...' in provider_redirect_page: - saml_redirect_url = self._html_search_regex( - r'self\.parent\.location=(["\'])(?P<url>.+?)\1', - provider_redirect_page, - 'SAML Redirect URL', group='url') - saml_login_page = self._download_webpage( - saml_redirect_url, video_id, - 'Downloading SAML Login Page') - else: - saml_login_page_res = post_form( - provider_redirect_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - saml_login_page, urlh = saml_login_page_res - if 'Please try again.' in saml_login_page: - raise ExtractorError( - 'We\'re sorry, but either the User ID or Password entered is not correct.') - saml_login_url = self._search_regex( - r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1', - saml_login_page, 'SAML Login URL', group='url') - saml_response_json = self._download_json( - saml_login_url, video_id, 'Downloading SAML Response', - headers={'Content-Type': 'text/xml'}) - self._download_webpage( - saml_response_json['targetValue'], video_id, - 'Confirming Login', data=urlencode_postdata({ - 'SAMLResponse': saml_response_json['SAMLResponse'], - 'RelayState': saml_response_json['RelayState'] - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded' - }) - else: - # Some providers (e.g. DIRECTV NOW) have another meta refresh - # based redirect that should be followed. - provider_redirect_page, urlh = provider_redirect_page_res - provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) - if provider_refresh_redirect_url: - provider_redirect_page_res = self._download_webpage_handle( - provider_refresh_redirect_url, video_id, - 'Downloading Provider Redirect Page (meta refresh)') - provider_login_page_res = post_form( - provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { - mso_info.get('username_field', 'username'): username, - mso_info.get('password_field', 'password'): password, - }) - if mso_id != 'Rogers': - post_form(mvpd_confirm_page_res, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - if '<pendingLogout' in session: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if authz_token and is_expired(authz_token, 'simpleTokenTTL'): - authz_token = None - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - if '<pendingLogout' in authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - if '<error' in authorize: - raise ExtractorError(xml_text(authorize, 'details'), expected=True) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - short_authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - if '<pendingLogout' in short_authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - return short_authorize diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py deleted file mode 100644 index 5d4db54..0000000 --- a/youtube_dl/extractor/common.py +++ /dev/null @@ -1,2862 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import datetime -import hashlib -import json -import netrc -import os -import random -import re -import socket -import sys -import time -import math - -from ..compat import ( - compat_cookiejar, - compat_cookies, - compat_etree_fromstring, - compat_getpass, - compat_integer_types, - compat_http_client, - compat_os_name, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, - compat_xml_parse_error, -) -from ..downloader.f4m import ( - get_base_url, - remove_encrypted_media, -) -from ..utils import ( - NO_DEFAULT, - age_restricted, - base_url, - bug_reports_message, - clean_html, - compiled_regex_type, - determine_ext, - determine_protocol, - error_to_compat_str, - ExtractorError, - extract_attributes, - fix_xml_ampersands, - float_or_none, - GeoRestrictedError, - GeoUtils, - int_or_none, - js_to_json, - JSON_LD_RE, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - parse_iso8601, - parse_m3u8_attributes, - RegexNotFoundError, - sanitized_Request, - sanitize_filename, - unescapeHTML, - unified_strdate, - unified_timestamp, - update_Request, - update_url_query, - urljoin, - url_basename, - xpath_element, - xpath_text, - xpath_with_ns, -) - - -class InfoExtractor(object): - """Information Extractor class. - - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the YoutubeDL. The YoutubeDL processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The type field determines the type of the result. - By far the most common value (and the default if _type is missing) is - "video", which indicates a single video. - - For a video, the dictionaries must include the following fields: - - id: Video identifier. - title: Video title, unescaped. - - Additionally, it must contain either a formats entry or a url one: - - formats: A list of dictionaries for each format available, ordered - from worst to best quality. - - Potential fields: - * url Mandatory. The URL of the video file - * manifest_url - The URL of the manifest file in case of - fragmented media (DASH, hls, hds) - * ext Will be calculated from URL if missing - * format A human-readable description of the format - ("mp4 container with h264/opus"). - Calculated from the format_id, width, height. - and format_note fields if missing. - * format_id A short description of the format - ("mp4_h264_opus" or "19"). - Technically optional, but strongly recommended. - * format_note Additional info about the format - ("3D" or "DASH video") - * width Width of the video, if known - * height Height of the video, if known - * resolution Textual description of width and height - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s - * acodec Name of the audio codec in use - * asr Audio sampling rate in Hertz - * vbr Average video bitrate in KBit/s - * fps Frame rate - * vcodec Name of the video codec in use - * container Name of the container format - * filesize The number of bytes, if known in advance - * filesize_approx An estimate for the number of bytes - * player_url SWF Player URL (used for rtmpdump). - * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". - * fragment_base_url - Base URL for fragments. Each fragment's path - value (if present) will be relative to - this URL. - * fragments A list of fragments of a fragmented media. - Each fragment entry must contain either an url - or a path. If an url is present it should be - considered by a client. Otherwise both path and - fragment_base_url must be present. Here is - the list of all potential fields: - * "url" - fragment's URL - * "path" - fragment's path relative to - fragment_base_url - * "duration" (optional, int or float) - * "filesize" (optional, int) - * preference Order number of this format. If this field is - present and not None, the formats get sorted - by this field, regardless of all other values. - -1 for default (order by other properties), - -2 or smaller for less than default. - < -1000 to hide the format (if there is - another one which is strictly better) - * language Language code, e.g. "de" or "en-US". - * language_preference Is this in the language mentioned in - the URL? - 10 if it's what the URL is about, - -1 for default (don't know), - -10 otherwise, other values reserved for now. - * quality Order number of the video quality of this - format, irrespective of the file format. - -1 for default (order by other properties), - -2 or smaller for less than default. - * source_preference Order number for this video source - (quality takes higher priority) - -1 for default (order by other properties), - -2 or smaller for less than default. - * http_headers A dictionary of additional HTTP headers - to add to the request. - * stretched_ratio If given and not 1, indicates that the - video's pixels are not square. - width : height ratio as float. - * no_resume The server does not support resuming the - (HTTP or RTMP) download. Boolean. - * downloader_options A dictionary of downloader options as - described in FileDownloader - - url: Final video URL. - ext: Video filename extension. - format: The video format, defaults to ext (used for --get-format) - player_url: SWF Player URL (used for rtmpdump). - - The following fields are optional: - - alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily - unique, but available before title. Typically, id is - something like "4234987", title "Dancing naked mole rats", - and display_id "dancing-naked-mole-rats" - thumbnails: A list of dictionaries, with the following entries: - * "id" (optional, string) - Thumbnail format ID - * "url" - * "preference" (optional, int) - quality of the image - * "width" (optional, int) - * "height" (optional, int) - * "resolution" (optional, string "{width}x{height"}, - deprecated) - * "filesize" (optional, int) - thumbnail: Full URL to a video thumbnail image. - description: Full video description. - uploader: Full name of the video uploader. - license: License name the video is licensed under. - creator: The creator of the video. - release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. - uploader_id: Nickname or id of the video uploader. - uploader_url: Full URL to a personal webpage of the video uploader. - location: Physical location where the video was filmed. - subtitles: The available subtitles as a dictionary in the format - {tag: subformats}. "tag" is usually a language code, and - "subformats" is a list sorted from lower to higher - preference, each element is a dictionary with the "ext" - entry and one of: - * "data": The subtitles file contents - * "url": A URL pointing to the subtitles file - "ext" will be calculated from URL if missing - automatic_captions: Like 'subtitles', used by the YoutubeIE for - automatically generated captions - duration: Length of the video in seconds, as an integer or float. - view_count: How many users have watched the video on the platform. - like_count: Number of positive ratings of the video - dislike_count: Number of negative ratings of the video - repost_count: Number of reposts of the video - average_rating: Average rating give by users, the scale used depends on the webpage - comment_count: Number of comments on the video - comments: A list of comments, each with one or more of the following - properties (all but one of text or html optional): - * "author" - human-readable name of the comment author - * "author_id" - user ID of the comment author - * "id" - Comment ID - * "html" - Comment as HTML - * "text" - Plain text of the comment - * "timestamp" - UNIX timestamp of comment - * "parent" - ID of the comment this one is replying to. - Set to "root" to indicate that this is a - comment to the original video. - age_limit: Age restriction for the video, as an integer (years) - webpage_url: The URL to the video webpage, if given to youtube-dl it - should allow to get the same result again. (It will be set - by YoutubeDL if it's missing) - categories: A list of categories that the video falls in, for example - ["Sports", "Berlin"] - tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] - is_live: True, False, or None (=unknown). Whether this video is a - live stream that goes on instead of a fixed-length video. - start_time: Time in seconds where the reproduction should start, as - specified in the URL. - end_time: Time in seconds where the reproduction should end, as - specified in the URL. - chapters: A list of dictionaries, with the following entries: - * "start_time" - The start time of the chapter in seconds - * "end_time" - The end time of the chapter in seconds - * "title" (optional, string) - - The following fields should only be used when the video belongs to some logical - chapter or section: - - chapter: Name or title of the chapter the video belongs to. - chapter_number: Number of the chapter the video belongs to, as an integer. - chapter_id: Id of the chapter the video belongs to, as a unicode string. - - The following fields should only be used when the video is an episode of some - series, programme or podcast: - - series: Title of the series or programme the video episode belongs to. - season: Title of the season the video episode belongs to. - season_number: Number of the season the video episode belongs to, as an integer. - season_id: Id of the season the video episode belongs to, as a unicode string. - episode: Title of the video episode. Unlike mandatory video title field, - this field should denote the exact title of the video episode - without any kind of decoration. - episode_number: Number of the video episode within a season, as an integer. - episode_id: Id of the video episode, as a unicode string. - - The following fields should only be used when the media is a track or a part of - a music album: - - track: Title of the track. - track_number: Number of the track within an album or a disc, as an integer. - track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), - as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. - album: Title of the album the track belongs to. - album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). - disc_number: Number of the disc or other physical medium the track belongs to, - as an integer. - release_year: Year (YYYY) when the album was released. - - Unless mentioned otherwise, the fields should be Unicode strings. - - Unless mentioned otherwise, None is equivalent to absence of information. - - - _type "playlist" indicates multiple videos. - There must be a key "entries", which is a list, an iterable, or a PagedList - object, each element of which is a valid dictionary by this specification. - - Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url" attributes with the same semantics as videos - (see above). - - - _type "multi_video" indicates that there are multiple videos that - form a single show, for examples multiple acts of an opera or TV episode. - It must have an entries key like a playlist and contain all the keys - required for a video at the same time. - - - _type "url" indicates that the video must be extracted from another - location, possibly by a different extractor. Its only required key is: - "url" - the next URL to extract. - The key "ie_key" can be set to the class name (minus the trailing "IE", - e.g. "Youtube") if the extractor class is known in advance. - Additionally, the dictionary may have any properties of the resolved entity - known in advance, for example "title" if the title of the referred video is - known ahead of time. - - - _type "url_transparent" entities have the same specification as "url", but - indicate that the given additional information is more precise than the one - associated with the resolved URL. - This is useful when a site employs a video service that hosts the video and - its technical metadata, but that video service does not embed a useful - title, description etc. - - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _GEO_BYPASS attribute may be set to False in order to disable - geo restriction bypass mechanisms for a particular extractor. - Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. - - _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted - countries for this extractor. One of these countries will be used by - geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. - - _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted - IP blocks in CIDR notation for this extractor. One of these IP blocks - will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _x_forwarded_for_ip = None - _GEO_BYPASS = True - _GEO_COUNTRIES = None - _GEO_IP_BLOCKS = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self._x_forwarded_for_ip = None - self.set_downloader(downloader) - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - - # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) is not None - - @classmethod - def _match_id(cls, url): - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - m = cls._VALID_URL_RE.match(url) - assert m - return compat_str(m.group('id')) - - @classmethod - def working(cls): - """Getter method for _WORKING.""" - return cls._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass({ - 'countries': self._GEO_COUNTRIES, - 'ip_blocks': self._GEO_IP_BLOCKS, - }) - if not self._ready: - self._real_initialize() - self._ready = True - - def _initialize_geo_bypass(self, geo_bypass_context): - """ - Initialize geo restriction bypass mechanism. - - This method is used to initialize geo bypass mechanism based on faking - X-Forwarded-For HTTP header. A random country from provided country list - is selected and a random IP belonging to this country is generated. This - IP will be passed as X-Forwarded-For HTTP header in all subsequent - HTTP requests. - - This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES and - _GEO_IP_BLOCKS. - - You may also manually call it from extractor's code if geo bypass - information is not available beforehand (e.g. obtained during - extraction) or due to some other reason. In this case you should pass - this information in geo bypass context passed as first argument. It may - contain following fields: - - countries: List of geo unrestricted countries (similar - to _GEO_COUNTRIES) - ip_blocks: List of geo unrestricted IP blocks in CIDR notation - (similar to _GEO_IP_BLOCKS) - - """ - if not self._x_forwarded_for_ip: - - # Geo bypass mechanism is explicitly disabled by user - if not self._downloader.params.get('geo_bypass', True): - return - - if not geo_bypass_context: - geo_bypass_context = {} - - # Backward compatibility: previously _initialize_geo_bypass - # expected a list of countries, some 3rd party code may still use - # it this way - if isinstance(geo_bypass_context, (list, tuple)): - geo_bypass_context = { - 'countries': geo_bypass_context, - } - - # The whole point of geo bypass mechanism is to fake IP - # as X-Forwarded-For HTTP header based on some IP block or - # country code. - - # Path 1: bypassing based on IP block in CIDR notation - - # Explicit IP block specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - ip_block = self._downloader.params.get('geo_bypass_ip_block', None) - - # Otherwise use random IP block from geo bypass context but only - # if extractor is known as geo bypassable - if not ip_block: - ip_blocks = geo_bypass_context.get('ip_blocks') - if self._GEO_BYPASS and ip_blocks: - ip_block = random.choice(ip_blocks) - - if ip_block: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s as X-Forwarded-For.' - % self._x_forwarded_for_ip) - return - - # Path 2: bypassing based on country code - - # Explicit country code specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - country = self._downloader.params.get('geo_bypass_country', None) - - # Otherwise use random country code from geo bypass context but - # only if extractor is known as geo bypassable - if not country: - countries = geo_bypass_context.get('countries') - if self._GEO_BYPASS and countries: - country = random.choice(countries) - - if country: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country.upper())) - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - try: - for _ in range(2): - try: - self.initialize() - ie_result = self._real_extract(url) - if self._x_forwarded_for_ip: - ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip - return ie_result - except GeoRestrictedError as e: - if self.__maybe_fake_ip_and_retry(e.countries): - continue - raise - except ExtractorError: - raise - except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occurred.', cause=e, expected=True) - except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occurred.', cause=e) - - def __maybe_fake_ip_and_retry(self, countries): - if (not self._downloader.params.get('geo_bypass_country', None) and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - not self._x_forwarded_for_ip and - countries): - country_code = random.choice(countries) - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) - if self._x_forwarded_for_ip: - self.report_warning( - 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) - return True - return False - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @classmethod - def ie_key(cls): - """A string for getting the InfoExtractor with get_info_extractor""" - return compat_str(cls.__name__[:-2]) - - @property - def IE_NAME(self): - return compat_str(type(self).__name__[:-2]) - - @staticmethod - def __can_accept_status_code(err, expected_status): - assert isinstance(err, compat_urllib_error.HTTPError) - if expected_status is None: - return False - if isinstance(expected_status, compat_integer_types): - return err.code == expected_status - elif isinstance(expected_status, (list, tuple)): - return err.code in expected_status - elif callable(expected_status): - return expected_status(err.code) is True - else: - assert False - - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): - """ - Return the response handle. - - See _download_webpage docstring for arguments specification. - """ - if note is None: - self.report_download_webpage(video_id) - elif note is not False: - if video_id is None: - self.to_screen('%s' % (note,)) - else: - self.to_screen('%s: %s' % (video_id, note)) - - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) - try: - return self._downloader.urlopen(url_or_request) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - if isinstance(err, compat_urllib_error.HTTPError): - if self.__can_accept_status_code(err, expected_status): - return err.fp - - if errnote is False: - return False - if errnote is None: - errnote = 'Unable to download webpage' - - errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) - if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) - else: - self._downloader.report_warning(errmsg) - return False - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): - """ - Return a tuple (page content as string, URL handle). - - See _download_webpage docstring for arguments specification. - """ - # Strip hashes from the URL (#1038) - if isinstance(url_or_request, (compat_str, str)): - url_or_request = url_or_request.partition('#')[0] - - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) - if urlh is False: - assert not fatal - return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) - return (content, urlh) - - @staticmethod - def _guess_encoding_from_content(content_type, webpage_bytes): - m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) - if m: - encoding = m.group(1) - else: - m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', - webpage_bytes[:1024]) - if m: - encoding = m.group(1).decode('ascii') - elif webpage_bytes.startswith(b'\xff\xfe'): - encoding = 'utf-16' - else: - encoding = 'utf-8' - - return encoding - - def __check_blocked(self, content): - first_block = content[:512] - if ('<title>Access to this site is blocked</title>' in content and - 'Websense' in first_block): - msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' - blocked_iframe = self._html_search_regex( - r'<iframe src="([^"]+)"', content, - 'Websense information URL', default=None) - if blocked_iframe: - msg += ' Visit %s for more details' % blocked_iframe - raise ExtractorError(msg, expected=True) - if '<title>The URL you requested has been blocked</title>' in first_block: - msg = ( - 'Access to this webpage has been blocked by Indian censorship. ' - 'Use a VPN or proxy server (with --proxy) to route around it.') - block_msg = self._html_search_regex( - r'</h1><p>(.*?)</p>', - content, 'block message', default=None) - if block_msg: - msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') - raise ExtractorError(msg, expected=True) - if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and - 'blocklist.rkn.gov.ru' in content): - raise ExtractorError( - 'Access to this webpage has been blocked by decision of the Russian government. ' - 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', - expected=True) - - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() - if prefix is not None: - webpage_bytes = prefix + webpage_bytes - if not encoding: - encoding = self._guess_encoding_from_content(content_type, webpage_bytes) - if self._downloader.params.get('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) - dump = base64.b64encode(webpage_bytes).decode('ascii') - self._downloader.to_screen(dump) - if self._downloader.params.get('write_pages', False): - basen = '%s_%s' % (video_id, urlh.geturl()) - if len(basen) > 240: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:240 - len(h)] + h - raw_filename = basen + '.dump' - filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen('Saving request to ' + filename) - # Working around MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = '\\\\?\\' + absfilepath - with open(filename, 'wb') as outf: - outf.write(webpage_bytes) - - try: - content = webpage_bytes.decode(encoding, 'replace') - except LookupError: - content = webpage_bytes.decode('utf-8', 'replace') - - self.__check_blocked(content) - - return content - - def _download_webpage( - self, url_or_request, video_id, note=None, errnote=None, - fatal=True, tries=1, timeout=5, encoding=None, data=None, - headers={}, query={}, expected_status=None): - """ - Return the data of the page as a string. - - Arguments: - url_or_request -- plain text URL as a string or - a compat_urllib_request.Requestobject - video_id -- Video/playlist/item identifier (string) - - Keyword arguments: - note -- note printed before downloading (string) - errnote -- note printed in case of an error (string) - fatal -- flag denoting whether error should be considered fatal, - i.e. whether it should cause ExtractionError to be raised, - otherwise a warning will be reported and extraction continued - tries -- number of tries - timeout -- sleep interval between tries - encoding -- encoding for a page content decoding, guessed automatically - when not explicitly specified - data -- POST data (bytes) - headers -- HTTP headers (dict) - query -- URL query (dict) - expected_status -- allows to accept failed HTTP requests (non 2xx - status code) by explicitly specifying a set of accepted status - codes. Can be any of the following entities: - - an integer type specifying an exact failed status code to - accept - - a list or a tuple of integer types specifying a list of - failed status codes to accept - - a callable accepting an actual failed status code and - returning True if it should be accepted - Note that this argument does not affect success status codes (2xx) - which are always accepted. - """ - - success = False - try_count = 0 - while success is False: - try: - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - success = True - except compat_http_client.IncompleteRead as e: - try_count += 1 - if try_count >= tries: - raise e - self._sleep(timeout, video_id) - if res is False: - return res - else: - content, _ = res - return content - - def _download_xml_handle( - self, url_or_request, video_id, note='Downloading XML', - errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - xml_string, urlh = res - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_xml( - self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}, expected_status=None): - """ - Return the xml as an xml.etree.ElementTree.Element. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_xml_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): - if transform_source: - xml_string = transform_source(xml_string) - try: - return compat_etree_fromstring(xml_string.encode('utf-8')) - except compat_xml_parse_error as ve: - errmsg = '%s: Failed to parse XML ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def _download_json_handle( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - json_string, urlh = res - return self._parse_json( - json_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_json( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): - if transform_source: - json_string = transform_source(json_string) - try: - return json.loads(json_string) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def report_warning(self, msg, video_id=None): - idstr = '' if video_id is None else '%s: ' % video_id - self._downloader.report_warning( - '[%s] %s%s' % (self.IE_NAME, idstr, msg)) - - def to_screen(self, msg): - """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) - - def report_extraction(self, id_or_name): - """Report information extraction.""" - self.to_screen('%s: Extracting information' % id_or_name) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self.to_screen('%s: Downloading webpage' % video_id) - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self.to_screen('Confirming age') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen('Logging in') - - @staticmethod - def raise_login_required(msg='This video is only available for registered users'): - raise ExtractorError( - '%s. Use --username and --password or --netrc to provide account credentials.' % msg, - expected=True) - - @staticmethod - def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): - raise GeoRestrictedError(msg, countries=countries) - - # Methods for following #608 - @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None): - """Returns a URL that points to a page that should be processed""" - # TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - if video_id is not None: - video_info['id'] = video_id - if video_title is not None: - video_info['title'] = video_title - return video_info - - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): - urls = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urls, playlist_id=playlist_id, playlist_title=playlist_title) - - @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): - """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - if playlist_id: - video_info['id'] = playlist_id - if playlist_title: - video_info['title'] = playlist_title - if playlist_description: - video_info['description'] = playlist_description - return video_info - - def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): - """ - Perform a regex search on the given string, using a single or a list of - patterns returning the first matching group. - In case of failure return a default value or raise a WARNING or a - RegexNotFoundError, depending on fatal, specifying the field name. - """ - if isinstance(pattern, (str, compat_str, compiled_regex_type)): - mobj = re.search(pattern, string, flags) - else: - for p in pattern: - mobj = re.search(p, string, flags) - if mobj: - break - - if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): - _name = '\033[0;34m%s\033[0m' % name - else: - _name = name - - if mobj: - if group is None: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) - else: - return mobj.group(group) - elif default is not NO_DEFAULT: - return default - elif fatal: - raise RegexNotFoundError('Unable to extract %s' % _name) - else: - self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) - return None - - def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): - """ - Like _search_regex, but strips HTML tags and unescapes entities. - """ - res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res - - def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None - netrc_machine = netrc_machine or self._NETRC_MACHINE - - if self._downloader.params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) - - return username, password - - def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): - """ - Get the login info as (username, password) - First look for the manually specified credentials using username_option - and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. - If there's no info available, return (None, None) - """ - if self._downloader is None: - return (None, None) - - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get(username_option) is not None: - username = downloader_params[username_option] - password = downloader_params[password_option] - else: - username, password = self._get_netrc_login_info(netrc_machine) - - return username, password - - def _get_tfa_info(self, note='two-factor verification code'): - """ - Get the two-factor authentication info - TODO - asking the user will be required for sms/phone verify - currently just uses the command line option - If there's no info available, return None - """ - if self._downloader is None: - return None - downloader_params = self._downloader.params - - if downloader_params.get('twofactor') is not None: - return downloader_params['twofactor'] - - return compat_getpass('Type %s and press [Return]: ' % note) - - # Helper functions for extracting OpenGraph info - @staticmethod - def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' - % {'prop': re.escape(prop)}) - template = r'<meta[^>]+?%s[^>]+?%s' - return [ - template % (property_re, content_re), - template % (content_re, property_re), - ] - - @staticmethod - def _meta_regex(prop): - return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) - - def _og_search_property(self, prop, html, name=None, **kargs): - if not isinstance(prop, (list, tuple)): - prop = [prop] - if name is None: - name = 'OpenGraph %s' % prop[0] - og_regexes = [] - for p in prop: - og_regexes.extend(self._og_regexes(p)) - escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) - if escaped is None: - return None - return unescapeHTML(escaped) - - def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) - - def _og_search_description(self, html, **kargs): - return self._og_search_property('description', html, fatal=False, **kargs) - - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) - - def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = self._og_regexes('video') + self._og_regexes('video:url') - if secure: - regexes = self._og_regexes('video:secure_url') + regexes - return self._html_search_regex(regexes, html, name, **kargs) - - def _og_search_url(self, html, **kargs): - return self._og_search_property('url', html, **kargs) - - def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): - if not isinstance(name, (list, tuple)): - name = [name] - if display_name is None: - display_name = name[0] - return self._html_search_regex( - [self._meta_regex(n) for n in name], - html, display_name, fatal=fatal, group='content', **kwargs) - - def _dc_search_uploader(self, html): - return self._html_search_meta('dc.creator', html, 'uploader') - - def _rta_search(self, html): - # See http://www.rtalabel.org/index.php?content=howtofaq#single - if re.search(r'(?ix)<meta\s+name="rating"\s+' - r' content="RTA-5042-1996-1400-1577-RTA"', - html): - return 18 - return 0 - - def _media_rating_search(self, html): - # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ - rating = self._html_search_meta('rating', html) - - if not rating: - return None - - RATING_TABLE = { - 'safe for kids': 0, - 'general': 8, - '14 years': 14, - 'mature': 17, - 'restricted': 19, - } - return RATING_TABLE.get(rating.lower()) - - def _family_friendly_search(self, html): - # See http://schema.org/VideoObject - family_friendly = self._html_search_meta( - 'isFamilyFriendly', html, default=None) - - if not family_friendly: - return None - - RATING_TABLE = { - '1': 0, - 'true': 0, - '0': 18, - 'false': 18, - } - return RATING_TABLE.get(family_friendly.lower()) - - def _twitter_search_player(self, html): - return self._html_search_meta('twitter:player', html, - 'twitter card player') - - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld = self._search_regex( - JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) - default = kwargs.get('default', NO_DEFAULT) - if not json_ld: - return default if default is not NO_DEFAULT else {} - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. - fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False - return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - - def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): - json_ld = self._parse_json(json_ld, video_id, fatal=fatal) - if not json_ld: - return {} - info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] - - INTERACTION_TYPE_MAP = { - 'CommentAction': 'comment', - 'AgreeAction': 'like', - 'DisagreeAction': 'dislike', - 'LikeAction': 'like', - 'DislikeAction': 'dislike', - 'ListenAction': 'view', - 'WatchAction': 'view', - 'ViewAction': 'view', - } - - def extract_interaction_statistic(e): - interaction_statistic = e.get('interactionStatistic') - if not isinstance(interaction_statistic, list): - return - for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': - continue - interaction_type = is_e.get('interactionType') - if not isinstance(interaction_type, compat_str): - continue - interaction_count = int_or_none(is_e.get('userInteractionCount')) - if interaction_count is None: - continue - count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) - if not count_kind: - continue - count_key = '%s_count' % count_kind - if info.get(count_key) is not None: - continue - info[count_key] = interaction_count - - def extract_video_object(e): - assert e['@type'] == 'VideoObject' - info.update({ - 'url': e.get('contentUrl'), - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('uploadDate')), - 'filesize': float_or_none(e.get('contentSize')), - 'tbr': int_or_none(e.get('bitrate')), - 'width': int_or_none(e.get('width')), - 'height': int_or_none(e.get('height')), - 'view_count': int_or_none(e.get('interactionCount')), - }) - extract_interaction_statistic(e) - - for e in json_ld: - if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: - return info - if item_type in ('TVEpisode', 'Episode'): - info.update({ - 'episode': unescapeHTML(e.get('name')), - 'episode_number': int_or_none(e.get('episodeNumber')), - 'description': unescapeHTML(e.get('description')), - }) - part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type in ('Article', 'NewsArticle'): - info.update({ - 'timestamp': parse_iso8601(e.get('datePublished')), - 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), - }) - elif item_type == 'VideoObject': - extract_video_object(e) - continue - video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': - extract_video_object(video) - break - return dict((k, v) for k, v in info.items() if v is not None) - - @staticmethod - def _hidden_inputs(html): - html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) - hidden_inputs = {} - for input in re.findall(r'(?i)(<input[^>]+>)', html): - attrs = extract_attributes(input) - if not input: - continue - if attrs.get('type') not in ('hidden', 'submit'): - continue - name = attrs.get('name') or attrs.get('id') - value = attrs.get('value') - if name and value is not None: - hidden_inputs[name] = value - return hidden_inputs - - def _form_hidden_inputs(self, form_id, html): - form = self._search_regex( - r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, - html, '%s form' % form_id, group='form') - return self._hidden_inputs(form) - - def _sort_formats(self, formats, field_preference=None): - if not formats: - raise ExtractorError('No video formats found') - - for f in formats: - # Automatically determine tbr when missing based on abr and vbr (improves - # formats sorting in some cases) - if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: - f['tbr'] = f['abr'] + f['vbr'] - - def _formats_key(f): - # TODO remove the following workaround - from ..utils import determine_ext - if not f.get('ext') and 'url' in f: - f['ext'] = determine_ext(f['url']) - - if isinstance(field_preference, (list, tuple)): - return tuple( - f.get(field) - if f.get(field) is not None - else ('' if field == 'format_id' else -1) - for field in field_preference) - - preference = f.get('preference') - if preference is None: - preference = 0 - if f.get('ext') in ['f4f', 'f4m']: # Not yet supported - preference -= 0.5 - - protocol = f.get('protocol') or determine_protocol(f) - proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) - - if f.get('vcodec') == 'none': # audio only - preference -= 50 - if self._downloader.params.get('prefer_free_formats'): - ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] - else: - ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a'] - ext_preference = 0 - try: - audio_ext_preference = ORDER.index(f['ext']) - except ValueError: - audio_ext_preference = -1 - else: - if f.get('acodec') == 'none': # video only - preference -= 40 - if self._downloader.params.get('prefer_free_formats'): - ORDER = ['flv', 'mp4', 'webm'] - else: - ORDER = ['webm', 'flv', 'mp4'] - try: - ext_preference = ORDER.index(f['ext']) - except ValueError: - ext_preference = -1 - audio_ext_preference = 0 - - return ( - preference, - f.get('language_preference') if f.get('language_preference') is not None else -1, - f.get('quality') if f.get('quality') is not None else -1, - f.get('tbr') if f.get('tbr') is not None else -1, - f.get('filesize') if f.get('filesize') is not None else -1, - f.get('vbr') if f.get('vbr') is not None else -1, - f.get('height') if f.get('height') is not None else -1, - f.get('width') if f.get('width') is not None else -1, - proto_preference, - ext_preference, - f.get('abr') if f.get('abr') is not None else -1, - audio_ext_preference, - f.get('fps') if f.get('fps') is not None else -1, - f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, - f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id') if f.get('format_id') is not None else '', - ) - formats.sort(key=_formats_key) - - def _check_formats(self, formats, video_id): - if formats: - formats[:] = filter( - lambda f: self._is_valid_url( - f['url'], video_id, - item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), - formats) - - @staticmethod - def _remove_duplicate_formats(formats): - format_urls = set() - unique_formats = [] - for f in formats: - if f['url'] not in format_urls: - format_urls.add(f['url']) - unique_formats.append(f) - formats[:] = unique_formats - - def _is_valid_url(self, url, video_id, item='video', headers={}): - url = self._proto_relative_url(url, scheme='http:') - # For now assume non HTTP(S) URLs always valid - if not (url.startswith('http://') or url.startswith('https://')): - return True - try: - self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) - return True - except ExtractorError as e: - if isinstance(e.cause, compat_urllib_error.URLError): - self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) - return False - raise - - def http_scheme(self): - """ Either "http:" or "https:", depending on the user's preferences """ - return ( - 'http:' - if self._downloader.params.get('prefer_insecure', False) - else 'https:') - - def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url - - def _sleep(self, timeout, video_id, msg_template=None): - if msg_template is None: - msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' - msg = msg_template % {'video_id': video_id, 'timeout': timeout} - self.to_screen(msg) - time.sleep(timeout) - - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): - manifest = self._download_xml( - manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest', - # Some manifests may be malformed, e.g. prosiebensat1 generated manifests - # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source, - fatal=fatal) - - if manifest is False: - return [] - - return self._parse_f4m_formats( - manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) - - def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): - # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy - akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') - if akamai_pv is not None and ';' in akamai_pv.text: - playerVerificationChallenge = akamai_pv.text.split(';')[0] - if playerVerificationChallenge.strip() != '': - return [] - - formats = [] - manifest_version = '1.0' - media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') - if not media_nodes: - manifest_version = '2.0' - media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') - # Remove unsupported DRM protected media from final formats - # rendition (see https://github.com/rg3/youtube-dl/issues/8573). - media_nodes = remove_encrypted_media(media_nodes) - if not media_nodes: - return formats - - manifest_base_url = get_base_url(manifest) - - bootstrap_info = xpath_element( - manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], - 'bootstrap info', default=None) - - vcodec = None - mime_type = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], - 'base URL', default=None) - if mime_type and mime_type.startswith('audio/'): - vcodec = 'none' - - for i, media_el in enumerate(media_nodes): - tbr = int_or_none(media_el.attrib.get('bitrate')) - width = int_or_none(media_el.attrib.get('width')) - height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) - # If <bootstrapInfo> is present, the specified f4m is a - # stream-level manifest, and only set-level manifests may refer to - # external resources. See section 11.4 and section 4 of F4M spec - if bootstrap_info is None: - media_url = None - # @href is introduced in 2.0, see section 11.6 of F4M spec - if manifest_version == '2.0': - media_url = media_el.attrib.get('href') - if media_url is None: - media_url = media_el.attrib.get('url') - if not media_url: - continue - manifest_url = ( - media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) - # If media_url is itself a f4m manifest do the recursive extraction - # since bitrates in parent manifest (this one) and media_url manifest - # may differ leading to inability to resolve the format by requested - # bitrate in f4m downloader - ext = determine_ext(manifest_url) - if ext == 'f4m': - f4m_formats = self._extract_f4m_formats( - manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal) - # Sometimes stream-level manifest contains single media entry that - # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). - # At the same time parent's media entry in set-level manifest may - # contain it. We will copy it from parent in such cases. - if len(f4m_formats) == 1: - f = f4m_formats[0] - f.update({ - 'tbr': f.get('tbr') or tbr, - 'width': f.get('width') or width, - 'height': f.get('height') or height, - 'format_id': f.get('format_id') if not tbr else format_id, - 'vcodec': vcodec, - }) - formats.extend(f4m_formats) - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', preference=preference, - m3u8_id=m3u8_id, fatal=fatal)) - continue - formats.append({ - 'format_id': format_id, - 'url': manifest_url, - 'manifest_url': manifest_url, - 'ext': 'flv' if bootstrap_info is not None else None, - 'protocol': 'f4m', - 'tbr': tbr, - 'width': width, - 'height': height, - 'vcodec': vcodec, - 'preference': preference, - }) - return formats - - def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): - return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), - 'url': m3u8_url, - 'ext': ext, - 'protocol': 'm3u8', - 'preference': preference - 100 if preference else -100, - 'resolution': 'multiple', - 'format_note': 'Quality selection URL', - } - - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): - res = self._download_webpage_handle( - m3u8_url, video_id, - note=note or 'Downloading m3u8 information', - errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal) - - if res is False: - return [] - - m3u8_doc, urlh = res - m3u8_url = urlh.geturl() - - return self._parse_m3u8_formats( - m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, - preference=preference, m3u8_id=m3u8_id, live=live) - - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, live=False): - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] - - if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay - return [] - - formats = [] - - format_url = lambda u: ( - u - if re.match(r'^https?://', u) - else compat_urlparse.urljoin(m3u8_url, u)) - - # References: - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 - # 2. https://github.com/rg3/youtube-dl/issues/12211 - - # We should try extracting formats only from master playlists [1, 4.3.4], - # i.e. playlists that describe available qualities. On the other hand - # media playlists [1, 4.3.3] should be returned as is since they contain - # just the media without qualities renditions. - # Fortunately, master playlist can be easily distinguished from media - # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. - # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every - # media playlist and MUST NOT appear in master playlist thus we can - # clearly detect media playlist with this criterion. - - if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is - return [{ - 'url': m3u8_url, - 'format_id': m3u8_id, - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - }] - - groups = {} - last_stream_inf = {} - - def extract_media(x_media_line): - media = parse_m3u8_attributes(x_media_line) - # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED - media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') - if not (media_type and group_id and name): - return - groups.setdefault(group_id, []).append(media) - if media_type not in ('VIDEO', 'AUDIO'): - return - media_url = media.get('URI') - if media_url: - format_id = [] - for v in (m3u8_id, group_id, name): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'manifest_url': m3u8_url, - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' - formats.append(f) - - def build_stream_name(): - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] - # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) - # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 - stream_name = last_stream_inf.get('NAME') - if stream_name: - return stream_name - # If there is no NAME in EXT-X-STREAM-INF it will be obtained - # from corresponding rendition group - stream_group_id = last_stream_inf.get('VIDEO') - if not stream_group_id: - return - stream_group = groups.get(stream_group_id) - if not stream_group: - return stream_group_id - rendition = stream_group[0] - return rendition.get('NAME') or stream_group_id - - for line in m3u8_doc.splitlines(): - if line.startswith('#EXT-X-STREAM-INF:'): - last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) - elif line.startswith('#') or not line.strip(): - continue - else: - tbr = float_or_none( - last_stream_inf.get('AVERAGE-BANDWIDTH') or - last_stream_inf.get('BANDWIDTH'), scale=1000) - format_id = [] - if m3u8_id: - format_id.append(m3u8_id) - stream_name = build_stream_name() - # Bandwidth of live streams may differ over time thus making - # format_id unpredictable. So it's better to keep provided - # format_id intact. - if not live: - format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) - manifest_url = format_url(line.strip()) - f = { - 'format_id': '-'.join(format_id), - 'url': manifest_url, - 'manifest_url': m3u8_url, - 'tbr': tbr, - 'ext': ext, - 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), - 'protocol': entry_protocol, - 'preference': preference, - } - resolution = last_stream_inf.get('RESOLUTION') - if resolution: - mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) - if mobj: - f['width'] = int(mobj.group('width')) - f['height'] = int(mobj.group('height')) - # Unified Streaming Platform - mobj = re.search( - r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) - if mobj: - abr, vbr = mobj.groups() - abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) - f.update({ - 'vbr': vbr, - 'abr': abr, - }) - codecs = parse_codecs(last_stream_inf.get('CODECS')) - f.update(codecs) - audio_group_id = last_stream_inf.get('AUDIO') - # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which - # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] - # contains EXT-X-STREAM-INF tag which references AUDIO - # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them - # as complete formats. - if audio_group_id and codecs and f.get('vcodec') != 'none': - audio_group = groups.get(audio_group_id) - if audio_group and audio_group[0].get('URI'): - # TODO: update acodec for audio only formats with - # the same GROUP-ID - f['acodec'] = 'none' - formats.append(f) - last_stream_inf = {} - return formats - - @staticmethod - def _xpath_ns(path, namespace=None): - if not namespace: - return path - out = [] - for c in path.split('/'): - if not c or c == '.': - out.append(c) - else: - out.append('{%s}%s' % (namespace, c)) - return '/'.join(out) - - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) - - if smil is False: - assert not fatal - return [] - - namespace = self._parse_smil_namespace(smil) - - return self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - - def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) - if smil is False: - return {} - return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - - def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): - return self._download_xml( - smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) - - def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): - namespace = self._parse_smil_namespace(smil) - - formats = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) - - video_id = os.path.splitext(url_basename(smil_url))[0] - title = None - description = None - upload_date = None - for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): - name = meta.attrib.get('name') - content = meta.attrib.get('content') - if not name or not content: - continue - if not title and name == 'title': - title = content - elif not description and name in ('description', 'abstract'): - description = content - elif not upload_date and name == 'date': - upload_date = unified_strdate(content) - - thumbnails = [{ - 'id': image.get('type'), - 'url': image.get('src'), - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] - - return { - 'id': video_id, - 'title': title or video_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, - } - - def _parse_smil_namespace(self, smil): - return self._search_regex( - r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - base = smil_url - for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): - b = meta.get('base') or meta.get('httpBase') - if b: - base = b - break - - formats = [] - rtmp_count = 0 - http_count = 0 - m3u8_count = 0 - - srcs = [] - media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) - for medium in media: - src = medium.get('src') - if not src or src in srcs: - continue - srcs.append(src) - - bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) - filesize = int_or_none(medium.get('size') or medium.get('fileSize')) - width = int_or_none(medium.get('width')) - height = int_or_none(medium.get('height')) - proto = medium.get('proto') - ext = medium.get('ext') - src_ext = determine_ext(src) - streamer = medium.get('streamer') or base - - if proto == 'rtmp' or streamer.startswith('rtmp'): - rtmp_count += 1 - formats.append({ - 'url': streamer, - 'play_path': src, - 'ext': 'flv', - 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'filesize': filesize, - 'width': width, - 'height': height, - }) - if transform_rtmp_url: - streamer, src = transform_rtmp_url(streamer, src) - formats[-1].update({ - 'url': streamer, - 'play_path': src, - }) - continue - - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) - src_url = src_url.strip() - - if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) - if len(m3u8_formats) == 1: - m3u8_count += 1 - m3u8_formats[0].update({ - 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'width': width, - 'height': height, - }) - formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': - f4m_url = src_url - if not f4m_params: - f4m_params = { - 'hdcore': '3.2.0', - 'plugin': 'flowplayer-3.2.0.1', - } - f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse_urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): - http_count += 1 - formats.append({ - 'url': src_url, - 'ext': ext or src_ext or 'flv', - 'format_id': 'http-%d' % (bitrate or http_count), - 'tbr': bitrate, - 'filesize': filesize, - 'width': width, - 'height': height, - }) - continue - - return formats - - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - urls = [] - subtitles = {} - for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): - src = textstream.get('src') - if not src or src in urls: - continue - urls.append(src) - ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang - subtitles.setdefault(lang, []).append({ - 'url': src, - 'ext': ext, - }) - return subtitles - - def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): - xspf = self._download_xml( - xspf_url, playlist_id, 'Downloading xpsf playlist', - 'Unable to download xspf manifest', fatal=fatal) - if xspf is False: - return [] - return self._parse_xspf( - xspf, playlist_id, xspf_url=xspf_url, - xspf_base_url=base_url(xspf_url)) - - def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): - NS_MAP = { - 'xspf': 'http://xspf.org/ns/0/', - 's1': 'http://static.streamone.nl/player/ns/0', - } - - entries = [] - for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): - title = xpath_text( - track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) - description = xpath_text( - track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') - thumbnail = xpath_text( - track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') - duration = float_or_none( - xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - - formats = [] - for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): - format_url = urljoin(xspf_base_url, location.text) - if not format_url: - continue - formats.append({ - 'url': format_url, - 'manifest_url': xspf_url, - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - }) - self._sort_formats(formats) - - entries.append({ - 'id': playlist_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - }) - return entries - - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_xml_handle( - mpd_url, video_id, - note=note or 'Downloading MPD manifest', - errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal) - if res is False: - return [] - mpd_doc, urlh = res - mpd_base_url = base_url(urlh.geturl()) - - return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) - - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): - """ - Parse formats from MPD manifest. - References: - 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), - http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip - 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP - """ - if mpd_doc.get('type') == 'dynamic': - return [] - - namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) - - def _add_ns(path): - return self._xpath_ns(path, namespace) - - def is_drm_protected(element): - return element.find(_add_ns('ContentProtection')) is not None - - def extract_multisegment_info(element, ms_parent_info): - ms_info = ms_parent_info.copy() - - # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some - # common attributes and elements. We will only extract relevant - # for us. - def extract_common(source): - segment_timeline = source.find(_add_ns('SegmentTimeline')) - if segment_timeline is not None: - s_e = segment_timeline.findall(_add_ns('S')) - if s_e: - ms_info['total_number'] = 0 - ms_info['s'] = [] - for s in s_e: - r = int(s.get('r', 0)) - ms_info['total_number'] += 1 + r - ms_info['s'].append({ - 't': int(s.get('t', 0)), - # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) - 'd': int(s.attrib['d']), - 'r': r, - }) - start_number = source.get('startNumber') - if start_number: - ms_info['start_number'] = int(start_number) - timescale = source.get('timescale') - if timescale: - ms_info['timescale'] = int(timescale) - segment_duration = source.get('duration') - if segment_duration: - ms_info['segment_duration'] = float(segment_duration) - - def extract_Initialization(source): - initialization = source.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] - - segment_list = element.find(_add_ns('SegmentList')) - if segment_list is not None: - extract_common(segment_list) - extract_Initialization(segment_list) - segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) - if segment_urls_e: - ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] - else: - segment_template = element.find(_add_ns('SegmentTemplate')) - if segment_template is not None: - extract_common(segment_template) - media = segment_template.get('media') - if media: - ms_info['media'] = media - initialization = segment_template.get('initialization') - if initialization: - ms_info['initialization'] = initialization - else: - extract_Initialization(segment_template) - return ms_info - - mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats = [] - for period in mpd_doc.findall(_add_ns('Period')): - period_duration = parse_duration(period.get('duration')) or mpd_duration - period_ms_info = extract_multisegment_info(period, { - 'start_number': 1, - 'timescale': 1, - }) - for adaptation_set in period.findall(_add_ns('AdaptationSet')): - if is_drm_protected(adaptation_set): - continue - adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) - for representation in adaptation_set.findall(_add_ns('Representation')): - if is_drm_protected(representation): - continue - representation_attrib = adaptation_set.attrib.copy() - representation_attrib.update(representation.attrib) - # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory - mime_type = representation_attrib['mimeType'] - content_type = mime_type.split('/')[0] - if content_type == 'text': - # TODO implement WebVTT downloading - pass - elif content_type in ('video', 'audio'): - base_url = '' - for element in (representation, adaptation_set, period, mpd_doc): - base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: - base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): - break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): - mpd_base_url += '/' - base_url = mpd_base_url + base_url - representation_id = representation_attrib.get('id') - lang = representation_attrib.get('lang') - url_el = representation.find(_add_ns('BaseURL')) - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) - bandwidth = int_or_none(representation_attrib.get('bandwidth')) - f = { - 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, - 'url': base_url, - 'manifest_url': mpd_url, - 'ext': mimetype2ext(mime_type), - 'width': int_or_none(representation_attrib.get('width')), - 'height': int_or_none(representation_attrib.get('height')), - 'tbr': float_or_none(bandwidth, 1000), - 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), - 'fps': int_or_none(representation_attrib.get('frameRate')), - 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, - 'format_note': 'DASH %s' % content_type, - 'filesize': filesize, - 'container': mimetype2ext(mime_type) + '_dash', - } - f.update(parse_codecs(representation_attrib.get('codecs'))) - representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - - def prepare_template(template_name, identifiers): - tmpl = representation_ms_info[template_name] - # First of, % characters outside $...$ templates - # must be escaped by doubling for proper processing - # by % operator string formatting used further (see - # https://github.com/rg3/youtube-dl/issues/16867). - t = '' - in_template = False - for c in tmpl: - t += c - if c == '$': - in_template = not in_template - elif c == '%' and not in_template: - t += c - # Next, $...$ templates are translated to their - # %(...) counterparts to be used with % operator - t = t.replace('$RepresentationID$', representation_id) - t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) - t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) - t.replace('$$', '$') - return t - - # @initialization is a regular template like @media one - # so it should be handled just the same way (see - # https://github.com/rg3/youtube-dl/issues/11605) - if 'initialization' in representation_ms_info: - initialization_template = prepare_template( - 'initialization', - # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and - # $Time$ shall not be included for @initialization thus - # only $Bandwidth$ remains - ('Bandwidth', )) - representation_ms_info['initialization_url'] = initialization_template % { - 'Bandwidth': bandwidth, - } - - def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' - - if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: - - media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) - media_location_key = location_key(media_template) - - # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ - # can't be used at the same time - if '%(Number' in media_template and 's' not in representation_ms_info: - segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: - segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['fragments'] = [{ - media_location_key: media_template % { - 'Number': segment_number, - 'Bandwidth': bandwidth, - }, - 'duration': segment_duration, - } for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] - else: - # $Number*$ or $Time$ in media template with S list available - # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg - # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['fragments'] = [] - segment_time = 0 - segment_d = None - segment_number = representation_ms_info['start_number'] - - def add_segment_url(): - segment_url = media_template % { - 'Time': segment_time, - 'Bandwidth': bandwidth, - 'Number': segment_number, - } - representation_ms_info['fragments'].append({ - media_location_key: segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), - }) - - for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time - segment_d = s['d'] - add_segment_url() - segment_number += 1 - for r in range(s.get('r', 0)): - segment_time += segment_d - add_segment_url() - segment_number += 1 - segment_time += segment_d - elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI - # or any YouTube dashsegments video - fragments = [] - segment_index = 0 - timescale = representation_ms_info['timescale'] - for s in representation_ms_info['s']: - duration = float_or_none(s['d'], timescale) - for r in range(s.get('r', 0) + 1): - segment_uri = representation_ms_info['segment_urls'][segment_index] - fragments.append({ - location_key(segment_uri): segment_uri, - 'duration': duration, - }) - segment_index += 1 - representation_ms_info['fragments'] = fragments - elif 'segment_urls' in representation_ms_info: - # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 - # https://github.com/rg3/youtube-dl/pull/14844 - fragments = [] - segment_duration = float_or_none( - representation_ms_info['segment_duration'], - representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None - for segment_url in representation_ms_info['segment_urls']: - fragment = { - location_key(segment_url): segment_url, - } - if segment_duration: - fragment['duration'] = segment_duration - fragments.append(fragment) - representation_ms_info['fragments'] = fragments - # NB: MPD manifest may contain direct URLs to unfragmented media. - # No fragments key is present in this case. - if 'fragments' in representation_ms_info: - f.update({ - 'fragment_base_url': base_url, - 'fragments': [], - 'protocol': 'http_dash_segments', - }) - if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'] - if not f.get('url'): - f['url'] = initialization_url - f['fragments'].append({location_key(initialization_url): initialization_url}) - f['fragments'].extend(representation_ms_info['fragments']) - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. There are numerous examples - # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, - # https://github.com/rg3/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats - - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_xml_handle( - ism_url, video_id, - note=note or 'Downloading ISM manifest', - errnote=errnote or 'Failed to download ISM manifest', - fatal=fatal) - if res is False: - return [] - ism_doc, urlh = res - - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) - - def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): - """ - Parse formats from ISM manifest. - References: - 1. [MS-SSTR]: Smooth Streaming Protocol, - https://msdn.microsoft.com/en-us/library/ff469518.aspx - """ - if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: - return [] - - duration = int(ism_doc.attrib['Duration']) - timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 - - formats = [] - for stream in ism_doc.findall('StreamIndex'): - stream_type = stream.get('Type') - if stream_type not in ('video', 'audio'): - continue - url_pattern = stream.attrib['Url'] - stream_timescale = int_or_none(stream.get('TimeScale')) or timescale - stream_name = stream.get('Name') - for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) - # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL'): - self.report_warning('%s is not a supported codec' % fourcc) - continue - tbr = int(track.attrib['Bitrate']) // 1000 - # [1] does not mention Width and Height attributes. However, - # they're often present while MaxWidth and MaxHeight are - # missing, so should be used as fallbacks - width = int_or_none(track.get('MaxWidth') or track.get('Width')) - height = int_or_none(track.get('MaxHeight') or track.get('Height')) - sampling_rate = int_or_none(track.get('SamplingRate')) - - track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) - - fragments = [] - fragment_ctx = { - 'time': 0, - } - stream_fragments = stream.findall('c') - for stream_fragment_index, stream_fragment in enumerate(stream_fragments): - fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] - fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 - fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) - if not fragment_ctx['duration']: - try: - next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) - except IndexError: - next_fragment_time = duration - fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat - for _ in range(fragment_repeat): - fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), - 'duration': fragment_ctx['duration'] / stream_timescale, - }) - fragment_ctx['time'] += fragment_ctx['duration'] - - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - - formats.append({ - 'format_id': '-'.join(format_id), - 'url': ism_url, - 'manifest_url': ism_url, - 'ext': 'ismv' if stream_type == 'video' else 'isma', - 'width': width, - 'height': height, - 'tbr': tbr, - 'asr': sampling_rate, - 'vcodec': 'none' if stream_type == 'audio' else fourcc, - 'acodec': 'none' if stream_type == 'video' else fourcc, - 'protocol': 'ism', - 'fragments': fragments, - '_download_params': { - 'duration': duration, - 'timescale': stream_timescale, - 'width': width or 0, - 'height': height or 0, - 'fourcc': fourcc, - 'codec_private_data': track.get('CodecPrivateData'), - 'sampling_rate': sampling_rate, - 'channels': int_or_none(track.get('Channels', 2)), - 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), - 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), - }, - }) - return formats - - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): - def absolute_url(item_url): - return urljoin(base_url, item_url) - - def parse_content_type(content_type): - if not content_type: - return {} - ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) - if ctr: - mimetype, codecs = ctr.groups() - f = parse_codecs(codecs) - f['ext'] = mimetype2ext(mimetype) - return f - return {} - - def _media_formats(src, cur_media_type, type_info={}): - full_url = absolute_url(src) - ext = type_info.get('ext') or determine_ext(full_url) - if ext == 'm3u8': - is_plain_url = False - formats = self._extract_m3u8_formats( - full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference, fatal=False) - elif ext == 'mpd': - is_plain_url = False - formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id, fatal=False) - else: - is_plain_url = True - formats = [{ - 'url': full_url, - 'vcodec': 'none' if cur_media_type == 'audio' else None, - }] - return is_plain_url, formats - - entries = [] - # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we wll include them right here (see - # https://www.ampproject.org/docs/reference/components/amp-video) - media_tags = [(media_tag, media_type, '') - for media_tag, media_type - in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] - media_tags.extend(re.findall( - # We only allow video|audio followed by a whitespace or '>'. - # Allowing more characters may end up in significant slow down (see - # https://github.com/rg3/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) - for media_tag, media_type, media_content in media_tags: - media_info = { - 'formats': [], - 'subtitles': {}, - } - media_attributes = extract_attributes(media_tag) - src = media_attributes.get('src') - if src: - _, formats = _media_formats(src, media_type) - media_info['formats'].extend(formats) - media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) - if media_content: - for source_tag in re.findall(r'<source[^>]+>', media_content): - source_attributes = extract_attributes(source_tag) - src = source_attributes.get('src') - if not src: - continue - f = parse_content_type(source_attributes.get('type')) - is_plain_url, formats = _media_formats(src, media_type, f) - if is_plain_url: - # res attribute is not standard but seen several times - # in the wild - f.update({ - 'height': int_or_none(source_attributes.get('res')), - 'format_id': source_attributes.get('label'), - }) - f.update(formats[0]) - media_info['formats'].append(f) - else: - media_info['formats'].extend(formats) - for track_tag in re.findall(r'<track[^>]+>', media_content): - track_attributes = extract_attributes(track_tag) - kind = track_attributes.get('kind') - if not kind or kind in ('subtitles', 'captions'): - src = track_attributes.get('src') - if not src: - continue - lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') - media_info['subtitles'].setdefault(lang, []).append({ - 'url': absolute_url(src), - }) - for f in media_info['formats']: - f.setdefault('http_headers', {})['Referer'] = base_url - if media_info['formats'] or media_info['subtitles']: - entries.append(media_info) - return entries - - def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): - formats = [] - hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') - hds_host = hosts.get('hds') - if hds_host: - f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) - if 'hdcore=' not in f4m_url: - f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign - f4m_formats = self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False) - for entry in f4m_formats: - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') - hls_host = hosts.get('hls') - if hls_host: - m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - return formats - - def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query - url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - mobj = re.search( - r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) - url_base = mobj.group('url') - http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) - formats = [] - - def manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) - if query: - m_url += '?%s' % query - return m_url - - if 'm3u8' not in skip_protocols: - formats.extend(self._extract_m3u8_formats( - manifest_url('playlist.m3u8'), video_id, 'mp4', - m3u8_entry_protocol, m3u8_id='hls', fatal=False)) - if 'f4m' not in skip_protocols: - formats.extend(self._extract_f4m_formats( - manifest_url('manifest.f4m'), - video_id, f4m_id='hds', fatal=False)) - if 'dash' not in skip_protocols: - formats.extend(self._extract_mpd_formats( - manifest_url('manifest.mpd'), - video_id, mpd_id='dash', fatal=False)) - if re.search(r'(?:/smil:|\.smil)', url_base): - if 'smil' not in skip_protocols: - rtmp_formats = self._extract_smil_formats( - manifest_url('jwplayer.smil'), - video_id, fatal=False) - for rtmp_format in rtmp_formats: - rtsp_format = rtmp_format.copy() - rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - del rtsp_format['play_path'] - del rtsp_format['ext'] - rtsp_format.update({ - 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'protocol': 'rtsp', - }) - formats.extend([rtmp_format, rtsp_format]) - else: - for protocol in ('rtmp', 'rtsp'): - if protocol not in skip_protocols: - formats.append({ - 'url': '%s:%s' % (protocol, url_base), - 'format_id': protocol, - 'protocol': protocol, - }) - return formats - - def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = self._parse_jwplayer_formats( - video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, - mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entry = { - 'id': this_video_id, - 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - } - # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 - if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): - entry.update({ - '_type': 'url_transparent', - 'url': formats[0]['url'], - }) - else: - self._sort_formats(formats) - entry['formats'] = formats - entries.append(entry) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] - formats = [] - for source in jwplayer_sources_data: - if not isinstance(source, dict): - continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: - continue - urls.append(source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, video_id, mpd_id=mpd_id, fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - source_url, video_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ( - 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'tbr': int_or_none(source.get('bitrate')), - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - return formats - - def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str - - def _int(self, v, name, fatal=False, **kwargs): - res = int_or_none(v, **kwargs) - if 'get_attr' in kwargs: - print(getattr(v, kwargs['get_attr'])) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _float(self, v, name, fatal=False, **kwargs): - res = float_or_none(v, **kwargs) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _set_cookie(self, domain, name, value, expire_time=None, port=None, - path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar.Cookie( - 0, name, value, port, port is not None, domain, True, - domain.startswith('.'), path, True, secure, expire_time, - discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) - - def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) - - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) - if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ - tests = [t] - else: - tests = getattr(self, '_TESTS', []) - for t in tests: - if not include_onlymatching and t.get('only_matching', False): - continue - t['name'] = type(self).__name__[:-len('IE')] - yield t - - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted - - def extract_subtitles(self, *args, **kwargs): - if (self._downloader.params.get('writesubtitles', False) or - self._downloader.params.get('listsubtitles')): - return self._get_subtitles(*args, **kwargs) - return {} - - def _get_subtitles(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - @staticmethod - def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs - will be dropped. """ - list1_urls = set([item['url'] for item in subtitle_list1]) - ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) - return ret - - @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language. """ - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret - - def extract_automatic_captions(self, *args, **kwargs): - if (self._downloader.params.get('writeautomaticsub', False) or - self._downloader.params.get('listsubtitles')): - return self._get_automatic_captions(*args, **kwargs) - return {} - - def _get_automatic_captions(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def mark_watched(self, *args, **kwargs): - if (self._downloader.params.get('mark_watched', False) and - (self._get_login_info()[0] is not None or - self._downloader.params.get('cookiefile') is not None)): - self._mark_watched(*args, **kwargs) - - def _mark_watched(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def geo_verification_headers(self): - headers = {} - geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') - if geo_verification_proxy: - headers['Ytdl-request-proxy'] = geo_verification_proxy - return headers - - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) - - -class SearchInfoExtractor(InfoExtractor): - """ - Base class for paged search queries extractors. - They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. - """ - - @classmethod - def _make_valid_url(cls): - return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._MAX_RESULTS) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) - elif n > self._MAX_RESULTS: - self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) - n = self._MAX_RESULTS - return self._get_n_results(query, n) - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - raise NotImplementedError('This method must be implemented by subclasses') - - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py deleted file mode 100644 index 79f7a9c..0000000 --- a/youtube_dl/extractor/commonmistakes.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals - -import sys - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class CommonMistakesIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' - - _TESTS = [{ - 'url': 'url', - 'only_matching': True, - }, { - 'url': 'URL', - 'only_matching': True, - }] - - def _real_extract(self, url): - msg = ( - 'You\'ve asked youtube-dl to download the URL "%s". ' - 'That doesn\'t make any sense. ' - 'Simply remove the parameter in your command or configuration.' - ) % url - if not self._downloader.params.get('verbose'): - msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' - raise ExtractorError(msg, expected=True) - - -class UnicodeBOMIE(InfoExtractor): - IE_DESC = False - _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' - - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/rg3/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ - 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', - 'only_matching': True, - }] - - def _real_extract(self, url): - real_url = self._match_id(url) - self.report_warning( - 'Your URL starts with a Byte Order Mark (BOM). ' - 'Removing the BOM and looking for "%s" ...' % real_url) - return self.url_result(real_url) diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py deleted file mode 100644 index d98331a..0000000 --- a/youtube_dl/extractor/commonprotocols.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) - - -class RtmpIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)rtmp[est]?://.+' - - _TESTS = [{ - 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', - 'only_matching': True, - }, { - 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - return { - 'id': video_id, - 'title': title, - 'formats': [{ - 'url': url, - 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, - }], - } - - -class MmsIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)mms://.+' - - _TEST = { - # Direct MMS link - 'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv', - 'info_dict': { - 'id': 'MilesReid(0709)', - 'ext': 'wmv', - 'title': 'MilesReid(0709)', - }, - 'params': { - 'skip_download': True, # rtsp downloads, requiring mplayer or mpv - }, - } - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - - return { - 'id': video_id, - 'title': title, - 'url': url, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py deleted file mode 100644 index ceb72da..0000000 --- a/youtube_dl/extractor/extractors.py +++ /dev/null @@ -1,31 +0,0 @@ -# flake8: noqa -from __future__ import unicode_literals - - -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, -) - -from .openload import OpenloadIE - -from .youtube import ( - YoutubeIE, - YoutubeChannelIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeLiveIE, - YoutubePlaylistIE, - YoutubePlaylistsIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeShowIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeUserIE, - YoutubeWatchLaterIE, -) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py deleted file mode 100644 index aa04905..0000000 --- a/youtube_dl/extractor/generic.py +++ /dev/null @@ -1,3335 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import os -import re -import sys - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - HEADRequest, - is_html, - js_to_json, - KNOWN_EXTENSIONS, - merge_dicts, - mimetype2ext, - orderedSet, - sanitized_Request, - smuggle_url, - unescapeHTML, - unified_strdate, - unsmuggle_url, - UnsupportedError, - xpath_text, -) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE -from .smotri import SmotriIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .vimeo import VimeoIE -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .liveleak import LiveLeakIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .vessel import VesselIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .openload import OpenloadIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE - - -class GenericIE(InfoExtractor): - IE_DESC = 'Generic downloader that works on some sites' - _VALID_URL = r'.*' - IE_NAME = 'generic' - _TESTS = [ - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, - # Direct link to media delivered compressed (until Accept-Encoding is *) - { - 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', - 'md5': '128c42e68b13950268b648275386fc74', - 'info_dict': { - 'id': 'FictionJunction-Parallel_Hearts', - 'ext': 'flac', - 'title': 'FictionJunction-Parallel_Hearts', - 'upload_date': '20140522', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ], - 'skip': 'URL invalid', - }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented', - r'400.*Bad Request', - ], - }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, - # RSS feed with enclosures and unsupported link URLs - { - 'url': 'http://www.hellointernet.fm/podcast?format=rss', - 'info_dict': { - 'id': 'http://www.hellointernet.fm/podcast?format=rss', - 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', - 'title': 'Hello Internet', - }, - 'playlist_mincount': 100, - }, - # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng - { - 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', - 'info_dict': { - 'id': 'smil', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'upload_date': '20130627', - 'formats': 'mincount:16', - 'subtitles': 'mincount:1', - }, - 'params': { - 'force_generic_extractor': True, - 'skip_download': True, - }, - }, - # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html - { - 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', - 'info_dict': { - 'id': 'hds', - 'ext': 'flv', - 'title': 'hds', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from https://www.restudy.dk/video/play/id/1637 - { - 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', - 'info_dict': { - 'id': 'video_1637', - 'ext': 'flv', - 'title': 'video_1637', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm - { - 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', - 'info_dict': { - 'id': 'smil-service', - 'ext': 'flv', - 'title': 'smil-service', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 - { - 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', - 'info_dict': { - 'id': '4719370', - 'ext': 'mp4', - 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html - { - 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', - 'info_dict': { - 'id': 'mZlp2ctYIUEB', - 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, - }, - 'params': { - 'skip_download': True, - }, - }, - # MPD from http://dash-mse-test.appspot.com/media.html - { - 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', - 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', - 'info_dict': { - 'id': 'car-20120827-manifest', - 'ext': 'mp4', - 'title': 'car-20120827-manifest', - 'formats': 'mincount:9', - 'upload_date': '20130904', - }, - 'params': { - 'format': 'bestvideo', - }, - }, - # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 - { - 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', - 'info_dict': { - 'id': 'content', - 'ext': 'mp4', - 'title': 'content', - 'formats': 'mincount:8', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # m3u8 served with Content-Type: text/plain - { - 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'index', - 'upload_date': '20140720', - 'formats': 'mincount:11', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': r're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, - { - # redirect in Refresh HTTP header - 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', - 'info_dict': { - 'id': 'pO8h3EaFRdo', - 'ext': 'mp4', - 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', - 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', - 'upload_date': '20150917', - 'uploader_id': 'brtvofficial', - 'uploader': 'Boiler Room', - }, - 'params': { - 'skip_download': False, - }, - }, - { - 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', - 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', - 'info_dict': { - 'id': '13601338388002', - 'ext': 'mp4', - 'uploader': 'www.hodiho.fr', - 'title': 'R\u00e9gis plante sa Jeep', - } - }, - # bandcamp page with custom domain - { - 'add_ie': ['Bandcamp'], - 'url': 'http://bronyrock.com/track/the-pony-mash', - 'info_dict': { - 'id': '3235767654', - 'ext': 'mp3', - 'title': 'The Pony Mash', - 'uploader': 'M_Pallante', - }, - 'skip': 'There is a limit of 200 free downloads / month for the test song', - }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', - - }, - }, - { - # Alternative brightcove <video> attributes - 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', - 'info_dict': { - 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", - }, - 'playlist': [{ - 'md5': '732d22ba3d33f2f3fc253c39f8f36523', - 'info_dict': { - 'id': '5311302538001', - 'ext': 'mp4', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", - 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", - 'timestamp': 1486321708, - 'upload_date': '20170205', - 'uploader_id': '800000640001', - }, - 'only_matching': True, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - # ooyala video - { - 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '166dd577b433b4d4ebfee10b0824d8ff', - 'info_dict': { - 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', - 'ext': 'mp4', - 'title': '2cc213299525360.mov', # that's what we get - 'duration': 238.231, - }, - 'add_ie': ['Ooyala'], - }, - { - # ooyala video embedded with http://player.ooyala.com/iframe.js - 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', - 'info_dict': { - 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', - 'ext': 'mp4', - 'title': '"Steve Jobs: Man in the Machine" trailer', - 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', - 'duration': 135.427, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'movie expired', - }, - # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js - { - 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', - 'info_dict': { - 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', - 'ext': 'mp4', - 'title': 'Steampunk Fest Comes to Honesdale', - 'duration': 43.276, - }, - 'params': { - 'skip_download': True, - } - }, - # embed.ly video - { - 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', - 'info_dict': { - 'id': '9ODmcdjQcHQ', - 'ext': 'mp4', - 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second', - 'upload_date': '20140225', - 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff', - 'uploader': 'Tested', - 'uploader_id': 'testedcom', - }, - # No need to test YoutubeIE here - 'params': { - 'skip_download': True, - }, - }, - # funnyordie embed - { - 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'info_dict': { - 'id': '18e820ec3f', - 'ext': 'mp4', - 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', - 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', - }, - # HEAD requests lead to endless 301, while GET is OK - 'expected_warnings': ['301'], - }, - # RUTV embed - { - 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', - 'info_dict': { - 'id': '776940', - 'ext': 'mp4', - 'title': 'Охотское море стало целиком российским', - 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # TVC embed - { - 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', - 'info_dict': { - 'id': '55304', - 'ext': 'mp4', - 'title': 'Дошкольное воспитание', - }, - }, - # SportBox embed - { - 'url': 'http://www.vestifinance.ru/articles/25753', - 'info_dict': { - 'id': '25753', - 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', - }, - 'playlist': [{ - 'info_dict': { - 'id': '370908', - 'title': 'Госзаказ. День 3', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370905', - 'title': 'Госзаказ. День 2', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370902', - 'title': 'Госзаказ. День 1', - 'ext': 'mp4', - } - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Myvi.ru embed - { - 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', - 'info_dict': { - 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', - 'ext': 'mp4', - 'title': 'Ужастики, русский трейлер (2015)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 153, - } - }, - # XHamster embed - { - 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', - 'info_dict': { - 'id': 'showthread', - 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', - }, - 'playlist_mincount': 7, - # This forum does not allow <iframe> syntaxes anymore - # Now HTML tags are displayed as-is - 'skip': 'No videos on this page', - }, - # Embedded TED video - { - 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': '65fdff94098e4a607385a60c5177c638', - 'info_dict': { - 'id': '1969', - 'ext': 'mp4', - 'title': 'Hidden miracles of the natural world', - 'uploader': 'Louie Schwartzberg', - 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', - } - }, - # nowvideo embed hidden behind percent encoding - { - 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', - 'md5': '2baf4ddd70f697d94b1c18cf796d5107', - 'info_dict': { - 'id': '06e53103ca9aa', - 'ext': 'flv', - 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', - 'description': 'No description', - }, - }, - # arte embed - { - 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html', - 'md5': '7653032cbb25bf6c80d80f217055fa43', - 'info_dict': { - 'id': '048195-004_PLUS7-F', - 'ext': 'flv', - 'title': 'X:enius', - 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168', - 'upload_date': '20140320', - }, - 'params': { - 'skip_download': 'Requires rtmpdump' - }, - 'skip': 'video gone', - }, - # francetv embed - { - 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'mp4', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'expected_warnings': [ - 'Forbidden' - ] - }, - # Condé Nast embed - { - 'url': 'http://www.wired.com/2014/04/honda-asimo/', - 'md5': 'ba0dfe966fa007657bd1443ee672db0f', - 'info_dict': { - 'id': '53501be369702d3275860000', - 'ext': 'mp4', - 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', - } - }, - # Dailymotion embed - { - 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', - 'md5': '441aeeb82eb72c422c7f14ec533999cd', - 'info_dict': { - 'id': 'k2mm4bCdJ6CQ2i7c8o2', - 'ext': 'mp4', - 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', - 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', - 'uploader': 'Spi0n', - 'uploader_id': 'xgditw', - 'upload_date': '20140425', - 'timestamp': 1398441542, - }, - 'add_ie': ['Dailymotion'], - }, - # DailyMail embed - { - 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', - 'info_dict': { - 'id': '1495629', - 'ext': 'mp4', - 'title': 'Care worker punches elderly dementia patient in head 11 times', - 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', - }, - 'add_ie': ['DailyMail'], - 'params': { - 'skip_download': True, - }, - }, - # YouTube embed - { - 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', - 'info_dict': { - 'id': 'FXRb4ykk4S0', - 'ext': 'mp4', - 'title': 'The NBL Auction 2014', - 'uploader': 'BADMINTON England', - 'uploader_id': 'BADMINTONEvents', - 'upload_date': '20140603', - 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', - }, - 'add_ie': ['Youtube'], - 'params': { - 'skip_download': True, - } - }, - # MTVSercices embed - { - 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', - 'md5': 'ca1aef97695ef2c1d6973256a57e5252', - 'info_dict': { - 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1', - 'ext': 'mp4', - 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', - 'description': 'Two valets share their love for movie star Liam Neesons.', - 'timestamp': 1349922600, - 'upload_date': '20121011', - }, - }, - # YouTube embed via <data-embed-url=""> - { - 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', - 'info_dict': { - 'id': '4vAffPZIT44', - 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', - 'uploader': 'Gameloft', - 'uploader_id': 'gameloft', - 'upload_date': '20140828', - 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', - }, - 'params': { - 'skip_download': True, - } - }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, - # Flowplayer - { - 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', - 'md5': '9d65602bf31c6e20014319c7d07fba27', - 'info_dict': { - 'id': '5123ea6d5e5a7', - 'ext': 'mp4', - 'age_limit': 18, - 'uploader': 'www.handjobhub.com', - 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', - } - }, - # Multiple brightcove videos - # https://github.com/rg3/youtube-dl/issues/2283 - { - 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, - # MLB embed - { - 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', - 'md5': '96f09a37e44da40dd083e12d9a683327', - 'info_dict': { - 'id': '33322633', - 'ext': 'mp4', - 'title': 'Ump changes call to ball', - 'description': 'md5:71c11215384298a172a6dcb4c2e20685', - 'duration': 48, - 'timestamp': 1401537900, - 'upload_date': '20140531', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, - # Wistia standard embed (async) - { - 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', - 'info_dict': { - 'id': '807fafadvk', - 'ext': 'mp4', - 'title': 'Drip Brennan Dunn Workshop', - 'description': 'a JV Webinars video from getdrip-1', - 'duration': 4986.95, - 'timestamp': 1463607249, - 'upload_date': '20160518', - }, - 'params': { - 'skip_download': True, - } - }, - # Soundcloud embed - { - 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', - 'info_dict': { - 'id': '174391317', - 'ext': 'mp3', - 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', - 'uploader': 'Sophos Security', - 'title': 'Chet Chat 171 - Oct 29, 2014', - 'upload_date': '20141029', - } - }, - # Soundcloud multiple embeds - { - 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', - 'info_dict': { - 'id': '52809', - 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', - }, - 'playlist_mincount': 7, - }, - # TuneIn station embed - { - 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', - 'info_dict': { - 'id': '204146', - 'ext': 'mp3', - 'title': 'CNRV', - 'location': 'Paris, France', - 'is_live': True, - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # Livestream embed - { - 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', - 'info_dict': { - 'id': '67864563', - 'ext': 'flv', - 'upload_date': '20141112', - 'title': 'Rosetta #CometLanding webcast HL 10', - } - }, - # Another Livestream embed, without 'new.' in URL - { - 'url': 'https://www.freespeech.org/', - 'info_dict': { - 'id': '123537347', - 'ext': 'mp4', - 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # LazyYT - { - 'url': 'https://skiplagged.com/', - 'info_dict': { - 'id': 'skiplagged', - 'title': 'Skiplagged: The smart way to find cheap flights', - }, - 'playlist_mincount': 1, - 'add_ie': ['Youtube'], - }, - # Cinchcast embed - { - 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', - 'info_dict': { - 'id': '7141703', - 'ext': 'mp3', - 'upload_date': '20141126', - 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', - } - }, - # Cinerama player - { - 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', - 'info_dict': { - 'id': '730m_DandD_1901_512k', - 'ext': 'mp4', - 'uploader': 'www.abc.net.au', - 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', - } - }, - # embedded viddler video - { - 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', - 'info_dict': { - 'id': '4d03aad9', - 'ext': 'mp4', - 'uploader': 'deadspin', - 'title': 'WALL-TO-GORTAT', - 'timestamp': 1422285291, - 'upload_date': '20150126', - }, - 'add_ie': ['Viddler'], - }, - # Libsyn embed - { - 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', - 'info_dict': { - 'id': '3377616', - 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', - }, - 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', - }, - # jwplayer YouTube - { - 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', - 'info_dict': { - 'id': 'Mrj4DVp2zeA', - 'ext': 'mp4', - 'upload_date': '20150212', - 'uploader': 'The National Archives UK', - 'description': 'md5:8078af856dca76edc42910b61273dbbf', - 'uploader_id': 'NationalArchives08', - 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', - }, - }, - # jwplayer rtmp - { - 'url': 'http://www.suffolk.edu/sjc/live.php', - 'info_dict': { - 'id': 'live', - 'ext': 'flv', - 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', - 'uploader': 'www.suffolk.edu', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', - }, - # Complex jwplayer - { - 'url': 'http://www.indiedb.com/games/king-machine/videos', - 'info_dict': { - 'id': 'videos', - 'ext': 'mp4', - 'title': 'king machine trailer 1', - 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - # JWPlayer config passed as variable - 'url': 'http://www.txxx.com/videos/3326530/ariele/', - 'info_dict': { - 'id': '3326530_hq', - 'ext': 'mp4', - 'title': 'ARIELE | Tube Cup', - 'uploader': 'www.txxx.com', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, - { - # JWPlatform iframe - 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', - 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', - 'info_dict': { - 'id': 'O0c5JcKT', - 'ext': 'mp4', - 'upload_date': '20171122', - 'timestamp': 1511366290, - 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, - { - # Video.js embed, multiple formats - 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', - 'info_dict': { - 'id': 'yygqldloqIk', - 'ext': 'mp4', - 'title': 'SolidWorks. Урок 6 Настройка чертежа', - 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', - 'upload_date': '20130314', - 'uploader': 'PROстое3D', - 'uploader_id': 'PROstoe3D', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Video.js embed, single format - 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', - 'info_dict': { - 'id': 'watch', - 'ext': 'mp4', - 'title': 'Step 1 - Good Foundation', - 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', - }, - 'params': { - 'skip_download': True, - }, - }, - # rtl.nl embed - { - 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen', - } - }, - # Zapiks embed - { - 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', - 'info_dict': { - 'id': '118046', - 'ext': 'mp4', - 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', - } - }, - # Kaltura embed (different embed code) - { - 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', - 'info_dict': { - 'id': '1_a52wc67y', - 'ext': 'flv', - 'upload_date': '20150127', - 'uploader_id': 'PremierMedia', - 'timestamp': int, - 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', - }, - }, - # Kaltura embed with single quotes - { - 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', - 'info_dict': { - 'id': '0_izeg5utt', - 'ext': 'mp4', - 'title': '35871', - 'timestamp': 1355743100, - 'upload_date': '20121217', - 'uploader_id': 'cplapp@learn360.com', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura embedded via quoted entry_id - 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', - 'info_dict': { - 'id': '0_utuok90b', - 'ext': 'mp4', - 'title': '06_matthew_brender_raj_dutt', - 'timestamp': 1466638791, - 'upload_date': '20160622', - }, - 'add_ie': ['Kaltura'], - 'expected_warnings': [ - 'Could not send HEAD request' - ], - 'params': { - 'skip_download': True, - } - }, - { - # Kaltura embedded, some fileExt broken (#11480) - 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', - 'info_dict': { - 'id': '1_sgtvehim', - 'ext': 'mp4', - 'title': 'Our "Standard Models" of particle physics and cosmology', - 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', - 'timestamp': 1321158993, - 'upload_date': '20111113', - 'uploader_id': 'kps1', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed - 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', - 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', - 'info_dict': { - 'id': '0_f2cfbpwy', - 'ext': 'mp4', - 'title': 'I. M. Pei: A Centennial Celebration', - 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', - 'upload_date': '20170403', - 'uploader_id': 'batchUser', - 'timestamp': 1491232186, - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed, more sophisticated - 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', - 'info_dict': { - 'id': '1_9gzouybz', - 'ext': 'mp4', - 'title': 'lecture-05sep2017', - 'description': 'md5:40f347d91fd4ba047e511c5321064b49', - 'upload_date': '20170913', - 'uploader_id': 'eps2', - 'timestamp': 1505340777, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - { - # meta twitter:player - 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', - 'info_dict': { - 'id': '0_01b42zps', - 'ext': 'mp4', - 'title': 'Main Twerk (Video)', - 'upload_date': '20171208', - 'uploader_id': 'sebastian.salinas@thechive.com', - 'timestamp': 1512713057, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - # referrer protected EaglePlatform embed - { - 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', - 'info_dict': { - 'id': '582306', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 3382, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - # ClipYou (EaglePlatform) embed (custom URL) - { - 'url': 'http://muz-tv.ru/play/7129/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '12820', - 'ext': 'mp4', - 'title': "'O Sole Mio", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 216, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is unavailable.', - }, - # Pladform embed - { - 'url': 'http://muz-tv.ru/kinozal/view/7400/', - 'info_dict': { - 'id': '100183293', - 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - # Playwire embed - { - 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html', - 'info_dict': { - 'id': '3519514', - 'ext': 'mp4', - 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 45.115, - }, - }, - # 5min embed - { - 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', - 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', - 'info_dict': { - 'id': '518726732', - 'ext': 'mp4', - 'title': 'Facebook Creates "On This Day" | Crunch Report', - 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', - 'timestamp': 1427237531, - 'uploader': 'Crunch Report', - 'upload_date': '20150324', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Crooks and Liars embed - { - 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', - 'info_dict': { - 'id': '8RUoRhRi', - 'ext': 'mp4', - 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", - 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', - 'timestamp': 1428207000, - 'upload_date': '20150405', - 'uploader': 'Heather', - }, - }, - # Crooks and Liars external embed - { - 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/', - 'info_dict': { - 'id': 'MTE3MjUtMzQ2MzA', - 'ext': 'mp4', - 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5', - 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec', - 'timestamp': 1265032391, - 'upload_date': '20100201', - 'uploader': 'Heather', - }, - }, - # NBC Sports vplayer embed - { - 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', - 'info_dict': { - 'id': 'ln7x1qSThw4k', - 'ext': 'flv', - 'title': "PFT Live: New leader in the 'new-look' defense", - 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', - 'uploader': 'NBCU-SPORTS', - 'upload_date': '20140107', - 'timestamp': 1389118457, - }, - 'skip': 'Invalid Page URL', - }, - # NBC News embed - { - 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', - 'md5': '1aa589c675898ae6d37a17913cf68d66', - 'info_dict': { - 'id': 'x_dtl_oa_LettermanliftPR_160608', - 'ext': 'mp4', - 'title': 'David Letterman: A Preview', - 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', - 'upload_date': '20160609', - 'timestamp': 1465431544, - 'uploader': 'NBCU-NEWS', - }, - }, - # UDN embed - { - 'url': 'https://video.udn.com/news/300346', - 'md5': 'fd2060e988c326991037b9aff9df21a6', - 'info_dict': { - 'id': '300346', - 'ext': 'mp4', - 'title': '中一中男師變性 全校師生力挺', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON Expecting value'], - }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, - # OnionStudios embed - { - 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', - 'info_dict': { - 'id': '2855', - 'ext': 'mp4', - 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', - } - }, - # SnagFilms embed - { - 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', - 'info_dict': { - 'id': '74849a00-85a9-11e1-9660-123139220831', - 'ext': 'mp4', - 'title': '#whilewewatch', - } - }, - # AdobeTVVideo embed - { - 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', - 'md5': '43662b577c018ad707a63766462b1e87', - 'info_dict': { - 'id': '2456', - 'ext': 'mp4', - 'title': 'New experience with Acrobat DC', - 'description': 'New experience with Acrobat DC', - 'duration': 248.667, - }, - }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, - # Another form of arte.tv embed - { - 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', - 'md5': '850bfe45417ddf221288c88a0cffe2e2', - 'info_dict': { - 'id': '030273-562_PLUS7-F', - 'ext': 'mp4', - 'title': 'ARTE Reportage - Nulle part, en France', - 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', - 'upload_date': '20160409', - }, - }, - # LiveLeak embed - { - 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': '7619da8c820e835bef21a1efa2a0fc71', - 'info_dict': { - 'id': '874_1459135191', - 'ext': 'mp4', - 'title': 'Man shows poor quality of new apartment building', - 'description': 'The wall is like a sand pile.', - 'uploader': 'Lake8737', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Another LiveLeak embed pattern (#13336) - { - 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', - 'info_dict': { - 'id': '2eb_1496309988', - 'ext': 'mp4', - 'title': 'Thief robs place where everyone was armed', - 'description': 'md5:694d73ee79e535953cf2488562288eee', - 'uploader': 'brazilwtf', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Duplicated embedded video URLs - { - 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', - 'info_dict': { - 'id': '149298443_480_16c25b74_2', - 'ext': 'mp4', - 'title': 'vs. Blue Orange Spring Game', - 'uploader': 'www.hudl.com', - }, - }, - # twitter:player:stream embed - { - 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', - 'info_dict': { - 'id': 'master', - 'ext': 'mp4', - 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', - 'uploader': 'www.rtl.be', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - # twitter:player embed - { - 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', - 'md5': 'a3e0df96369831de324f0778e126653c', - 'info_dict': { - 'id': '4909620399001', - 'ext': 'mp4', - 'title': 'What Do Black Holes Sound Like?', - 'description': 'what do black holes sound like', - 'upload_date': '20160524', - 'uploader_id': '29913724001', - 'timestamp': 1464107587, - 'uploader': 'TheAtlantic', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - # Facebook <iframe> embed - { - 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', - 'md5': 'fbcde74f534176ecb015849146dd3aee', - 'info_dict': { - 'id': '599637780109885', - 'ext': 'mp4', - 'title': 'Facebook video #599637780109885', - }, - }, - # Facebook <iframe> embed, plugin video - { - 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', - 'info_dict': { - 'id': '1754168231264132', - 'ext': 'mp4', - 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', - 'uploader': 'Tariq Ramadan (official)', - 'timestamp': 1496758379, - 'upload_date': '20170606', - }, - 'params': { - 'skip_download': True, - }, - }, - # Facebook API embed - { - 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', - 'md5': 'a47372ee61b39a7b90287094d447d94e', - 'info_dict': { - 'id': '10153467542406923', - 'ext': 'mp4', - 'title': 'Facebook video #10153467542406923', - }, - }, - # Wordpress "YouTube Video Importer" plugin - { - 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', - 'md5': 'd16797741b560b485194eddda8121b48', - 'info_dict': { - 'id': 'HNTXWDXV9Is', - 'ext': 'mp4', - 'title': 'Blue Devils Drumline Stanford lot 2016', - 'upload_date': '20160627', - 'uploader_id': 'GENOCIDE8GENERAL10', - 'uploader': 'cylus cyrus', - }, - }, - { - # video stored on custom kaltura server - 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', - 'md5': '537617d06e64dfed891fa1593c4b30cc', - 'info_dict': { - 'id': '0_1iotm5bh', - 'ext': 'mp4', - 'title': 'Elecciones británicas: 5 lecciones para Rajoy', - 'description': 'md5:435a89d68b9760b92ce67ed227055f16', - 'uploader_id': 'videos.expansion@el-mundo.net', - 'upload_date': '20150429', - 'timestamp': 1430303472, - }, - 'add_ie': ['Kaltura'], - }, - { - # Non-standard Vimeo embed - 'url': 'https://openclassrooms.com/courses/understanding-the-web', - 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', - 'info_dict': { - 'id': '148867247', - 'ext': 'mp4', - 'title': 'Understanding the web - Teaser', - 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', - 'upload_date': '20151214', - 'uploader': 'OpenClassrooms', - 'uploader_id': 'openclassrooms', - }, - 'add_ie': ['Vimeo'], - }, - { - # generic vimeo embed that requires original URL passed as Referer - 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', - 'only_matching': True, - }, - { - 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', - 'info_dict': { - 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'timestamp': 1432816365, - 'upload_date': '20150528', - 'is_live': False, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [ArkenaIE.ie_key()], - }, - { - 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', - 'info_dict': { - 'id': '1c7141f46c', - 'ext': 'mp4', - 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [Vbox7IE.ie_key()], - }, - { - # DBTV embeds - 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', - 'info_dict': { - 'id': '43254897', - 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', - }, - 'playlist_mincount': 3, - }, - { - # Videa embeds - 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', - 'info_dict': { - 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', - 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum', - }, - 'playlist_mincount': 2, - }, - { - # 20 minuten embed - 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', - 'info_dict': { - 'id': '523629', - 'ext': 'mp4', - 'title': 'So kommen Sie bei Eis und Schnee sicher an', - 'description': 'md5:117c212f64b25e3d95747e5276863f7d', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [TwentyMinutenIE.ie_key()], - }, - { - # VideoPress embed - 'url': 'https://en.support.wordpress.com/videopress/', - 'info_dict': { - 'id': 'OcobLTqC', - 'ext': 'm4v', - 'title': 'IMG_5786', - 'timestamp': 1435711927, - 'upload_date': '20150701', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [VideoPressIE.ie_key()], - }, - { - # Rutube embed - 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', - 'info_dict': { - 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', - 'ext': 'flv', - 'title': 'Магаззино: Казань 2', - 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', - 'uploader': 'Магаззино', - 'upload_date': '20170228', - 'uploader_id': '996642', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [RutubeIE.ie_key()], - }, - { - # ThePlatform embedded with whitespaces in URLs - 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', - 'only_matching': True, - }, - { - # Senate ISVP iframe https - 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', - 'md5': 'fb8c70b0b515e5037981a2492099aab8', - 'info_dict': { - 'id': 'govtaff020316', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - }, - 'add_ie': [SenateISVPIE.ie_key()], - }, - { - # Limelight embeds (1 channel embed + 4 media embeds) - 'url': 'http://www.sedona.com/FacilitatorTraining2017', - 'info_dict': { - 'id': 'FacilitatorTraining2017', - 'title': 'Facilitator Training 2017', - }, - 'playlist_mincount': 5, - }, - { - # Limelight embed (LimelightPlayerUtil.embed) - 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', - 'info_dict': { - 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', - 'ext': 'mp4', - 'title': '07448641', - 'timestamp': 1499890639, - 'upload_date': '20170712', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['LimelightMedia'], - }, - { - 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', - 'info_dict': { - 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', - 'title': 'Standoff with Walnut Creek murder suspect ends', - 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', - }, - 'playlist_mincount': 4, - }, - { - # WashingtonPost embed - 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', - 'info_dict': { - 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', - 'ext': 'mp4', - 'title': "No one has seen the drama series based on Trump's life \u2014 until now", - 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', - 'timestamp': 1455216756, - 'uploader': 'The Washington Post', - 'upload_date': '20160211', - }, - 'add_ie': [WashingtonPostIE.ie_key()], - }, - { - # Mediaset embed - 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', - 'info_dict': { - 'id': '720642', - 'ext': 'mp4', - 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [MediasetIE.ie_key()], - }, - { - # JOJ.sk embeds - 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'info_dict': { - 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'title': 'Slovenskom sa prehnala vlna silných búrok', - }, - 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], - }, - { - # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) - 'url': 'https://tvrain.ru/amp/418921/', - 'md5': 'cc00413936695987e8de148b67d14f1d', - 'info_dict': { - 'id': '418921', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - }, - }, - { - # vzaar embed - 'url': 'http://help.vzaar.com/article/165-embedding-video', - 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', - 'info_dict': { - 'id': '8707641', - 'ext': 'mp4', - 'title': 'Building A Business Online: Principal Chairs Q & A', - }, - }, - { - # multiple HTML5 videos on one page - 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', - 'info_dict': { - 'id': 'keyscenarios', - 'title': 'Rescue Kit 14 Free Edition - Getting started', - }, - 'playlist_count': 4, - }, - { - # vshare embed - 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', - 'md5': '17b39f55b5497ae8b59f5fbce8e35886', - 'info_dict': { - 'id': '0f64ce6', - 'title': 'vl14062007715967', - 'ext': 'mp4', - } - }, - { - 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', - 'md5': 'aecd089f55b1cb5a59032cb049d3a356', - 'info_dict': { - 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', - 'ext': 'mp4', - 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', - 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', - 'timestamp': 1474354800, - 'upload_date': '20160920', - } - }, - { - 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', - 'info_dict': { - 'id': '1731611', - 'ext': 'mp4', - 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', - 'description': 'md5:eb5f23826a027ba95277d105f248b825', - 'timestamp': 1516100691, - 'upload_date': '20180116', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [SpringboardPlatformIE.ie_key()], - }, - { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': [YoutubeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', - 'info_dict': { - 'id': 'vMDE4NzI1Mjgt690b', - 'ext': 'mp4', - 'title': 'Котята', - }, - 'add_ie': [YapFilesIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # CloudflareStream embed - 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', - 'info_dict': { - 'id': '31c9291ab41fac05471db4e73aa11717', - 'ext': 'mp4', - 'title': '31c9291ab41fac05471db4e73aa11717', - }, - 'add_ie': [CloudflareStreamIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # PeerTube embed - 'url': 'https://joinpeertube.org/fr/home/', - 'info_dict': { - 'id': 'home', - 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', - }, - 'playlist_count': 2, - }, - { - # Indavideo embed - 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', - 'info_dict': { - 'id': '1693903', - 'ext': 'mp4', - 'title': 'Így kell otthon hamburgert sütni', - 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', - 'timestamp': 1426330212, - 'upload_date': '20150314', - 'uploader': 'StreetKitchen', - 'uploader_id': '546363', - }, - 'add_ie': [IndavideoEmbedIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # APA embed via JWPlatform embed - 'url': 'http://www.vol.at/blue-man-group/5593454', - 'info_dict': { - 'id': 'jjv85FdZ', - 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://share-videos.se/auto/video/83645793?uid=13', - 'md5': 'b68d276de422ab07ee1d49388103f457', - 'info_dict': { - 'id': '83645793', - 'title': 'Lock up and get excited', - 'ext': 'mp4' - }, - 'skip': 'TODO: fix nested playlists processing in tests', - }, - # { - # # TODO: find another test - # # http://schema.org/VideoObject - # 'url': 'https://flipagram.com/f/nyvTSJMKId', - # 'md5': '888dcf08b7ea671381f00fab74692755', - # 'info_dict': { - # 'id': 'nyvTSJMKId', - # 'ext': 'mp4', - # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', - # 'description': '#love for cats.', - # 'timestamp': 1461244995, - # 'upload_date': '20160421', - # }, - # 'params': { - # 'force_generic_extractor': True, - # }, - # } - ] - - def report_following_redirect(self, new_url): - """Report information extraction.""" - self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - - entries = [] - for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - - if not next_url: - continue - - entries.append({ - '_type': 'url_transparent', - 'url': next_url, - 'title': it.find('title').text, - }) - - return { - '_type': 'playlist', - 'id': url, - 'title': playlist_title, - 'description': playlist_desc, - 'entries': entries, - } - - def _extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. """ - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': title, - } - - def _real_extract(self, url): - if url.startswith('//'): - return { - '_type': 'url', - 'url': self.http_scheme() + url, - } - - parsed_url = compat_urlparse.urlparse(url) - if not parsed_url.scheme: - default_search = self._downloader.params.get('default_search') - if default_search is None: - default_search = 'fixup_error' - - if default_search in ('auto', 'auto_warning', 'fixup_error'): - if '/' in url: - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) - elif default_search != 'fixup_error': - if default_search == 'auto_warning': - if re.match(r'^(?:url|URL)$', url): - raise ExtractorError( - 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, - expected=True) - else: - self._downloader.report_warning( - 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) - return self.url_result('ytsearch:' + url) - - if default_search in ('error', 'fixup_error'): - raise ExtractorError( - '%r is not a valid URL. ' - 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' - % (url, url), expected=True) - else: - if ':' not in default_search: - default_search += ':' - return self.url_result(default_search + url) - - url, smuggled_data = unsmuggle_url(url) - force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: - force_videoid = smuggled_data['force_videoid'] - video_id = force_videoid - else: - video_id = self._generic_id(url) - - self.to_screen('%s: Requesting header' % video_id) - - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - - if head_response is not False: - # Check for redirect - new_url = compat_str(head_response.geturl()) - if url != new_url: - self.report_following_redirect(new_url) - if force_videoid: - new_url = smuggle_url( - new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) - - full_response = None - if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response - - info_dict = { - 'id': video_id, - 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) - } - - # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '').lower() - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) - if m: - format_id = compat_str(m.group('format_id')) - if format_id.endswith('mpegurl'): - formats = self._extract_m3u8_formats(url, video_id, 'mp4') - elif format_id == 'f4m': - formats = self._extract_f4m_formats(url, video_id) - else: - formats = [{ - 'format_id': format_id, - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }] - info_dict['direct'] = True - self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - - if not self._downloader.params.get('test', False) and not is_intentional: - force = self._downloader.params.get('force_generic_extractor', False) - self._downloader.report_warning( - '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) - - if not full_response: - request = sanitized_Request(url) - # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) - # making it impossible to download only chunk of the file (yet we need only 512kB to - # test whether it's HTML or not). According to youtube-dl default Accept-Encoding - # that will always result in downloading the whole file that is not desirable. - # Therefore for extraction pass we have to override Accept-Encoding to any in order - # to accept raw bytes and being able to download only a chunk. - # It may probably better to solve this by checking Content-Type for application/octet-stream - # after HEAD request finishes, but not sure if we can rely on this. - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - - first_bytes = full_response.read(512) - - # Is it an M3U playlist? - if first_bytes.startswith(b'#EXTM3U'): - info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') - self._sort_formats(info_dict['formats']) - return info_dict - - # Maybe it's a direct link to a video? - # Be careful not to download the whole thing! - if not is_html(first_bytes): - self._downloader.report_warning( - 'URL could be a direct video link, returning it as such.') - info_dict.update({ - 'direct': True, - 'url': url, - }) - return info_dict - - webpage = self._webpage_read_content( - full_response, url, video_id, prefix=first_bytes) - - self.report_extraction(video_id) - - # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? - try: - doc = compat_etree_fromstring(webpage.encode('utf-8')) - if doc.tag == 'rss': - return self._extract_rss(url, video_id, doc) - elif doc.tag == 'SmoothStreamingMedia': - info_dict['formats'] = self._parse_ism_formats(doc, url) - self._sort_formats(info_dict['formats']) - return info_dict - elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - smil = self._parse_smil(doc, url, video_id) - self._sort_formats(smil['formats']) - return smil - elif doc.tag == '{http://xspf.org/ns/0/}playlist': - return self.playlist_result( - self._parse_xspf( - doc, video_id, xspf_url=url, - xspf_base_url=compat_str(full_response.geturl())), - video_id) - elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( - doc, - mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], - mpd_url=url) - self._sort_formats(info_dict['formats']) - return info_dict - elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): - info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) - self._sort_formats(info_dict['formats']) - return info_dict - except compat_xml_parse_error: - pass - - # Is it a Camtasia project? - camtasia_res = self._extract_camtasia(url, video_id, webpage) - if camtasia_res is not None: - return camtasia_res - - # Sometimes embedded video player is hidden behind percent encoding - # (e.g. https://github.com/rg3/youtube-dl/issues/2448) - # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) - - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, 'video title', - default='video') - - # Try to detect age limit automatically - age_limit = self._rta_search(webpage) - # And then there are the jokers who advertise that they use RTA, - # but actually don't. - AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - # Look for Vessel embeds - vessel_urls = VesselIE._extract_urls(webpage) - if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for embedded Wistia player - wistia_url = WistiaIE._extract_url(webpage) - if wistia_url: - return { - '_type': 'url_transparent', - 'url': self._proto_relative_url(wistia_url), - 'ie_key': WistiaIE.ie_key(), - 'uploader': video_uploader, - } - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or - re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or - re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or - re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or - re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded NovaMov-based player - mobj = re.search( - r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) - (?P<url>http://(?:(?:embed|www)\.)? - (?:novamov\.com| - nowvideo\.(?:ch|sx|eu|at|ag|co)| - videoweed\.(?:es|com)| - movshare\.(?:net|sx|ag)| - divxstage\.(?:eu|net|ch|co|at|ag)) - /embed\.php.+?)\1''', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') - - # Look for embedded Odnoklassniki player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Odnoklassniki') - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'TED') - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded smotri.com player - smotri_url = SmotriIE._extract_url(webpage) - if smotri_url: - return self.url_result(smotri_url, 'Smotri') - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_url = KalturaIE._extract_url(webpage) - if kaltura_url: - return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for 5min embeds - mobj = re.search( - r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) - if mobj is not None: - return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for LiveLeak embeds - liveleak_urls = LiveLeakIE._extract_urls(webpage) - if liveleak_urls: - return self.playlist_from_matches(liveleak_urls, video_id, video_title) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for Openload embeds - openload_urls = OpenloadIE._extract_urls(webpage) - if openload_urls: - return self.playlist_from_matches( - openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) - - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - if jwplayer_data: - info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=url) - return merge_dicts(info, info_dict) - - # Video.js embed - mobj = re.search( - r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', - webpage) - if mobj is not None: - sources = self._parse_json( - mobj.group(1), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] - formats = [] - for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): - continue - src = compat_urlparse.urljoin(url, src) - src_type = source.get('type') - if isinstance(src_type, compat_str): - src_type = src_type.lower() - ext = determine_ext(src).lower() - if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) - if src_type == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - elif src_type == 'application/x-mpegurl' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': (mimetype2ext(src_type) or - ext if ext in KNOWN_EXTENSIONS else 'mp4'), - }) - if formats: - self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - - def check_video(vurl): - if YoutubeIE.suitable(vurl): - return True - if RtmpIE.suitable(vurl): - return True - vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') - - def filter_video(urls): - return list(filter(check_video, urls)) - - # Start with something easy: JW Player in SWFObject - found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: - # Look for gorilla-vid style embedding - found = filter_video(re.findall(r'''(?sx) - (?: - jw_plugins| - JWPlayerOptions| - jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup - ) - .*? - ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) - if not found: - # Broaden the search a little bit - found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) - if not found: - # Broaden the findall a little bit: JWPlayer JS loader - found = filter_video(re.findall( - r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) - if not found: - # Flow player - found = filter_video(re.findall(r'''(?xs) - flowplayer\("[^"]+",\s* - \{[^}]+?\}\s*, - \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s* - ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage)) - if not found: - # Cinerama player - found = re.findall( - r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) - if not found: - # Try to find twitter cards info - # twitter:player:stream should be checked before twitter:player since - # it is expected to contain a raw stream (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - found = filter_video(re.findall( - r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) - if not found: - # We look for Open Graph info: - # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) - m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) - # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: - if m_video_type is not None: - found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) - if not found: - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - found = re.search( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - webpage) - if not found: - # Look also in Refresh HTTP header - refresh_header = head_response.headers.get('Refresh') - if refresh_header: - # In python 2 response HTTP headers are bytestrings - if sys.version_info < (3, 0) and isinstance(refresh_header, str): - refresh_header = refresh_header.decode('iso-8859-1') - found = re.search(REDIRECT_REGEX, refresh_header) - if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - if new_url != url: - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } - else: - found = None - - if not found: - # twitter:player is a https URL to iframe player that may or may not - # be supported by youtube-dl thus this is checked the very last (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url and embed_url != url: - return self.url_result(embed_url) - - if not found: - raise UnsupportedError(url) - - entries = [] - for video_url in orderedSet(found): - video_url = unescapeHTML(video_url) - video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) - - # Sometimes, jwplayer extraction will result in a YouTube URL - if YoutubeIE.suitable(video_url): - entries.append(self.url_result(video_url, 'Youtube')) - continue - - # here's a fun little line of code for you: - video_id = os.path.splitext(video_id)[0] - - entry_info_dict = { - 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, - } - - if RtmpIE.suitable(video_url): - entry_info_dict.update({ - '_type': 'url_transparent', - 'ie_key': RtmpIE.ie_key(), - 'url': video_url, - }) - entries.append(entry_info_dict) - continue - - ext = determine_ext(video_url) - if ext == 'smil': - entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) - elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) - elif ext == 'm3u8': - entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') - elif ext == 'mpd': - entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) - elif ext == 'f4m': - entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) - elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: - # Just matching .ism/manifest is not enough to be reliably sure - # whether it's actually an ISM manifest or some other streaming - # manifest since there are various streaming URL formats - # possible (see [1]) as well as some other shenanigans like - # .smil/manifest URLs that actually serve an ISM (see [2]) and - # so on. - # Thus the most reasonable way to solve this is to delegate - # to generic extractor in order to look into the contents of - # the manifest itself. - # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats - # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest - entry_info_dict = self.url_result( - smuggle_url(video_url, {'to_generic': True}), - GenericIE.ie_key()) - else: - entry_info_dict['url'] = video_url - - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - - entries.append(entry_info_dict) - - if len(entries) == 1: - return entries[0] - else: - for num, e in enumerate(entries, start=1): - # 'url' results don't have a title - if e.get('title') is not None: - e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py deleted file mode 100644 index d264fe2..0000000 --- a/youtube_dl/extractor/openload.py +++ /dev/null @@ -1,379 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import os -import re -import subprocess -import tempfile - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_kwargs, -) -from ..utils import ( - check_executable, - determine_ext, - encodeArgument, - ExtractorError, - get_element_by_id, - get_exe_version, - is_outdated_version, - std_headers, -) - - -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - try: - if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - except TypeError: - pass - return cookie_dict - - -def cookie_jar_to_list(cookie_jar): - return [cookie_to_dict(cookie) for cookie in cookie_jar] - - -class PhantomJSwrapper(object): - """PhantomJS wrapper class - - This class is experimental. - """ - - _TEMPLATE = r''' - phantom.onError = function(msg, trace) {{ - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) {{ - msgStack.push('TRACE:'); - trace.forEach(function(t) {{ - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }}); - }} - console.error(msgStack.join('\n')); - phantom.exit(1); - }}; - var page = require('webpage').create(); - var fs = require('fs'); - var read = {{ mode: 'r', charset: 'utf-8' }}; - var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ - phantom.addCookie(x); - }}); - page.settings.resourceTimeout = {timeout}; - page.settings.userAgent = "{ua}"; - page.onLoadStarted = function() {{ - page.evaluate(function() {{ - delete window._phantom; - delete window.callPhantom; - }}); - }}; - var saveAndExit = function() {{ - fs.write("{html}", page.content, write); - fs.write("{cookies}", JSON.stringify(phantom.cookies), write); - phantom.exit(); - }}; - page.onLoadFinished = function(status) {{ - if(page.url === "") {{ - page.setContent(fs.read("{html}", read), "{url}"); - }} - else {{ - {jscode} - }} - }}; - page.open(""); - ''' - - _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - - @staticmethod - def _version(): - return get_exe_version('phantomjs', version_re=r'([0-9.]+)') - - def __init__(self, extractor, required_version=None, timeout=10000): - self._TMP_FILES = {} - - self.exe = check_executable('phantomjs', ['-v']) - if not self.exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - - self.extractor = extractor - - if required_version: - version = self._version() - if is_outdated_version(version, required_version): - self.extractor._downloader.report_warning( - 'Your copy of PhantomJS is outdated, update it to version ' - '%s or newer if you encounter any errors.' % required_version) - - self.options = { - 'timeout': timeout, - } - for name in self._TMP_FILE_NAMES: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.close() - self._TMP_FILES[name] = tmp - - def __del__(self): - for name in self._TMP_FILE_NAMES: - try: - os.remove(self._TMP_FILES[name].name) - except (IOError, OSError, KeyError): - pass - - def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) - for cookie in cookies: - if 'path' not in cookie: - cookie['path'] = '/' - if 'domain' not in cookie: - cookie['domain'] = compat_urlparse.urlparse(url).netloc - with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode('utf-8')) - - def _load_cookies(self): - with open(self._TMP_FILES['cookies'].name, 'rb') as f: - cookies = json.loads(f.read().decode('utf-8')) - for cookie in cookies: - if cookie['httponly'] is True: - cookie['rest'] = {'httpOnly': None} - if 'expiry' in cookie: - cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**compat_kwargs(cookie)) - - def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): - """ - Downloads webpage (if needed) and executes JS - - Params: - url: website url - html: optional, html code of website - video_id: video id - note: optional, displayed when downloading webpage - note2: optional, displayed when executing JS - headers: custom http headers - jscode: code to be executed when page is loaded - - Returns tuple with: - * downloaded website (after JS execution) - * anything you print with `console.log` (but not inside `page.execute`!) - - In most cases you don't need to add any `jscode`. - It is executed in `page.onLoadFinished`. - `saveAndExit();` is mandatory, use it instead of `phantom.exit()` - It is possible to wait for some element on the webpage, for example: - var check = function() { - var elementFound = page.evaluate(function() { - return document.querySelector('#b.done') !== null; - }); - if(elementFound) - saveAndExit(); - else - window.setTimeout(check, 500); - } - - page.evaluate(function(){ - document.querySelector('#a').click(); - }); - check(); - """ - if 'saveAndExit();' not in jscode: - raise ExtractorError('`saveAndExit();` not found in `jscode`') - if not html: - html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) - with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode('utf-8')) - - self._save_cookies(url) - - replaces = self.options - replaces['url'] = url - user_agent = headers.get('User-Agent') or std_headers['User-Agent'] - replaces['ua'] = user_agent.replace('"', '\\"') - replaces['jscode'] = jscode - - for x in self._TMP_FILE_NAMES: - replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - - with open(self._TMP_FILES['script'].name, 'wb') as f: - f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) - - if video_id is None: - self.extractor.to_screen('%s' % (note2,)) - else: - self.extractor.to_screen('%s: %s' % (video_id, note2)) - - p = subprocess.Popen([ - self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate() - if p.returncode != 0: - raise ExtractorError( - 'Executing JS failed\n:' + encodeArgument(err)) - with open(self._TMP_FILES['html'].name, 'rb') as f: - html = f.read().decode('utf-8') - - self._load_cookies() - - return (html, encodeArgument(out)) - - -class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' - - _TESTS = [{ - 'url': 'https://openload.co/f/kUEfGclsU9o', - 'md5': 'bf1c059b004ebc7a256f89408e65c36e', - 'info_dict': { - 'id': 'kUEfGclsU9o', - 'ext': 'mp4', - 'title': 'skyrim_no-audio_1080.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://openload.co/embed/rjC09fkPLYs', - 'info_dict': { - 'id': 'rjC09fkPLYs', - 'ext': 'mp4', - 'title': 'movie.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': { - 'en': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, # test subtitles only - }, - }, { - 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', - 'only_matching': True, - }, { - 'url': 'https://openload.io/f/ZAn6oz-VZGE/', - 'only_matching': True, - }, { - 'url': 'https://openload.co/f/_-ztPaZtMhM/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout - # for title and ext - 'url': 'https://openload.co/embed/Sxz5sADo82g/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available - # via https://openload.co/f/e-Ixz9ZR5L0/ - 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', - 'only_matching': True, - }, { - 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', - 'only_matching': True, - }, { - 'url': 'http://www.openload.link/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.stream/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', - 'only_matching': True, - }, { - 'url': 'https://oload.win/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.download/f/kUEfGclsU9o', - 'only_matching': True, - }, { - # Its title has not got its extension but url has it - 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', - 'only_matching': True, - }] - - _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?(?:openload\.(?:co|io)|oload\.tv)/embed/[a-zA-Z0-9-_]+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id - headers = { - 'User-Agent': self._USER_AGENT, - } - - for path in ('embed', 'f'): - page_url = url_pattern % path - last = path == 'f' - webpage = self._download_webpage( - page_url, video_id, 'Downloading %s webpage' % path, - headers=headers, fatal=last) - if not webpage: - continue - if 'File not found' in webpage or 'deleted by the owner' in webpage: - if not last: - continue - raise ExtractorError('File not found', expected=True, video_id=video_id) - break - - phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) - - decoded_id = (get_element_by_id('streamurl', webpage) or - get_element_by_id('streamuri', webpage) or - get_element_by_id('streamurj', webpage) or - self._search_regex( - (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', - r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, - 'stream URL')) - - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - entries = self._parse_html5_media_entries(page_url, webpage, video_id) - entry = entries[0] if entries else {} - subtitles = entry.get('subtitles') - - info_dict = { - 'id': video_id, - 'title': title, - 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), - 'url': video_url, - 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), - 'subtitles': subtitles, - 'http_headers': headers, - } - return info_dict diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py deleted file mode 100644 index 85be28d..0000000 --- a/youtube_dl/extractor/youtube.py +++ /dev/null @@ -1,3264 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - -import itertools -import json -import os.path -import random -import re -import time -import traceback -import html - -from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_kwargs, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, - compat_str, -) -from ..utils import ( - clean_html, - dict_get, - error_to_compat_str, - ExtractorError, - float_or_none, - get_element_by_attribute, - get_element_by_id, - int_or_none, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - qualities, - remove_quotes, - remove_start, - smuggle_url, - str_or_none, - str_to_int, - try_get, - unescapeHTML, - unified_strdate, - unsmuggle_url, - uppercase_escape, - url_or_none, - urlencode_postdata, -) -class YoutubeError(Exception): - pass - -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' - - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _download_webpage_handle(self, *args, **kwargs): - query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' - kwargs['query'] = query - return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - def _real_initialize(self): - if self._downloader is None: - return - self._set_language() - if not self._login(): - return - - -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(self._VIDEO_RE, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - if video_title: - video_title = video_title.strip() - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) - - -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - (?:(?:www|dev)\.)?invidio\.us/| - (?:www\.)?invidiou\.sh/| - (?:www\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?vid\.wxzm\.sx/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) - (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - 'creator': 'Taylor Swift', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed) - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': '6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 - 'uploader': 'Airtek', - 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', - 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '135', # bestvideo - }, - 'skip': 'This live event has ended.', - }, - { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': '6h8e8xoXJzg', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'PUOgX5z9xZw', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'teuwxikvS5k', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }], - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) - 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', - 'info_dict': { - 'id': 'gVfLd0zydlo', - 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', - }, - 'playlist_count': 2, - 'skip': 'Not multifeed anymore', - }, - { - 'url': 'https://vid.plus/FlRa-iH7PGw', - 'only_matching': True, - }, - { - 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', - 'only_matching': True, - }, - { - # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) - # Also tests cut-off URL expansion in video description (see - # https://github.com/ytdl-org/youtube-dl/issues/1892, - # https://github.com/ytdl-org/youtube-dl/issues/8164) - 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', - 'info_dict': { - 'id': 'lsguqyKfVQg', - 'ext': 'mp4', - 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', - 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', - 'duration': 133, - 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', - 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) - 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', - 'only_matching': True, - }, - { - # Video with yt:stretch=17:0 - 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', - 'info_dict': { - 'id': 'Q39EVAstoRM', - 'ext': 'mp4', - 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', - 'description': 'md5:ee18a25c350637c8faff806845bddee9', - 'upload_date': '20151107', - 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', - 'uploader': 'CH GAMER DROID', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video does not exist.', - }, - { - # Video licensed under Creative Commons - 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', - 'info_dict': { - 'id': 'M4gD1WSo5mA', - 'ext': 'mp4', - 'title': 'md5:e41008789470fc2533a3252216f1c1d1', - 'description': 'md5:a677553cf0840649b731a3024aeff4cc', - 'duration': 721, - 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'The Berkman Klein Center for Internet & Society', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Channel-like uploader_url - 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', - 'info_dict': { - 'id': 'eQcmzGIKrzg', - 'ext': 'mp4', - 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', - 'duration': 4060, - 'upload_date': '20151119', - 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', - 'only_matching': True, - }, - { - # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) - 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', - 'only_matching': True, - }, - { - # Rental video preview - 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', - 'info_dict': { - 'id': 'uGpuVWrhIzE', - 'ext': 'mp4', - 'title': 'Piku - Trailer', - 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', - 'upload_date': '20150811', - 'uploader': 'FlixMatrix', - 'uploader_id': 'FlixMatrixKaravan', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', - 'license': 'Standard YouTube License', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # YouTube Red video with episode data - 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', - 'info_dict': { - 'id': 'iqKdEhx-dD4', - 'ext': 'mp4', - 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', - 'duration': 2085, - 'upload_date': '20170118', - 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'series': 'Mind Field', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': [ - 'Skipping DASH manifest', - ], - }, - { - # The following content has been identified by the YouTube community - # as inappropriate or offensive to some audiences. - 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', - 'info_dict': { - 'id': '6SJNVb0GnPI', - 'ext': 'mp4', - 'title': 'Race Differences in Intelligence', - 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', - 'duration': 965, - 'upload_date': '20140124', - 'uploader': 'New Century Foundation', - 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # itag 212 - 'url': '1t24XAntNCY', - 'only_matching': True, - }, - { - # geo restricted to JP - 'url': 'sJL6WA-aGkQ', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, - { - 'url': 'https://invidio.us/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - # DRM protected - 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', - 'only_matching': True, - }, - { - # Video with unsupported adaptive stream type formats - 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', - 'info_dict': { - 'id': 'Z4Vy8R84T1U', - 'ext': 'mp4', - 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 433, - 'upload_date': '20130923', - 'uploader': 'Amelia Putri Harwita', - 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', - 'formats': 'maxcount:10', - }, - 'params': { - 'skip_download': True, - 'youtube_include_dash_manifest': False, - }, - }, - { - # Youtube Music Auto-generated description - 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', - 'info_dict': { - 'id': 'MgNrAu2pzNs', - 'ext': 'mp4', - 'title': 'Voyeur Girl', - 'description': 'md5:7ae382a65843d6df2685993e90a8628f', - 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', - 'artist': 'Stephen', - 'track': 'Voyeur Girl', - 'album': 'it\'s too much love to know my dear', - 'release_date': '20190313', - 'release_year': 2019, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - ] - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._player_cache = {} - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', - player_url) - if not id_m: - raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') - - # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end + step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - # Quelch pyflakes warnings - start will be set when step is set - start = '(Never used)' - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - player_config = self._get_ytplayer_config(video_id, webpage) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not player_config: - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): - playback_url = url_or_none(try_get( - player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) - if not playback_url: - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) - - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) - - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] - - # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) - - # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None - - @classmethod - def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - def _extract_annotations(self, video_id): - return self._download_webpage( - 'https://www.youtube.com/annotations_invideo', video_id, - note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - query={ - 'features': 1, - 'legacy': 1, - 'video_id': video_id, - }) - - @staticmethod - def _extract_chapters(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - ul_tag_pattern = re.compile(r'(</?ul)') - music_info_pattern = re.compile(r'<h4 class="title">\s*(Song|Music|Artist|Album)\s*</h4>\s*<ul class="content watch-info-tag-list">\s*<li>(?:<a[^>]*>)?([^<]*)(?:</a>)?</li>') - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') - - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) - - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) - - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - - player_response = {} - - - - # Is it unlisted? - unlisted = ('<span id="watch-privacy-icon"' in video_webpage) - - - # Related videos - related_vids = [] - try: - rvs_match = re.search(r'"rvs":"(.*?)[^\\]"', video_webpage) - if rvs_match is not None: - rvs = json.loads('"' + rvs_match.group(1) + '"') # unescape json string (\u0026 for example) - related_vid_parts = (compat_parse_qs(related_item) for related_item in rvs.split(",")) - related_vids = [{key : value[0] for key,value in vid.items()} for vid in related_vid_parts] - else: - print('Failed to extract related videos: no rvs') - - except Exception: - print('Error while extracting related videos:') - traceback.print_exc() - - - # Music list - # Test case: https://www.youtube.com/watch?v=jbkZdRglnKY - music_list = [] - metadata_start = video_webpage.find('<ul class="watch-extras-section">') - if metadata_start != -1: - metadata_start += 33 - tag_index = metadata_start - open_tags = 1 - while open_tags > 0: - match = self.ul_tag_pattern.search(video_webpage, tag_index) - if match is None: - print("Couldn't match ul tag") - break - tag_index = match.end() - tag = match.group(1) - if tag == "<ul": - open_tags += 1 - else: - open_tags -= 1 - else: - last_index = 0 - metadata = video_webpage[metadata_start:tag_index] - current_song = None - while True: - match = self.music_info_pattern.search(metadata, last_index) - if match is None: - if current_song is not None: - music_list.append(current_song) - break - title, value = match.group(1), html.unescape(match.group(2)) - if title in ("Song", "Music"): - if current_song is not None: - music_list.append(current_song) - current_song = {"title": value} - else: - current_song[title.lower()] = value - last_index = match.end() - - - - # Get video info - embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(video_info) - else: - age_gate = False - video_info = None - sts = None - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - sts = ytplayer_config.get('sts') - if not player_response: - pl_response = str_or_none(args.get('player_response')) - if pl_response: - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - player_response = pl_response - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - if isinstance(pl_response, dict): - player_response = pl_response - add_dash_mpd_pr(player_response) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break - - def extract_unavailable_message(): - return self._html_search_regex( - r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', - video_webpage, 'unavailable message', default=None) - - if not video_info: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - # description - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - <a\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="[^"]*"[^>]*> - [^<]+\.{3}\s* - </a> - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') - - if not smuggled_data.get('force_singlefeed', False): - if not self._downloader.params.get('noplaylist'): - multifeed_metadata_list = try_get( - player_response, - lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - q = qualities(['small', 'medium', 'hd720']) - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) - if streaming_formats: - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): - continue - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] - - more_fields = { - 'filesize': filesize, - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), - 'width': width, - 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or quality, - 'quality': q(quality), - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = clean_html(video_info.get('reason', [None])[0]) - alt_error_message = clean_html(video_info.get('reason', [None])[0]) - print(alt_error_message) - if not error_message: - error_message = alt_error_message - if error_message: - raise YoutubeError(error_message) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', - video_webpage, 'license', default=None) - - m_music = re.search( - r'''(?x) - <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* - <ul[^>]*>\s* - <li>(?P<title>.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') - - # Youtube Music Auto-generated description - release_date = release_year = None - if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) - if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) - release_year = mobj.group('release_year') - release_date = mobj.group('release_date') - if release_date: - release_date = release_date.replace('-', '') - if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self._get_subtitles(video_id, video_webpage) - automatic_captions = self._get_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise YoutubeError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - 'related_vids': related_vids, - 'music_list': music_list, - 'unlisted': unlisted, - } - - -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', - }, - 'playlist_count': 0, - 'skip': 'This playlist is private', - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - }, - 'playlist_count': 95, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - }, - 'playlist_mincount': 26, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 799, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2017 華語最新單曲 (2/24更新)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - } - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', - 'upload_date': '20161008', - 'license': 'Standard YouTube License', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id - - def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - } - }, { - 'url': 'ytuser:phihag', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) - - -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'Thirst for Science', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _EXTRA_QUERY_ARGS = {} - _TESTS = [] - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - videos = [] - limit = n - - url_query = { - 'search_query': query.encode('utf-8'), - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) - - for pagenum in itertools.count(1): - data = self._download_json( - result_url, video_id='query "%s"' % query, - note='Downloading page %s' % pagenum, - errnote='Unable to download API page', - query={'spf': 'navigate'}) - html_content = data[1]['body']['content'] - - if 'class="search-message' in html_content: - raise ExtractorError( - '[youtube] No video results', expected=True) - - new_videos = list(self._process_page(html_content)) - videos += new_videos - if not new_videos or len(videos) > limit: - break - next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', - html_content, 'next link', default=None) - if next_link is None: - break - result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - - if len(videos) > n: - videos = videos[:n] - return self.playlist_result(videos, query) - - -class YoutubeSearchDateIE(YoutubeSearchIE): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first' - _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} - - -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): - """ - Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. - """ - _LOGIN_REQUIRED = True - - @property - def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME - - def _real_initialize(self): - self._login() - - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', - 'only_matching': True, - }] - - def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' - - -class YoutubeTruncatedURLIE(InfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? - | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', - expected=True) - - -class YoutubeTruncatedIDIE(InfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), - expected=True) diff --git a/youtube_dl/extractor/youtube_unmodified_reference.py b/youtube_dl/extractor/youtube_unmodified_reference.py deleted file mode 100644 index f76a6e7..0000000 --- a/youtube_dl/extractor/youtube_unmodified_reference.py +++ /dev/null @@ -1,3192 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - -import itertools -import json -import os.path -import random -import re -import time -import traceback - -from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_kwargs, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, - compat_str, -) -from ..utils import ( - clean_html, - dict_get, - error_to_compat_str, - ExtractorError, - float_or_none, - get_element_by_attribute, - get_element_by_id, - int_or_none, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - qualities, - remove_quotes, - remove_start, - smuggle_url, - str_or_none, - str_to_int, - try_get, - unescapeHTML, - unified_strdate, - unsmuggle_url, - uppercase_escape, - url_or_none, - urlencode_postdata, -) - - -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' - - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _download_webpage_handle(self, *args, **kwargs): - query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' - kwargs['query'] = query - return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - def _real_initialize(self): - if self._downloader is None: - return - self._set_language() - if not self._login(): - return - - -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(self._VIDEO_RE, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - if video_title: - video_title = video_title.strip() - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) - - -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - (?:(?:www|dev)\.)?invidio\.us/| - (?:www\.)?invidiou\.sh/| - (?:www\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?vid\.wxzm\.sx/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) - (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - 'creator': 'Taylor Swift', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed) - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': '6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 - 'uploader': 'Airtek', - 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', - 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '135', # bestvideo - }, - 'skip': 'This live event has ended.', - }, - { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': '6h8e8xoXJzg', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'PUOgX5z9xZw', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'teuwxikvS5k', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }], - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) - 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', - 'info_dict': { - 'id': 'gVfLd0zydlo', - 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', - }, - 'playlist_count': 2, - 'skip': 'Not multifeed anymore', - }, - { - 'url': 'https://vid.plus/FlRa-iH7PGw', - 'only_matching': True, - }, - { - 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', - 'only_matching': True, - }, - { - # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) - # Also tests cut-off URL expansion in video description (see - # https://github.com/ytdl-org/youtube-dl/issues/1892, - # https://github.com/ytdl-org/youtube-dl/issues/8164) - 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', - 'info_dict': { - 'id': 'lsguqyKfVQg', - 'ext': 'mp4', - 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', - 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', - 'duration': 133, - 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', - 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) - 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', - 'only_matching': True, - }, - { - # Video with yt:stretch=17:0 - 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', - 'info_dict': { - 'id': 'Q39EVAstoRM', - 'ext': 'mp4', - 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', - 'description': 'md5:ee18a25c350637c8faff806845bddee9', - 'upload_date': '20151107', - 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', - 'uploader': 'CH GAMER DROID', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video does not exist.', - }, - { - # Video licensed under Creative Commons - 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', - 'info_dict': { - 'id': 'M4gD1WSo5mA', - 'ext': 'mp4', - 'title': 'md5:e41008789470fc2533a3252216f1c1d1', - 'description': 'md5:a677553cf0840649b731a3024aeff4cc', - 'duration': 721, - 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'The Berkman Klein Center for Internet & Society', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Channel-like uploader_url - 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', - 'info_dict': { - 'id': 'eQcmzGIKrzg', - 'ext': 'mp4', - 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', - 'duration': 4060, - 'upload_date': '20151119', - 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', - 'only_matching': True, - }, - { - # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) - 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', - 'only_matching': True, - }, - { - # Rental video preview - 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', - 'info_dict': { - 'id': 'uGpuVWrhIzE', - 'ext': 'mp4', - 'title': 'Piku - Trailer', - 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', - 'upload_date': '20150811', - 'uploader': 'FlixMatrix', - 'uploader_id': 'FlixMatrixKaravan', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', - 'license': 'Standard YouTube License', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # YouTube Red video with episode data - 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', - 'info_dict': { - 'id': 'iqKdEhx-dD4', - 'ext': 'mp4', - 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', - 'duration': 2085, - 'upload_date': '20170118', - 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'series': 'Mind Field', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': [ - 'Skipping DASH manifest', - ], - }, - { - # The following content has been identified by the YouTube community - # as inappropriate or offensive to some audiences. - 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', - 'info_dict': { - 'id': '6SJNVb0GnPI', - 'ext': 'mp4', - 'title': 'Race Differences in Intelligence', - 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', - 'duration': 965, - 'upload_date': '20140124', - 'uploader': 'New Century Foundation', - 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # itag 212 - 'url': '1t24XAntNCY', - 'only_matching': True, - }, - { - # geo restricted to JP - 'url': 'sJL6WA-aGkQ', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, - { - 'url': 'https://invidio.us/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - # DRM protected - 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', - 'only_matching': True, - }, - { - # Video with unsupported adaptive stream type formats - 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', - 'info_dict': { - 'id': 'Z4Vy8R84T1U', - 'ext': 'mp4', - 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 433, - 'upload_date': '20130923', - 'uploader': 'Amelia Putri Harwita', - 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', - 'formats': 'maxcount:10', - }, - 'params': { - 'skip_download': True, - 'youtube_include_dash_manifest': False, - }, - }, - { - # Youtube Music Auto-generated description - 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', - 'info_dict': { - 'id': 'MgNrAu2pzNs', - 'ext': 'mp4', - 'title': 'Voyeur Girl', - 'description': 'md5:7ae382a65843d6df2685993e90a8628f', - 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', - 'artist': 'Stephen', - 'track': 'Voyeur Girl', - 'album': 'it\'s too much love to know my dear', - 'release_date': '20190313', - 'release_year': 2019, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - ] - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._player_cache = {} - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', - player_url) - if not id_m: - raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') - - # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end + step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - # Quelch pyflakes warnings - start will be set when step is set - start = '(Never used)' - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - player_config = self._get_ytplayer_config(video_id, webpage) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not player_config: - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): - playback_url = url_or_none(try_get( - player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) - if not playback_url: - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) - - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) - - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] - - # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) - - # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None - - @classmethod - def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - def _extract_annotations(self, video_id): - return self._download_webpage( - 'https://www.youtube.com/annotations_invideo', video_id, - note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - query={ - 'features': 1, - 'legacy': 1, - 'video_id': video_id, - }) - - @staticmethod - def _extract_chapters(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') - - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) - - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) - - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - - player_response = {} - - # Get video info - embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(video_info) - else: - age_gate = False - video_info = None - sts = None - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - sts = ytplayer_config.get('sts') - if not player_response: - pl_response = str_or_none(args.get('player_response')) - if pl_response: - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - player_response = pl_response - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - if isinstance(pl_response, dict): - player_response = pl_response - add_dash_mpd_pr(player_response) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break - - def extract_unavailable_message(): - return self._html_search_regex( - r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', - video_webpage, 'unavailable message', default=None) - - if not video_info: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - <a\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="[^"]*"[^>]*> - [^<]+\.{3}\s* - </a> - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') - - if not smuggled_data.get('force_singlefeed', False): - if not self._downloader.params.get('noplaylist'): - multifeed_metadata_list = try_get( - player_response, - lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - q = qualities(['small', 'medium', 'hd720']) - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) - if streaming_formats: - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): - continue - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] - - more_fields = { - 'filesize': filesize, - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), - 'width': width, - 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or quality, - 'quality': q(quality), - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = clean_html(video_info.get('reason', [None])[0]) - if not error_message: - error_message = extract_unavailable_message() - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', - video_webpage, 'license', default=None) - - m_music = re.search( - r'''(?x) - <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* - <ul[^>]*>\s* - <li>(?P<title>.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') - - # Youtube Music Auto-generated description - release_date = release_year = None - if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) - if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) - release_year = mobj.group('release_year') - release_date = mobj.group('release_date') - if release_date: - release_date = release_date.replace('-', '') - if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } - - -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', - }, - 'playlist_count': 0, - 'skip': 'This playlist is private', - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - }, - 'playlist_count': 95, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - }, - 'playlist_mincount': 26, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 799, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2017 華語最新單曲 (2/24更新)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - } - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', - 'upload_date': '20161008', - 'license': 'Standard YouTube License', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id - - def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - } - }, { - 'url': 'ytuser:phihag', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) - - -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'Thirst for Science', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _EXTRA_QUERY_ARGS = {} - _TESTS = [] - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - videos = [] - limit = n - - url_query = { - 'search_query': query.encode('utf-8'), - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) - - for pagenum in itertools.count(1): - data = self._download_json( - result_url, video_id='query "%s"' % query, - note='Downloading page %s' % pagenum, - errnote='Unable to download API page', - query={'spf': 'navigate'}) - html_content = data[1]['body']['content'] - - if 'class="search-message' in html_content: - raise ExtractorError( - '[youtube] No video results', expected=True) - - new_videos = list(self._process_page(html_content)) - videos += new_videos - if not new_videos or len(videos) > limit: - break - next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', - html_content, 'next link', default=None) - if next_link is None: - break - result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - - if len(videos) > n: - videos = videos[:n] - return self.playlist_result(videos, query) - - -class YoutubeSearchDateIE(YoutubeSearchIE): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first' - _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} - - -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): - """ - Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. - """ - _LOGIN_REQUIRED = True - - @property - def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME - - def _real_initialize(self): - self._login() - - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', - 'only_matching': True, - }] - - def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' - - -class YoutubeTruncatedURLIE(InfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? - | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', - expected=True) - - -class YoutubeTruncatedIDIE(InfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), - expected=True) |