From 4c07546e7a5e5882abdda896009b744e947df1c4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 17 Oct 2019 19:58:13 -0700 Subject: Extraction: Replace youtube-dl with custom-built watch page extraction --- youtube_dl/extractor/generic.py | 3335 --------------------------------------- 1 file changed, 3335 deletions(-) delete mode 100644 youtube_dl/extractor/generic.py (limited to 'youtube_dl/extractor/generic.py') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py deleted file mode 100644 index aa04905..0000000 --- a/youtube_dl/extractor/generic.py +++ /dev/null @@ -1,3335 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import os -import re -import sys - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - HEADRequest, - is_html, - js_to_json, - KNOWN_EXTENSIONS, - merge_dicts, - mimetype2ext, - orderedSet, - sanitized_Request, - smuggle_url, - unescapeHTML, - unified_strdate, - unsmuggle_url, - UnsupportedError, - xpath_text, -) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE -from .smotri import SmotriIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .vimeo import VimeoIE -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .liveleak import LiveLeakIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .vessel import VesselIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .openload import OpenloadIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE - - -class GenericIE(InfoExtractor): - IE_DESC = 'Generic downloader that works on some sites' - _VALID_URL = r'.*' - IE_NAME = 'generic' - _TESTS = [ - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, - # Direct link to media delivered compressed (until Accept-Encoding is *) - { - 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', - 'md5': '128c42e68b13950268b648275386fc74', - 'info_dict': { - 'id': 'FictionJunction-Parallel_Hearts', - 'ext': 'flac', - 'title': 'FictionJunction-Parallel_Hearts', - 'upload_date': '20140522', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ], - 'skip': 'URL invalid', - }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented', - r'400.*Bad Request', - ], - }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, - # RSS feed with enclosures and unsupported link URLs - { - 'url': 'http://www.hellointernet.fm/podcast?format=rss', - 'info_dict': { - 'id': 'http://www.hellointernet.fm/podcast?format=rss', - 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', - 'title': 'Hello Internet', - }, - 'playlist_mincount': 100, - }, - # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng - { - 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', - 'info_dict': { - 'id': 'smil', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'upload_date': '20130627', - 'formats': 'mincount:16', - 'subtitles': 'mincount:1', - }, - 'params': { - 'force_generic_extractor': True, - 'skip_download': True, - }, - }, - # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html - { - 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', - 'info_dict': { - 'id': 'hds', - 'ext': 'flv', - 'title': 'hds', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from https://www.restudy.dk/video/play/id/1637 - { - 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', - 'info_dict': { - 'id': 'video_1637', - 'ext': 'flv', - 'title': 'video_1637', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm - { - 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', - 'info_dict': { - 'id': 'smil-service', - 'ext': 'flv', - 'title': 'smil-service', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 - { - 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', - 'info_dict': { - 'id': '4719370', - 'ext': 'mp4', - 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html - { - 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', - 'info_dict': { - 'id': 'mZlp2ctYIUEB', - 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, - }, - 'params': { - 'skip_download': True, - }, - }, - # MPD from http://dash-mse-test.appspot.com/media.html - { - 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', - 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', - 'info_dict': { - 'id': 'car-20120827-manifest', - 'ext': 'mp4', - 'title': 'car-20120827-manifest', - 'formats': 'mincount:9', - 'upload_date': '20130904', - }, - 'params': { - 'format': 'bestvideo', - }, - }, - # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 - { - 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', - 'info_dict': { - 'id': 'content', - 'ext': 'mp4', - 'title': 'content', - 'formats': 'mincount:8', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # m3u8 served with Content-Type: text/plain - { - 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'index', - 'upload_date': '20140720', - 'formats': 'mincount:11', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': r're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, - { - # redirect in Refresh HTTP header - 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', - 'info_dict': { - 'id': 'pO8h3EaFRdo', - 'ext': 'mp4', - 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', - 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', - 'upload_date': '20150917', - 'uploader_id': 'brtvofficial', - 'uploader': 'Boiler Room', - }, - 'params': { - 'skip_download': False, - }, - }, - { - 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', - 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', - 'info_dict': { - 'id': '13601338388002', - 'ext': 'mp4', - 'uploader': 'www.hodiho.fr', - 'title': 'R\u00e9gis plante sa Jeep', - } - }, - # bandcamp page with custom domain - { - 'add_ie': ['Bandcamp'], - 'url': 'http://bronyrock.com/track/the-pony-mash', - 'info_dict': { - 'id': '3235767654', - 'ext': 'mp3', - 'title': 'The Pony Mash', - 'uploader': 'M_Pallante', - }, - 'skip': 'There is a limit of 200 free downloads / month for the test song', - }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in