From b5d0d817bc8a23ef6dc2a00d1af6fad893143206 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:09:44 +0100 Subject: Remove superfluous space --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce349fe20..cef4dce85 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,7 +63,7 @@ class InfoExtractor(object): * ext Will be calculated from url if missing * format A human-readable description of the format ("mp4 container with h264/opus"). - Calculated from the format_id, width, height + Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format ("mp4_h264_opus" or "19") -- cgit v1.2.3 From 9103bbc5cd11957de2e906e4401dcf4df9511d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 3 Nov 2013 12:11:13 +0100 Subject: Add the 'webpage_url' field to info_dict The url for the video page, it must allow to reproduce the result. It's automatically set by YoutubeDL if it's missing. --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cef4dce85..e0ccba533 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,6 +71,9 @@ class InfoExtractor(object): ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + webpage_url: The url to the video webpage, if given to youtube-dl it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) Unless mentioned otherwise, the fields should be Unicode strings. 
-- cgit v1.2.3 From a8eeb0597b11dbc9d1b48f95264cc2815311aa15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Cie=C5=9Blak?= Date: Tue, 5 Nov 2013 23:19:29 +0100 Subject: Fix AssertionError when og property not found On tvp.pl some webpages contain OpenGraph metadata and some don't. If og property is not found, _og_search_description fails with WARNING: unable to extract OpenGraph description; please report this issue on http://yt-dl.org/bug Traceback (most recent call last): File "/usr/home/saper/bin/youtube-dl", line 18, in <module> youtube_dl.main() File "/usr/home/saper/sw/youtube-dl/youtube_dl/__init__.py", line 766, in main _real_main(argv) File "/usr/home/saper/sw/youtube-dl/youtube_dl/__init__.py", line 719, in _real_main retcode = ydl.download(all_urls) File "/usr/home/saper/sw/youtube-dl/youtube_dl/YoutubeDL.py", line 715, in download videos = self.extract_info(url) File "/usr/home/saper/sw/youtube-dl/youtube_dl/YoutubeDL.py", line 348, in extract_info ie_result = ie.extract(url) File "/usr/home/saper/sw/youtube-dl/youtube_dl/extractor/common.py", line 125, in extract return self._real_extract(url) File "/usr/home/saper/sw/youtube-dl/youtube_dl/extractor/tvp.py", line 56, in _real_extract info['description'] = self._og_search_description(webpage) File "/usr/home/saper/sw/youtube-dl/youtube_dl/extractor/common.py", line 331, in _og_search_description return self._og_search_property('description', html, fatal=False, **kargs) File "/usr/home/saper/sw/youtube-dl/youtube_dl/extractor/common.py", line 325, in _og_search_property return unescapeHTML(escaped) File "/usr/home/saper/sw/youtube-dl/youtube_dl/utils.py", line 494, in unescapeHTML assert type(s) == type(u'') AssertionError The patch allows me to use: try: info['description'] = self._og_search_description(webpage) info['thumbnail'] = self._og_search_thumbnail(webpage) except RegexNotFoundError: pass --- youtube_dl/extractor/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 
'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e0ccba533..fb2d50a09 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -322,7 +322,9 @@ class InfoExtractor(object): if name is None: name = 'OpenGraph %s' % prop escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) - return unescapeHTML(escaped) + if not escaped is None: + return unescapeHTML(escaped) + return None def _og_search_thumbnail(self, html, **kargs): return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) -- cgit v1.2.3 From eb0a83986642cf660820b168bd83c8770e3e5ce6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 12 Nov 2013 10:36:23 +0100 Subject: [common] Simplify og_search_property --- youtube_dl/extractor/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fb2d50a09..9c20d30b4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -322,9 +322,9 @@ class InfoExtractor(object): if name is None: name = 'OpenGraph %s' % prop escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) - if not escaped is None: - return unescapeHTML(escaped) - return None + if escaped is None: + return None + return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) -- cgit v1.2.3 From ab2d524780736249c8988313db021e83642c24d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:24:54 +0100 Subject: Improve the OpenGraph regex * Do not accept '>' between the property and content attributes. 
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE). --- youtube_dl/extractor/common.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c20d30b4..e02176852 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -315,13 +315,17 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod - def _og_regex(prop): - return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) + def _og_regexes(prop): + esc_prop = re.escape(prop) + return [ + r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, + r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + ] def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) + escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -336,8 +340,8 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = [self._og_regex('video')] - if secure: regexes.insert(0, self._og_regex('video:secure_url')) + regexes = self._og_regexes('video') + if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) def _rta_search(self, html): -- cgit v1.2.3 From 78fb87b2837e15124b5855734a951598dfe025fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:54:13 +0100 Subject: Don't accept '>' inside the content attribute in OpenGraph regexes --- youtube_dl/extractor/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 
deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e02176852..45dd01789 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -316,10 +316,12 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - esc_prop = re.escape(prop) + content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + template = r'<meta[^>]+?%s[^>]+?%s' return [ - r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, - r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + template % (property_re, content_re), + template % (content_re, property_re), ] def _og_search_property(self, prop, html, name=None, **kargs): -- cgit v1.2.3 From 91c7271aabdd74c833ef570db59018e2d9f9d803 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:08:43 +0100 Subject: Add automatic generation of format note based on bitrate and codecs --- youtube_dl/extractor/common.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 45dd01789..f787d0a3c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,6 +71,10 @@ class InfoExtractor(object): ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * vbr Average video bitrate in KBit/s + * vcodec Name of the video codec in use webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) -- cgit v1.2.3