about | summary | refs | log | tree | commit | diff | stats
path: root/yt_dlp/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r--yt_dlp/extractor/adobeconnect.py2
-rw-r--r--yt_dlp/extractor/allocine.py6
-rw-r--r--yt_dlp/extractor/archiveorg.py3
-rw-r--r--yt_dlp/extractor/asiancrush.py3
-rw-r--r--yt_dlp/extractor/bbc.py5
-rw-r--r--yt_dlp/extractor/breitbart.py5
-rw-r--r--yt_dlp/extractor/callin.py2
-rw-r--r--yt_dlp/extractor/cbc.py6
-rw-r--r--yt_dlp/extractor/closertotruth.py3
-rw-r--r--yt_dlp/extractor/common.py10
-rw-r--r--yt_dlp/extractor/cspan.py2
-rw-r--r--yt_dlp/extractor/fivetv.py3
-rw-r--r--yt_dlp/extractor/foxgay.py3
-rw-r--r--yt_dlp/extractor/generic.py6
-rw-r--r--yt_dlp/extractor/glide.py4
-rw-r--r--yt_dlp/extractor/hellporno.py3
-rw-r--r--yt_dlp/extractor/huya.py3
-rw-r--r--yt_dlp/extractor/imdb.py2
-rw-r--r--yt_dlp/extractor/infoq.py2
-rw-r--r--yt_dlp/extractor/iwara.py3
-rw-r--r--yt_dlp/extractor/linkedin.py2
-rw-r--r--yt_dlp/extractor/miaopai.py3
-rw-r--r--yt_dlp/extractor/mojvideo.py3
-rw-r--r--yt_dlp/extractor/newgrounds.py6
-rw-r--r--yt_dlp/extractor/nhk.py4
-rw-r--r--yt_dlp/extractor/playvid.py3
-rw-r--r--yt_dlp/extractor/rule34video.py2
-rw-r--r--yt_dlp/extractor/senategov.py2
-rw-r--r--yt_dlp/extractor/sunporno.py3
-rw-r--r--yt_dlp/extractor/thisav.py4
-rw-r--r--yt_dlp/extractor/traileraddict.py3
-rw-r--r--yt_dlp/extractor/varzesh3.py3
-rw-r--r--yt_dlp/extractor/vshare.py3
-rw-r--r--yt_dlp/extractor/vupload.py2
-rw-r--r--yt_dlp/extractor/weibo.py3
-rw-r--r--yt_dlp/extractor/yahoo.py2
-rw-r--r--yt_dlp/extractor/youjizz.py3
37 files changed, 49 insertions, 78 deletions
diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py
index e688dddcb..e2e6f93f3 100644
--- a/yt_dlp/extractor/adobeconnect.py
+++ b/yt_dlp/extractor/adobeconnect.py
@@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []
diff --git a/yt_dlp/extractor/allocine.py b/yt_dlp/extractor/allocine.py
index cd533acfc..403a277e9 100644
--- a/yt_dlp/extractor/allocine.py
+++ b/yt_dlp/extractor/allocine.py
@@ -7,6 +7,7 @@ from ..utils import (
int_or_none,
qualities,
remove_end,
+ strip_or_none,
try_get,
unified_timestamp,
url_basename,
@@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor):
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
- title = remove_end(
- self._html_search_regex(
- r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
- ' - AlloCiné')
+ title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index b06ac74ae..2ab3c1beb 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -483,8 +483,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
regex), webpage, name, default='{}'), video_id, fatal=False)
def _extract_webpage_title(self, webpage):
- page_title = self._html_search_regex(
- r'<title>([^<]*)</title>', webpage, 'title', default='')
+ page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
return self._html_search_regex(
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py
index 75a632958..7f1940fca 100644
--- a/yt_dlp/extractor/asiancrush.py
+++ b/yt_dlp/extractor/asiancrush.py
@@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE):
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
- default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ default=None) or self._html_extract_title(webpage)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py
index 823155730..29ad7ded7 100644
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -906,9 +906,8 @@ class BBCIE(BBCCoUkIE):
playlist_title = json_ld_info.get('title')
if not playlist_title:
- playlist_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ playlist_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'playlist title', default=None))
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py
index f50f719dc..e029aa627 100644
--- a/yt_dlp/extractor/breitbart.py
+++ b/yt_dlp/extractor/breitbart.py
@@ -29,9 +29,8 @@ class BreitBartIE(InfoExtractor):
self._sort_formats(formats)
return {
'id': video_id,
- 'title': self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, 'video title'),
+ 'title': (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title')),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'age_limit': self._rta_search(webpage),
diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py
index acf327ace..1f3b7cfff 100644
--- a/yt_dlp/extractor/callin.py
+++ b/yt_dlp/extractor/callin.py
@@ -54,7 +54,7 @@ class CallinIE(InfoExtractor):
id = episode['id']
title = (episode.get('title')
or self._og_search_title(webpage, fatal=False)
- or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
+ or self._html_extract_title(webpage))
url = episode['m3u8']
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index ac1272f7b..fba8bf965 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -127,9 +127,9 @@ class CBCIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- title = self._og_search_title(webpage, default=None) or self._html_search_meta(
- 'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ title = (self._og_search_title(webpage, default=None)
+ or self._html_search_meta('twitter:title', webpage, 'title', default=None)
+ or self._html_extract_title(webpage))
entries = [
self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py
index 26243d52d..517e121e0 100644
--- a/yt_dlp/extractor/closertotruth.py
+++ b/yt_dlp/extractor/closertotruth.py
@@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor):
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
webpage, 'kaltura partner_id')
- title = self._search_regex(
- r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+ title = self._html_extract_title(webpage, 'video title', fatal=True).partition('|')[0].rstrip()  # keep only the part before "| <site name>"
select = self._search_regex(
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index af964c527..81688eb54 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1329,9 +1329,8 @@ class InfoExtractor(object):
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
- def _og_search_title(self, html, **kargs):
- kargs.setdefault('fatal', False)
- return self._og_search_property('title', html, **kargs)
+ def _og_search_title(self, html, *, fatal=False, **kargs):
+ return self._og_search_property('title', html, fatal=fatal, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = self._og_regexes('video') + self._og_regexes('video:url')
@@ -1342,9 +1341,8 @@ class InfoExtractor(object):
def _og_search_url(self, html, **kargs):
return self._og_search_property('url', html, **kargs)
- def _html_extract_title(self, html, name, **kwargs):
- return self._html_search_regex(
- r'(?s)<title>(.*?)</title>', html, name, **kwargs)
+ def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):  # also matches <title> tags that carry attributes
+ return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
name = variadic(name)
diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py
index d29b58ba6..f51159bbe 100644
--- a/yt_dlp/extractor/cspan.py
+++ b/yt_dlp/extractor/cspan.py
@@ -278,7 +278,7 @@ class CSpanCongressIE(InfoExtractor):
video_id, transform_source=js_to_json)
title = (self._og_search_title(webpage, default=None)
- or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title'))
+ or self._html_extract_title(webpage, 'video title'))
description = (self._og_search_description(webpage, default=None)
or self._html_search_meta('description', webpage, 'description', default=None))
diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py
index be81fccb8..d6bebd19b 100644
--- a/yt_dlp/extractor/fivetv.py
+++ b/yt_dlp/extractor/fivetv.py
@@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor):
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url')
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
duration = int_or_none(self._og_search_property(
'video:duration', webpage, 'duration', default=None))
diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py
index 512a10645..1c53e0642 100644
--- a/yt_dlp/extractor/foxgay.py
+++ b/yt_dlp/extractor/foxgay.py
@@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
+ title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com')
description = get_element_by_id('inf_tit', webpage)
# The default user-agent with foxgay cookies leads to pages without videos
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 4a2e30158..65e803dd7 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2873,10 +2873,8 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, 'video title',
- default='video')
+ video_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title', default='video'))
# Try to detect age limit automatically
age_limit = self._rta_search(webpage)
diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py
index d94dfbf09..12af859be 100644
--- a/yt_dlp/extractor/glide.py
+++ b/yt_dlp/extractor/glide.py
@@ -23,9 +23,7 @@ class GlideIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage,
- 'title', default=None) or self._og_search_title(webpage)
+ title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
video_url = self._proto_relative_url(self._search_regex(
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
webpage, 'video URL', default=None,
diff --git a/yt_dlp/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py
index fae425103..92d32cdcc 100644
--- a/yt_dlp/extractor/hellporno.py
+++ b/yt_dlp/extractor/hellporno.py
@@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+ title = remove_end(self._html_extract_title(webpage), ' - Hell Porno')
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
self._sort_formats(info['formats'])
diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py
index b81439682..4e96f22fa 100644
--- a/yt_dlp/extractor/huya.py
+++ b/yt_dlp/extractor/huya.py
@@ -66,8 +66,7 @@ class HuyaLiveIE(InfoExtractor):
room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
if not room_info:
raise ExtractorError('Can not extract the room info', expected=True)
- title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
screen_type = room_info.get('screenType')
live_source_type = room_info.get('liveSourceType')
stream_info_list = stream_data['data'][0]['gameStreamInfoList']
diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py
index 7eb66d821..96cee2e2f 100644
--- a/yt_dlp/extractor/imdb.py
+++ b/yt_dlp/extractor/imdb.py
@@ -68,7 +68,7 @@ class ImdbIE(InfoExtractor):
video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
- or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title'))
+ or self._html_extract_title(webpage))
data = video_info.get('playbackURLs') or try_get(self._download_json(
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
query={
diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py
index 0a70a1fb4..347cc5154 100644
--- a/yt_dlp/extractor/infoq.py
+++ b/yt_dlp/extractor/infoq.py
@@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
video_description = self._html_search_meta('description', webpage, 'description')
if '/cn/' in url:
diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index 254d98692..c0e01e352 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor):
'age_limit': age_limit,
}
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+ title = remove_end(self._html_extract_title(webpage), ' | Iwara')
thumbnail = self._html_search_regex(
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index bf549e164..0f57bfa06 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -102,7 +102,7 @@ class LinkedInIE(LinkedInBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py
index f9e35ac7f..cf0610bdf 100644
--- a/yt_dlp/extractor/miaopai.py
+++ b/yt_dlp/extractor/miaopai.py
@@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
webpage, 'thumbnail', fatal=False, group='url')
diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py
index 0421f3f44..16d94052b 100644
--- a/yt_dlp/extractor/mojvideo.py
+++ b/yt_dlp/extractor/mojvideo.py
@@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor):
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', playerapi, 'title')
+ title = self._html_extract_title(playerapi)
video_url = self._html_search_regex(
r'<file>([^<]+)</file>', playerapi, 'video URL')
thumbnail = self._html_search_regex(
diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py
index 1e1274ef0..6525a6d8a 100644
--- a/yt_dlp/extractor/newgrounds.py
+++ b/yt_dlp/extractor/newgrounds.py
@@ -106,8 +106,7 @@ class NewgroundsIE(InfoExtractor):
uploader = None
webpage = self._download_webpage(url, media_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
media_url_string = self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
@@ -219,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- title = self._search_regex(
- r'<title>([^>]+)</title>', webpage, 'title', default=None)
+ title = self._html_extract_title(webpage, default=None)
# cut left menu
webpage = self._search_regex(
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 626c6379b..3b8efc3e6 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -309,7 +309,9 @@ class NhkForSchoolProgramListIE(InfoExtractor):
webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
- title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
+ title = (self._og_search_title(webpage)
+ or self._html_extract_title(webpage)
+ or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
description = self._html_search_regex(
r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py
index 4aef186ea..e1c406b6c 100644
--- a/yt_dlp/extractor/playvid.py
+++ b/yt_dlp/extractor/playvid.py
@@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor):
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
- video_title = self._html_search_regex(
- r'<title>(.*?)</title', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
return {
'id': video_id,
diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py
index 522d4ccd5..a602a9f33 100644
--- a/yt_dlp/extractor/rule34video.py
+++ b/yt_dlp/extractor/rule34video.py
@@ -49,7 +49,7 @@ class Rule34VideoIE(InfoExtractor):
'quality': quality,
})
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py
index 6f4240422..b295184a1 100644
--- a/yt_dlp/extractor/senategov.py
+++ b/yt_dlp/extractor/senategov.py
@@ -112,7 +112,7 @@ class SenateISVPIE(InfoExtractor):
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
+ title = self._html_extract_title(webpage)
poster = qs.get('poster')
thumbnail = poster[0] if poster else None
diff --git a/yt_dlp/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py
index 68051169b..59b77bf92 100644
--- a/yt_dlp/extractor/sunporno.py
+++ b/yt_dlp/extractor/sunporno.py
@@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.sunporno.com/videos/%s' % video_id, video_id)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
description = self._html_search_meta(
'description', webpage, 'description')
thumbnail = self._html_search_regex(
diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py
index 4af286e6d..6bb00b3ab 100644
--- a/yt_dlp/extractor/thisav.py
+++ b/yt_dlp/extractor/thisav.py
@@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'),
- ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
+ title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
if video_url:
diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py
index 10100fbcf..514f4793e 100644
--- a/yt_dlp/extractor/traileraddict.py
+++ b/yt_dlp/extractor/traileraddict.py
@@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor):
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
webpage = self._download_webpage(url, name)
- title = self._search_regex(r'<title>(.+?)</title>',
- webpage, 'video title').replace(' - Trailer Addict', '')
+ title = self._html_extract_title(webpage, 'video title', fatal=True).replace(' - Trailer Addict', '')
view_count_str = self._search_regex(
r'<span class="views_n">([0-9,.]+)</span>',
webpage, 'view count', fatal=False)
diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py
index 81313dc9d..32655b96d 100644
--- a/yt_dlp/extractor/varzesh3.py
+++ b/yt_dlp/extractor/varzesh3.py
@@ -42,8 +42,7 @@ class Varzesh3IE(InfoExtractor):
video_url = self._search_regex(
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
- title = remove_start(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+ title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ')
description = self._html_search_regex(
r'(?s)<div class="matn">(.+?)</div>',
diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py
index c631ac1fa..b4874ac39 100644
--- a/yt_dlp/extractor/vshare.py
+++ b/yt_dlp/extractor/vshare.py
@@ -50,8 +50,7 @@ class VShareIE(InfoExtractor):
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id, headers={'Referer': url})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage, fatal=True)
title = title.split(' - ')[0]
error = self._html_search_regex(
diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py
index 2229a6591..b561f63f7 100644
--- a/yt_dlp/extractor/vupload.py
+++ b/yt_dlp/extractor/vupload.py
@@ -28,7 +28,7 @@ class VuploadIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
formats = []
for source in video_json:
diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py
index 621df5b54..dafa2af3b 100644
--- a/yt_dlp/extractor/weibo.py
+++ b/yt_dlp/extractor/weibo.py
@@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py
index 6cf3b1de2..20504de2c 100644
--- a/yt_dlp/extractor/yahoo.py
+++ b/yt_dlp/extractor/yahoo.py
@@ -533,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor):
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage, 'title', default=None
- ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
+ ) or self._html_extract_title(webpage)
if display_id == host:
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
diff --git a/yt_dlp/extractor/youjizz.py b/yt_dlp/extractor/youjizz.py
index 5f5fbf21c..111623ffe 100644
--- a/yt_dlp/extractor/youjizz.py
+++ b/yt_dlp/extractor/youjizz.py
@@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
formats = []