From 3869028ffb6be6ab719e5cf1004276dfdfd1216d Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Sat, 16 Sep 2017 12:18:38 +0800
Subject: [utils] Use bytes-like objects in dfxp2srt

This fixes handling of non-UTF8 TTML subtitles

Closes #14191
---
 youtube_dl/utils.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'youtube_dl/utils.py')

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 9e4492d40..b724e0b70 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
 
 
 def dfxp2srt(dfxp_data):
+    '''
+    @param dfxp_data A bytes-like object containing DFXP data
+    @returns A unicode object containing converted SRT data
+    '''
     LEGACY_NAMESPACES = (
-        ('http://www.w3.org/ns/ttml', [
-            'http://www.w3.org/2004/11/ttaf1',
-            'http://www.w3.org/2006/04/ttaf1',
-            'http://www.w3.org/2006/10/ttaf1',
+        (b'http://www.w3.org/ns/ttml', [
+            b'http://www.w3.org/2004/11/ttaf1',
+            b'http://www.w3.org/2006/04/ttaf1',
+            b'http://www.w3.org/2006/10/ttaf1',
         ]),
-        ('http://www.w3.org/ns/ttml#styling', [
-            'http://www.w3.org/ns/ttml#style',
+        (b'http://www.w3.org/ns/ttml#styling', [
+            b'http://www.w3.org/ns/ttml#style',
         ]),
     )
 
@@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
         for ns in v:
             dfxp_data = dfxp_data.replace(ns, k)
 
-    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data)
     out = []
     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
 
--
cgit v1.2.3