mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-23 09:01:43 +00:00
[webvtt, extractor/youtube] Extract auto-subs from livestream VODs
Closes #4130. Authored by: pukkandan, fstirlitz
This commit is contained in:
parent
07b47084ba
commit
c646d76f67
@ -2298,7 +2298,7 @@ def refetch_manifest(format_id, delay):
|
|||||||
microformats = traverse_obj(
|
microformats = traverse_obj(
|
||||||
prs, (..., 'microformat', 'playerMicroformatRenderer'),
|
prs, (..., 'microformat', 'playerMicroformatRenderer'),
|
||||||
expected_type=dict, default=[])
|
expected_type=dict, default=[])
|
||||||
_, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
|
_, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
def mpd_feed(format_id, delay):
|
def mpd_feed(format_id, delay):
|
||||||
@ -3136,7 +3136,7 @@ def append_client(*client_names):
|
|||||||
self.report_warning(last_error)
|
self.report_warning(last_error)
|
||||||
return prs, player_url
|
return prs, player_url
|
||||||
|
|
||||||
def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
|
def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration):
|
||||||
itags, stream_ids = {}, []
|
itags, stream_ids = {}, []
|
||||||
itag_qualities, res_qualities = {}, {}
|
itag_qualities, res_qualities = {}, {}
|
||||||
q = qualities([
|
q = qualities([
|
||||||
@ -3293,17 +3293,22 @@ def process_manifest_format(f, proto, itag):
|
|||||||
if val in qdict), -1)
|
if val in qdict), -1)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
subtitles = {}
|
||||||
for sd in streaming_data:
|
for sd in streaming_data:
|
||||||
hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
|
hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
|
||||||
if hls_manifest_url:
|
if hls_manifest_url:
|
||||||
for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live)
|
||||||
|
subtitles = self._merge_subtitles(subs, subtitles)
|
||||||
|
for f in fmts:
|
||||||
if process_manifest_format(f, 'hls', self._search_regex(
|
if process_manifest_format(f, 'hls', self._search_regex(
|
||||||
r'/itag/(\d+)', f['url'], 'itag', default=None)):
|
r'/itag/(\d+)', f['url'], 'itag', default=None)):
|
||||||
yield f
|
yield f
|
||||||
|
|
||||||
dash_manifest_url = get_dash and sd.get('dashManifestUrl')
|
dash_manifest_url = get_dash and sd.get('dashManifestUrl')
|
||||||
if dash_manifest_url:
|
if dash_manifest_url:
|
||||||
for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
|
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
|
||||||
|
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
|
||||||
|
for f in formats:
|
||||||
if process_manifest_format(f, 'dash', f['format_id']):
|
if process_manifest_format(f, 'dash', f['format_id']):
|
||||||
f['filesize'] = int_or_none(self._search_regex(
|
f['filesize'] = int_or_none(self._search_regex(
|
||||||
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
|
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
|
||||||
@ -3311,6 +3316,7 @@ def process_manifest_format(f, proto, itag):
|
|||||||
f['is_from_start'] = True
|
f['is_from_start'] = True
|
||||||
|
|
||||||
yield f
|
yield f
|
||||||
|
yield subtitles
|
||||||
|
|
||||||
def _extract_storyboard(self, player_responses, duration):
|
def _extract_storyboard(self, player_responses, duration):
|
||||||
spec = get_first(
|
spec = get_first(
|
||||||
@ -3371,9 +3377,9 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
|
|||||||
is_live = get_first(live_broadcast_details, 'isLiveNow')
|
is_live = get_first(live_broadcast_details, 'isLiveNow')
|
||||||
|
|
||||||
streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
|
streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
|
||||||
formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
|
*formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration)
|
||||||
|
|
||||||
return live_broadcast_details, is_live, streaming_data, formats
|
return live_broadcast_details, is_live, streaming_data, formats, subtitles
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
url, smuggled_data = unsmuggle_url(url, {})
|
url, smuggled_data = unsmuggle_url(url, {})
|
||||||
@ -3464,8 +3470,8 @@ def feed_entry(name):
|
|||||||
'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
|
'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
|
||||||
'This is a known issue and patches are welcome')
|
'This is a known issue and patches are welcome')
|
||||||
|
|
||||||
live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
|
live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \
|
||||||
video_id, microformats, video_details, player_responses, player_url, duration)
|
self._list_formats(video_id, microformats, video_details, player_responses, player_url)
|
||||||
|
|
||||||
if not formats:
|
if not formats:
|
||||||
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
|
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
|
||||||
@ -3595,6 +3601,7 @@ def feed_entry(name):
|
|||||||
'release_timestamp': live_start_time,
|
'release_timestamp': live_start_time,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
subtitles = {}
|
||||||
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
|
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
|
||||||
if pctr:
|
if pctr:
|
||||||
def get_lang_code(track):
|
def get_lang_code(track):
|
||||||
@ -3624,7 +3631,6 @@ def process_language(container, base_url, lang_code, sub_name, query):
|
|||||||
# NB: Constructing the full subtitle dictionary is slow
|
# NB: Constructing the full subtitle dictionary is slow
|
||||||
get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
|
get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
|
||||||
self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
|
self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
|
||||||
subtitles, automatic_captions = {}, {}
|
|
||||||
for lang_code, caption_track in captions.items():
|
for lang_code, caption_track in captions.items():
|
||||||
base_url = caption_track.get('baseUrl')
|
base_url = caption_track.get('baseUrl')
|
||||||
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
|
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
|
||||||
@ -3655,8 +3661,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
|
|||||||
# Setting tlang=lang returns damaged subtitles.
|
# Setting tlang=lang returns damaged subtitles.
|
||||||
process_language(automatic_captions, base_url, trans_code, trans_name,
|
process_language(automatic_captions, base_url, trans_code, trans_name,
|
||||||
{} if orig_lang == orig_trans_code else {'tlang': trans_code})
|
{} if orig_lang == orig_trans_code else {'tlang': trans_code})
|
||||||
info['automatic_captions'] = automatic_captions
|
|
||||||
info['subtitles'] = subtitles
|
info['automatic_captions'] = automatic_captions
|
||||||
|
info['subtitles'] = subtitles
|
||||||
|
|
||||||
parsed_url = urllib.parse.urlparse(url)
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
for component in [parsed_url.fragment, parsed_url.query]:
|
for component in [parsed_url.fragment, parsed_url.query]:
|
||||||
|
@ -161,6 +161,12 @@ class Magic(HeaderBlock):
|
|||||||
_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
|
_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
|
||||||
_REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
|
_REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
|
||||||
|
|
||||||
|
# This was removed from the spec in the 2017 revision;
|
||||||
|
# the last spec draft to describe this syntax element is
|
||||||
|
# <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
|
||||||
|
# Nevertheless, YouTube keeps serving those metadata headers.
|
||||||
|
_REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def __parse_tsmap(cls, parser):
|
def __parse_tsmap(cls, parser):
|
||||||
parser = parser.child()
|
parser = parser.child()
|
||||||
@ -200,13 +206,18 @@ def parse(cls, parser):
|
|||||||
raise ParseError(parser)
|
raise ParseError(parser)
|
||||||
|
|
||||||
extra = m.group(1)
|
extra = m.group(1)
|
||||||
local, mpegts = None, None
|
local, mpegts, meta = None, None, ''
|
||||||
if parser.consume(cls._REGEX_TSMAP):
|
while not parser.consume(_REGEX_NL):
|
||||||
local, mpegts = cls.__parse_tsmap(parser)
|
if parser.consume(cls._REGEX_TSMAP):
|
||||||
if not parser.consume(_REGEX_NL):
|
local, mpegts = cls.__parse_tsmap(parser)
|
||||||
|
continue
|
||||||
|
m = parser.consume(cls._REGEX_META)
|
||||||
|
if m:
|
||||||
|
meta += m.group(0)
|
||||||
|
continue
|
||||||
raise ParseError(parser)
|
raise ParseError(parser)
|
||||||
parser.commit()
|
parser.commit()
|
||||||
return cls(extra=extra, mpegts=mpegts, local=local)
|
return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
|
||||||
|
|
||||||
def write_into(self, stream):
|
def write_into(self, stream):
|
||||||
stream.write('WEBVTT')
|
stream.write('WEBVTT')
|
||||||
@ -219,6 +230,8 @@ def write_into(self, stream):
|
|||||||
stream.write(',MPEGTS:')
|
stream.write(',MPEGTS:')
|
||||||
stream.write(str(self.mpegts if self.mpegts is not None else 0))
|
stream.write(str(self.mpegts if self.mpegts is not None else 0))
|
||||||
stream.write('\n')
|
stream.write('\n')
|
||||||
|
if self.meta:
|
||||||
|
stream.write(self.meta)
|
||||||
stream.write('\n')
|
stream.write('\n')
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user