From e0585e6562bc467a17c9fef7b48b83c6f0f83652 Mon Sep 17 00:00:00 2001
From: MinePlayersPE <20515340+MinePlayersPE@users.noreply.github.com>
Date: Thu, 20 Jan 2022 05:35:27 +0700
Subject: [PATCH] [TikTok] Extract captions (#2185)

Closes #2184
Authored by: MinePlayersPE
---
Review notes (annotation only; every diff line below is unchanged):

Purpose: make the TikTok extractor return auto-generated captions as SRT
subtitles.

What each hunk does:
  1. Adds srt_subtitles_timecode to the names imported in tiktok.py.
  2. Adds _get_subtitles(self, aweme_detail, aweme_id). It walks
     interaction_stickers -> * -> auto_video_caption_info -> auto_captions -> *
     keeping dict entries, takes a URL from url.url_list, downloads it as
     JSON with fatal=False, and joins the entries of 'utterances' into SRT
     text stored under caption.get('language', 'en') with ext 'srt'.
     start_time / end_time are divided by 1000 before formatting;
     presumably milliseconds to seconds, confirm against the API payload.
  3. Adds 'subtitles' to the returned info dict via
     self.extract_subtitles(aweme_detail, aweme_id). The link from
     extract_subtitles to _get_subtitles is not visible in this patch;
     presumably the base extractor dispatches to it, verify.
  4. Adds an only_matching test URL for a video with auto-captions.

Points to confirm before merging:
  * Cue numbers come from enumerate() over all utterances, but entries with
    empty text are filtered out afterwards, so the SRT numbering has gaps
    whenever an utterance is skipped. Filtering first and numbering with
    enumerate(..., 1) would keep the sequence contiguous.
  * caption_json['utterances'], line["start_time"] and line["end_time"] are
    plain subscripts; a payload missing any of them raises KeyError even
    though the download itself is non-fatal.
  * caption.get('language', 'en') only falls back to 'en' when the key is
    absent; an explicit null language yields a None subtitle key
    (caption.get('language') or 'en' would cover both cases).
  * When no utterance has text, an entry with empty 'data' is still appended.
  * NOTE(review): line breaks and indentation in this copy were restored
    from a whitespace-collapsed version; check the context lines against
    upstream before applying.

 yt_dlp/extractor/tiktok.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index 9e0bec709..6dffdf05e 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -17,6 +17,7 @@
     int_or_none,
     join_nonempty,
     LazyList,
+    srt_subtitles_timecode,
     str_or_none,
     traverse_obj,
     try_get,
@@ -83,6 +84,27 @@ def _call_api(self, ep, query, video_id, fatal=True,
                 'Accept': 'application/json',
             }, query=real_query)
 
+    def _get_subtitles(self, aweme_detail, aweme_id):
+        # TODO: Extract text positioning info
+        subtitles = {}
+        captions_info = traverse_obj(
+            aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[])
+        for caption in captions_info:
+            caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
+            if not caption_url:
+                continue
+            caption_json = self._download_json(
+                caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
+            if not caption_json:
+                continue
+            subtitles.setdefault(caption.get('language', 'en'), []).append({
+                'ext': 'srt',
+                'data': '\n\n'.join(
+                    f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
+                    for i, line in enumerate(caption_json['utterances']) if line.get('text'))
+            })
+        return subtitles
+
     def _parse_aweme_video_app(self, aweme_detail):
         aweme_id = aweme_detail['aweme_id']
         video_info = aweme_detail['video']
@@ -218,6 +240,7 @@ def extract_addr(addr, add_meta={}):
             'artist': music_author,
             'timestamp': int_or_none(aweme_detail.get('create_time')),
             'formats': formats,
+            'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
             'thumbnails': thumbnails,
             'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
             'availability': self._availability(
@@ -396,6 +419,10 @@ class TikTokIE(TikTokBaseIE):
             'comment_count': int,
         },
         'expected_warnings': ['Video not available']
+    }, {
+        # Auto-captions available
+        'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
+        'only_matching': True
     }]
 
     def _extract_aweme_app(self, aweme_id):