mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-27 02:42:30 +00:00
[extractor/tiktok] Extract SIGI_STATE
Based on #3624, https://github.com/ytdl-org/youtube-dl/pull/30479. Closes #3551. Authored by dirkf, sulyi, pukkandan.
This commit is contained in:
parent
7e88d7d78f
commit
a39a7ba8d6
@ -12,6 +12,7 @@
|
|||||||
HEADRequest,
|
HEADRequest,
|
||||||
LazyList,
|
LazyList,
|
||||||
UnsupportedError,
|
UnsupportedError,
|
||||||
|
get_element_by_id,
|
||||||
get_first,
|
get_first,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
join_nonempty,
|
join_nonempty,
|
||||||
@ -33,11 +34,22 @@ class TikTokBaseIE(InfoExtractor):
|
|||||||
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
|
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
|
||||||
_WEBPAGE_HOST = 'https://www.tiktok.com/'
|
_WEBPAGE_HOST = 'https://www.tiktok.com/'
|
||||||
QUALITIES = ('360p', '540p', '720p', '1080p')
|
QUALITIES = ('360p', '540p', '720p', '1080p')
|
||||||
|
_session_initialized = False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_url(user_id, video_id):
|
def _create_url(user_id, video_id):
|
||||||
return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
|
return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
|
||||||
|
|
||||||
|
def _get_sigi_state(self, webpage, display_id):
|
||||||
|
return self._parse_json(get_element_by_id(
|
||||||
|
'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
|
||||||
|
|
||||||
|
def _real_initialize(self):
|
||||||
|
if self._session_initialized:
|
||||||
|
return
|
||||||
|
self._request_webpage(HEADRequest('https://www.tiktok.com'), None, note='Setting up session', fatal=False)
|
||||||
|
TikTokBaseIE._session_initialized = True
|
||||||
|
|
||||||
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
|
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
|
||||||
note='Downloading API JSON', errnote='Unable to download API page'):
|
note='Downloading API JSON', errnote='Unable to download API page'):
|
||||||
self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
|
self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
|
||||||
@ -263,6 +275,9 @@ def extract_addr(addr, add_meta={}):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
'id': aweme_id,
|
'id': aweme_id,
|
||||||
|
'extractor_key': TikTokIE.ie_key(),
|
||||||
|
'extractor': TikTokIE.IE_NAME,
|
||||||
|
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
|
||||||
'title': aweme_detail.get('desc'),
|
'title': aweme_detail.get('desc'),
|
||||||
'description': aweme_detail.get('desc'),
|
'description': aweme_detail.get('desc'),
|
||||||
'view_count': int_or_none(stats_info.get('play_count')),
|
'view_count': int_or_none(stats_info.get('play_count')),
|
||||||
@ -461,7 +476,7 @@ class TikTokIE(TikTokBaseIE):
|
|||||||
'repost_count': int,
|
'repost_count': int,
|
||||||
'comment_count': int,
|
'comment_count': int,
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Video not available']
|
'expected_warnings': ['trying with webpage', 'Unable to find video in feed']
|
||||||
}, {
|
}, {
|
||||||
# Video without title and description
|
# Video without title and description
|
||||||
'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
|
'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
|
||||||
@ -485,7 +500,29 @@ class TikTokIE(TikTokBaseIE):
|
|||||||
'repost_count': int,
|
'repost_count': int,
|
||||||
'comment_count': int,
|
'comment_count': int,
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Video not available', 'Creating a generic title']
|
}, {
|
||||||
|
# hydration JSON is sent in a <script> element
|
||||||
|
'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '7065799023130643713',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': '#denidil#денидил',
|
||||||
|
'description': '#denidil#денидил',
|
||||||
|
'uploader': 'denidil6',
|
||||||
|
'uploader_id': '7046664115636405250',
|
||||||
|
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
|
||||||
|
'artist': 'Holocron Music',
|
||||||
|
'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
|
||||||
|
'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
|
||||||
|
'timestamp': 1645134536,
|
||||||
|
'duration': 26,
|
||||||
|
'upload_date': '20220217',
|
||||||
|
'view_count': int,
|
||||||
|
'like_count': int,
|
||||||
|
'repost_count': int,
|
||||||
|
'comment_count': int,
|
||||||
|
},
|
||||||
|
'expected_warnings': ['trying feed workaround', 'Unable to find video in feed']
|
||||||
}, {
|
}, {
|
||||||
# Auto-captions available
|
# Auto-captions available
|
||||||
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
|
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
|
||||||
@ -504,7 +541,7 @@ def _extract_aweme_app(self, aweme_id):
|
|||||||
if not aweme_detail:
|
if not aweme_detail:
|
||||||
raise ExtractorError('Video not available', video_id=aweme_id)
|
raise ExtractorError('Video not available', video_id=aweme_id)
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
self.report_warning(f'{e}; Retrying with feed workaround')
|
self.report_warning(f'{e.orig_msg}; trying feed workaround')
|
||||||
feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
|
feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
|
||||||
note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
|
note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
|
||||||
aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
|
aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
|
||||||
@ -514,26 +551,19 @@ def _extract_aweme_app(self, aweme_id):
|
|||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
|
video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
|
||||||
url = self._create_url(user_id, video_id)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self._extract_aweme_app(video_id)
|
return self._extract_aweme_app(video_id)
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
self.report_warning(f'{e}; Retrying with webpage')
|
self.report_warning(f'{e}; trying with webpage')
|
||||||
|
|
||||||
# If we only call once, we get a 403 when downlaoding the video.
|
url = self._create_url(user_id, video_id)
|
||||||
self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'})
|
||||||
webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
|
|
||||||
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
|
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
|
||||||
|
|
||||||
if next_data:
|
if next_data:
|
||||||
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
|
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
|
||||||
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
|
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
|
||||||
else:
|
else:
|
||||||
sigi_json = self._search_regex(
|
sigi_data = self._get_sigi_state(webpage, video_id)
|
||||||
r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
|
|
||||||
webpage, 'sigi data', group='sigi_state')
|
|
||||||
sigi_data = self._parse_json(sigi_json, video_id)
|
|
||||||
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
|
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
|
||||||
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
|
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
|
||||||
|
|
||||||
@ -849,7 +879,7 @@ def _real_extract(self, url):
|
|||||||
try:
|
try:
|
||||||
return self._extract_aweme_app(video_id)
|
return self._extract_aweme_app(video_id)
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
self.report_warning(f'{e}; Retrying with webpage')
|
self.report_warning(f'{e}; trying with webpage')
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
render_data_json = self._search_regex(
|
render_data_json = self._search_regex(
|
||||||
|
Loading…
Reference in New Issue
Block a user