From 49174788035038b8f12f971ec04ef897df60f435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Jul 2017 18:39:01 +0700 Subject: [PATCH] [ted] Fix extraction (closes #13535) --- youtube_dl/extractor/ted.py | 49 +++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 3f3c681aef..f27d0e3139 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -6,7 +6,10 @@ from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + try_get, +) class TEDIE(InfoExtractor): @@ -113,8 +116,9 @@ class TEDIE(InfoExtractor): } def _extract_info(self, webpage): - info_json = self._search_regex(r'q\("\w+.init",({.+})\)', - webpage, 'info json') + info_json = self._search_regex( + r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*', + webpage, 'info json') return json.loads(info_json) def _real_extract(self, url): @@ -136,11 +140,16 @@ def _playlist_videos_info(self, url, name): webpage = self._download_webpage(url, name, 'Downloading playlist webpage') info = self._extract_info(webpage) - playlist_info = info['playlist'] + + playlist_info = try_get( + info, lambda x: x['__INITIAL_DATA__']['playlist'], + dict) or info['playlist'] playlist_entries = [ self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) - for talk in info['talks'] + for talk in try_get( + info, lambda x: x['__INITIAL_DATA__']['talks'], + dict) or info['talks'] ] return self.playlist_result( playlist_entries, @@ -149,9 +158,14 @@ def _playlist_videos_info(self, url, name): def _talk_info(self, url, video_name): webpage = self._download_webpage(url, video_name) - self.report_extraction(video_name) - talk_info = self._extract_info(webpage)['talks'][0] + info = self._extract_info(webpage) + + talk_info = try_get( + info, lambda x: x['__INITIAL_DATA__']['talks'][0], + dict) or 
info['talks'][0] + + title = talk_info['title'].strip() external = talk_info.get('external') if external: @@ -165,19 +179,27 @@ def _talk_info(self, url, video_name): 'url': ext_url or external['uri'], } + native_downloads = try_get( + talk_info, lambda x: x['downloads']['nativeDownloads'], + dict) or talk_info['nativeDownloads'] + formats = [{ 'url': format_url, 'format_id': format_id, 'format': format_id, - } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] + } for (format_id, format_url) in native_downloads.items() if format_url is not None] if formats: for f in formats: finfo = self._NATIVE_FORMATS.get(f['format_id']) if finfo: f.update(finfo) + player_talk = talk_info['player_talks'][0] + + resources_ = player_talk.get('resources') or talk_info.get('resources') + http_url = None - for format_id, resources in talk_info['resources'].items(): + for format_id, resources in resources_.items(): if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -237,14 +259,11 @@ def _talk_info(self, url, video_name): video_id = compat_str(talk_info['id']) - thumbnail = talk_info['thumb'] - if not thumbnail.startswith('http'): - thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'].strip(), - 'uploader': talk_info['speaker'], - 'thumbnail': thumbnail, + 'title': title, + 'uploader': player_talk.get('speaker') or talk_info.get('speaker'), + 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats,