From 8749477ed0a3cbc85d1726b6526fa5e794ce6072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Apr 2015 22:27:16 +0600 Subject: [PATCH] [rai] Fix extraction (Closes #5396) --- youtube_dl/extractor/rai.py | 72 ++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 144e33982..115cc64cc 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -13,7 +13,7 @@ class RaiIE(InfoExtractor): - _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' + _VALID_URL = r'(?P<url>(?P<host>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -64,32 +64,65 @@ class RaiIE(InfoExtractor): }, ] + def _extract_relinker_url(self, webpage): + return self._proto_relative_url(self._search_regex( + [r'name="videourl" content="([^"]+)"', r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"'], + webpage, 'relinker url', default=None)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + host = mobj.group('host') - media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON') + webpage = self._download_webpage(url, video_id) - title = media.get('name') - description = media.get('desc') - thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') - duration = parse_duration(media.get('length')) - uploader = media.get('author') - upload_date = unified_strdate(media.get('date')) + relinker_url = self._extract_relinker_url(webpage) - formats = [] + if not relinker_url: + iframe_path = self._search_regex( + r'<iframe[^>]+src="/?(dl/[^"]+\?iframe\b[^"]*)"', + webpage, 'iframe') 
+ iframe_page = self._download_webpage( + '%s/%s' % (host, iframe_path), video_id) + relinker_url = self._extract_relinker_url(iframe_page) - for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']: - media_url = media.get(format_id) - if not media_url: - continue - formats.append({ + relinker = self._download_json( + '%s&output=47' % relinker_url, video_id) + + media_url = relinker['video'][0] + ct = relinker.get('ct') + if ct == 'f4m': + formats = self._extract_f4m_formats( + media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id) + else: + formats = [{ 'url': media_url, - 'format_id': format_id, - 'ext': 'mp4', - }) + 'format_id': ct, + }] - subtitles = self.extract_subtitles(video_id, url) + json_link = self._html_search_meta( + 'jsonlink', webpage, 'JSON link', default=None) + if json_link: + media = self._download_json( + host + json_link, video_id, 'Downloading video JSON') + title = media.get('name') + description = media.get('desc') + thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') + duration = parse_duration(media.get('length')) + uploader = media.get('author') + upload_date = unified_strdate(media.get('date')) + else: + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*"([^"]+)";', + webpage, 'title', default=None) or self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = None + uploader = self._html_search_meta('Editore', webpage, 'uploader') + upload_date = unified_strdate(self._html_search_meta( + 'item-date', webpage, 'upload date')) + + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, @@ -103,8 +136,7 @@ def _real_extract(self, url): 'subtitles': subtitles, } - def _get_subtitles(self, video_id, url): - webpage = self._download_webpage(url, video_id) + def _get_subtitles(self, video_id, webpage): subtitles = {} m = re.search(r'