From 73c801d6606b052fd97f27bf18d2dd7f1eebc3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 14 Oct 2016 23:43:09 +0700 Subject: [PATCH] [orf:tvthek] Fix extraction and modernize (closes #10898) --- youtube_dl/extractor/orf.py | 46 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 6ae30679a..c7b107572 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,28 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re import calendar import datetime from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( HEADRequest, unified_strdate, - ExtractorError, strip_jsonp, int_or_none, float_or_none, determine_ext, remove_end, + unescapeHTML, ) class ORFTVthekIE(InfoExtractor): IE_NAME = 'orf:tvthek' IE_DESC = 'ORF TVthek' - _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P\d+)' + _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P\d+)' _TESTS = [{ 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', @@ -51,26 +51,23 @@ class ORFTVthekIE(InfoExtractor): 'skip_download': True, # rtsp downloads }, '_skip': 'Blocked outside of Austria / Germany', + }, { + 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', + 'skip_download': True, + }, { + 'url': 'http://tvthek.orf.at/profile/Universum/35429', + 'skip_download': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - data_json = self._search_regex( - r'initializeAdworx\((.+?)\);\n', webpage, 'video info') - all_data = json.loads(data_json) - - def get_segments(all_data): - for data in all_data: - if data['name'] in ( - 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM', - 'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'): - return data['values']['segments'] - - sdata = get_segments(all_data) - if not sdata: - raise ExtractorError('Unable to extract segments') + data_jsb = self._parse_json( + self._search_regex( + r']+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P.+?)\2', + webpage, 'playlist', group='json'), + playlist_id, transform_source=unescapeHTML)['playlist']['videos'] def quality_to_int(s): m = re.search('([0-9]+)', s) @@ -79,8 +76,11 @@ def quality_to_int(s): return int(m.group(1)) entries = [] - for sd in sdata: - video_id = sd['id'] + for sd in data_jsb: + video_id, title = sd.get('id'), sd.get('title') + if not video_id or not title: + continue + video_id = compat_str(video_id) formats = [{ 'preference': -10 if fd['delivery'] == 'hls' else None, 'format_id': '%s-%s-%s' % ( @@ -88,7 +88,7 @@ def quality_to_int(s): 'url': fd['src'], 'protocol': fd['protocol'], 'quality': quality_to_int(fd['quality']), - } for fd in sd['playlist_item_array']['sources']] + } for fd in sd['sources']] # Check for geoblocking. # There is a property is_geoprotection, but that's always false @@ -115,14 +115,14 @@ def quality_to_int(s): self._check_formats(formats, video_id) self._sort_formats(formats) - upload_date = unified_strdate(sd['created_date']) + upload_date = unified_strdate(sd.get('created_date')) entries.append({ '_type': 'video', 'id': video_id, - 'title': sd['header'], + 'title': title, 'formats': formats, 'description': sd.get('description'), - 'duration': int(sd['duration_in_seconds']), + 'duration': int_or_none(sd.get('duration_in_seconds')), 'upload_date': upload_date, 'thumbnail': sd.get('image_full_url'), })