[wdr] Rework extractors (closes #14598)

This commit is contained in:
Sergey M․ 2018-01-13 23:28:08 +07:00
parent 2d8bb80c60
commit 54e8f62e01
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
3 changed files with 124 additions and 148 deletions

View File

@ -991,7 +991,6 @@
from .sport5 import Sport5IE from .sport5 import Sport5IE
from .sportbox import SportBoxEmbedIE from .sportbox import SportBoxEmbedIE
from .sportdeutschland import SportDeutschlandIE from .sportdeutschland import SportDeutschlandIE
from .sportschau import SportschauIE
from .sprout import SproutIE from .sprout import SproutIE
from .srgssr import ( from .srgssr import (
SRGSSRIE, SRGSSRIE,
@ -1289,6 +1288,7 @@
from .watchindianporn import WatchIndianPornIE from .watchindianporn import WatchIndianPornIE
from .wdr import ( from .wdr import (
WDRIE, WDRIE,
WDRPageIE,
WDRElefantIE, WDRElefantIE,
WDRMobileIE, WDRMobileIE,
) )

View File

@ -1,38 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .wdr import WDRBaseIE
from ..utils import get_element_by_attribute
class SportschauIE(WDRBaseIE):
    """Extractor for single-video pages on sportschau.de.

    The page only supplies the human-readable title and description; the
    actual media metadata is fetched through the shared WDR helpers
    inherited from WDRBaseIE.
    """
    IE_NAME = 'Sportschau'
    _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html'
    _TEST = {
        'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html',
        'info_dict': {
            'id': 'mdb-1140188',
            'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
            'ext': 'mp4',
            'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
            'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
            'upload_date': '20160615',
        },
        'skip': 'Geo-restricted to Germany',
    }

    def _real_extract(self, url):
        """Return the info dict for the video embedded in a sportschau.de page.

        Raises:
            ExtractorError: (expected) when the page carries no media link
                from which the JSONP metadata URL can be derived.
        """
        # Function-scope import: this module only imports
        # get_element_by_attribute from ..utils at file level.
        from ..utils import ExtractorError

        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = get_element_by_attribute('class', 'headline', webpage)
        description = self._html_search_meta('description', webpage, 'description')

        # _extract_wdr_video() takes the JSONP metadata URL, not the page
        # HTML, so resolve that URL from the page first (the same two-step
        # sequence the WDR page extractor uses).
        jsonp_url = self._extract_jsonp_url(webpage, video_id)
        if not jsonp_url:
            # _extract_jsonp_url() returns None when no media link is found.
            raise ExtractorError('No downloadable streams found', expected=True)

        info = self._extract_wdr_video(jsonp_url, video_id)

        # Prefer the values scraped from the page, but never overwrite the
        # title coming from the media metadata with None when the headline
        # element is missing.
        if title:
            info['title'] = title
        if description:
            info['description'] = description
        return info

View File

@ -4,50 +4,52 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
js_to_json, js_to_json,
strip_jsonp, strip_jsonp,
try_get,
unified_strdate, unified_strdate,
update_url_query, update_url_query,
urlhandle_detect_ext, urlhandle_detect_ext,
) )
class WDRBaseIE(InfoExtractor): class WDRIE(InfoExtractor):
def _extract_jsonp_url(self, webpage, display_id): _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js'
# for wdr.de the data-extension is in a tag with the class "mediaLink" _TEST = {
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js',
# for wdrmaus, in a tag with the class "videoButton" (previously a link 'info_dict': {
# to the page in a multiline "videoLink"-tag) 'id': 'mdb-1140188',
json_metadata = self._html_search_regex( 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
r'''(?sx)class= 'ext': 'mp4',
(?: 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
(["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
(["\'])videoLink\b.*?\2[\s]*>\n[^\n]* 'upload_date': '20160615',
)data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 },
''', 'skip': 'Geo-restricted to Germany',
webpage, 'media link', default=None, group='data') }
if not json_metadata: def _real_extract(self, url):
return video_id = self._match_id(url)
media_link_obj = self._parse_json(json_metadata, display_id,
transform_source=js_to_json)
return media_link_obj['mediaObj']['url']
def _extract_wdr_video(self, jsonp_url, display_id):
metadata = self._download_json( metadata = self._download_json(
jsonp_url, display_id, transform_source=strip_jsonp) url, video_id, transform_source=strip_jsonp)
metadata_tracker_data = metadata['trackerData'] is_live = metadata.get('mediaType') == 'live'
metadata_media_resource = metadata['mediaResource']
tracker_data = metadata['trackerData']
media_resource = metadata['mediaResource']
formats = [] formats = []
# check if the metadata contains a direct URL to a file # check if the metadata contains a direct URL to a file
for kind, media_resource in metadata_media_resource.items(): for kind, media_resource in media_resource.items():
if kind not in ('dflt', 'alt'): if kind not in ('dflt', 'alt'):
continue continue
@ -58,13 +60,13 @@ def _extract_wdr_video(self, jsonp_url, display_id):
ext = determine_ext(medium_url) ext = determine_ext(medium_url)
if ext == 'm3u8': if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
medium_url, display_id, 'mp4', 'm3u8_native', medium_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls')) m3u8_id='hls'))
elif ext == 'f4m': elif ext == 'f4m':
manifest_url = update_url_query( manifest_url = update_url_query(
medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
manifest_url, display_id, f4m_id='hds', fatal=False)) manifest_url, video_id, f4m_id='hds', fatal=False))
elif ext == 'smil': elif ext == 'smil':
formats.extend(self._extract_smil_formats( formats.extend(self._extract_smil_formats(
medium_url, 'stream', fatal=False)) medium_url, 'stream', fatal=False))
@ -74,7 +76,7 @@ def _extract_wdr_video(self, jsonp_url, display_id):
} }
if ext == 'unknown_video': if ext == 'unknown_video':
urlh = self._request_webpage( urlh = self._request_webpage(
medium_url, display_id, note='Determining extension') medium_url, video_id, note='Determining extension')
ext = urlhandle_detect_ext(urlh) ext = urlhandle_detect_ext(urlh)
a_format['ext'] = ext a_format['ext'] = ext
formats.append(a_format) formats.append(a_format)
@ -82,30 +84,30 @@ def _extract_wdr_video(self, jsonp_url, display_id):
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}
caption_url = metadata_media_resource.get('captionURL') caption_url = media_resource.get('captionURL')
if caption_url: if caption_url:
subtitles['de'] = [{ subtitles['de'] = [{
'url': caption_url, 'url': caption_url,
'ext': 'ttml', 'ext': 'ttml',
}] }]
title = metadata_tracker_data['trackerClipTitle'] title = tracker_data['trackerClipTitle']
return { return {
'id': metadata_tracker_data.get('trackerClipId', display_id), 'id': tracker_data.get('trackerClipId', video_id),
'display_id': display_id, 'title': self._live_title(title) if is_live else title,
'title': title, 'alt_title': tracker_data.get('trackerClipSubcategory'),
'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')),
'is_live': is_live,
} }
class WDRIE(WDRBaseIE): class WDRPageIE(InfoExtractor):
_CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
_PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
_VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_TESTS = [ _TESTS = [
{ {
@ -125,6 +127,7 @@ class WDRIE(WDRBaseIE):
'ext': 'ttml', 'ext': 'ttml',
}]}, }]},
}, },
'skip': 'HTTP Error 404: Not Found',
}, },
{ {
'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
@ -140,19 +143,17 @@ class WDRIE(WDRBaseIE):
'is_live': False, 'is_live': False,
'subtitles': {} 'subtitles': {}
}, },
'skip': 'HTTP Error 404: Not Found',
}, },
{ {
'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': { 'info_dict': {
'id': 'mdb-103364', 'id': 'mdb-1406149',
'ext': 'mp4', 'ext': 'mp4',
'display_id': 'index', 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'alt_title': 'WDR Fernsehen Live', 'alt_title': 'WDR Fernsehen Live',
'upload_date': None, 'upload_date': '20150101',
'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
'is_live': True, 'is_live': True,
'subtitles': {}
}, },
'params': { 'params': {
'skip_download': True, # m3u8 download 'skip_download': True, # m3u8 download
@ -160,19 +161,18 @@ class WDRIE(WDRBaseIE):
}, },
{ {
'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
'playlist_mincount': 8, 'playlist_mincount': 7,
'info_dict': { 'info_dict': {
'id': 'aktuelle-stunde/aktuelle-stunde-120', 'id': 'aktuelle-stunde-120',
}, },
}, },
{ {
'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': { 'info_dict': {
'id': 'mdb-1323501', 'id': 'mdb-1552552',
'ext': 'mp4', 'ext': 'mp4',
'upload_date': 're:^[0-9]{8}$', 'upload_date': 're:^[0-9]{8}$',
'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
'description': 'Die Seite mit der Maus -',
}, },
'skip': 'The id changes from week to week because of the new episode' 'skip': 'The id changes from week to week because of the new episode'
}, },
@ -184,7 +184,6 @@ class WDRIE(WDRBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20130919', 'upload_date': '20130919',
'title': 'Sachgeschichte - Achterbahn ', 'title': 'Sachgeschichte - Achterbahn ',
'description': 'Die Seite mit der Maus -',
}, },
}, },
{ {
@ -192,83 +191,100 @@ class WDRIE(WDRBaseIE):
# Live stream, MD5 unstable # Live stream, MD5 unstable
'info_dict': { 'info_dict': {
'id': 'mdb-869971', 'id': 'mdb-869971',
'ext': 'flv', 'ext': 'mp4',
'title': 'COSMO Livestream', 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20160101', 'upload_date': '20160101',
}, },
'params': {
'skip_download': True, # m3u8 download
}
},
{
'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html',
'info_dict': {
'id': 'mdb-1556012',
'ext': 'mp4',
'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"',
'upload_date': '20180111',
},
'params': {
'skip_download': True,
},
},
{
'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
'only_matching': True,
} }
] ]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
url_type = mobj.group('type')
page_url = mobj.group('page_url')
display_id = mobj.group('display_id') display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
jsonp_url = self._extract_jsonp_url(webpage, display_id) entries = []
info_dict = self._extract_wdr_video(jsonp_url, display_id)
if not info_dict: # Article with several videos
# for wdr.de the data-extension is in a tag with the class "mediaLink"
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
# for wdrmaus, in a tag with the class "videoButton" (previously a link
# to the page in a multiline "videoLink"-tag)
for mobj in re.finditer(
r'''(?sx)class=
(?:
(["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+|
(["\'])videoLink\b.*?\2[\s]*>\n[^\n]*
)data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3
''', webpage):
media_link_obj = self._parse_json(
mobj.group('data'), display_id, transform_source=js_to_json,
fatal=False)
if not media_link_obj:
continue
jsonp_url = try_get(
media_link_obj, lambda x: x['mediaObj']['url'], compat_str)
if jsonp_url:
entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key()))
# Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html)
if not entries:
entries = [ entries = [
self.url_result(page_url + href[0], 'WDR') self.url_result(
for href in re.findall( compat_urlparse.urljoin(url, mobj.group('href')),
r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX, ie=WDRPageIE.ie_key())
webpage) for mobj in re.finditer(
r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=',
webpage) if re.match(self._PAGE_REGEX, mobj.group('href'))
] ]
if entries: # Playlist page return self.playlist_result(entries, playlist_id=display_id)
return self.playlist_result(entries, playlist_id=display_id)
raise ExtractorError('No downloadable streams found', expected=True)
is_live = url_type == 'live'
if is_live:
info_dict.update({
'title': self._live_title(info_dict['title']),
'upload_date': None,
})
elif 'upload_date' not in info_dict:
info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date'))
info_dict.update({
'description': self._html_search_meta('Description', webpage),
'is_live': is_live,
})
return info_dict
class WDRElefantIE(WDRBaseIE): class WDRElefantIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)wdrmaus.de/elefantenseite/#(?P<display_id>.+)' _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)'
IE_NAME = 'wdr:elefant' _TEST = {
'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
_TESTS = [ 'info_dict': {
{ 'title': 'Folge Oster-Spezial 2015',
'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', 'id': 'mdb-1088195',
'info_dict': { 'ext': 'mp4',
'title': 'Folge Oster-Spezial 2015', 'age_limit': None,
'id': 'mdb-1088195', 'upload_date': '20150406'
'ext': 'mp4',
'age_limit': None,
'upload_date': '20150406'
},
'params': {
'skip_download' : True,
},
}, },
] 'params': {
'skip_download': True,
},
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) display_id = self._match_id(url)
display_id = mobj.group('display_id')
# Table of Contents seems to always be at this address, so fetch it directly. # Table of Contents seems to always be at this address, so fetch it directly.
# The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5.
table_of_contents = self._download_json( table_of_contents = self._download_json(
'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id) 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5',
display_id)
if display_id not in table_of_contents: if display_id not in table_of_contents:
raise ExtractorError( raise ExtractorError(
'No entry in site\'s table of contents for this URL. ' 'No entry in site\'s table of contents for this URL. '
@ -276,15 +292,13 @@ def _real_extract(self, url):
expected=True) expected=True)
xml_metadata_path = table_of_contents[display_id]['xmlPath'] xml_metadata_path = table_of_contents[display_id]['xmlPath']
xml_metadata = self._download_xml( xml_metadata = self._download_xml(
'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id) 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path,
display_id)
zmdb_url_element = xml_metadata.find('./movie/zmdb_url') zmdb_url_element = xml_metadata.find('./movie/zmdb_url')
if zmdb_url_element is None: if zmdb_url_element is None:
raise ExtractorError( raise ExtractorError(
'The URL looks valid, but no video was found. Note that download only works ' '%s is not a video' % display_id, expected=True)
'on pages showing a single video, not on video selection pages.', return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key())
expected=True)
info_dict = self._extract_wdr_video(zmdb_url_element.text, display_id)
return info_dict
class WDRMobileIE(InfoExtractor): class WDRMobileIE(InfoExtractor):