[jukebox] remove extractor and handle it using generic extractor

This commit is contained in:
remitamine 2015-09-25 10:52:48 +01:00
parent 0940c5b4c6
commit 6aeba407db
4 changed files with 61 additions and 119 deletions

View File

@ -262,7 +262,6 @@
from .jadorecettepub import JadoreCettePubIE from .jadorecettepub import JadoreCettePubIE
from .jeuxvideo import JeuxVideoIE from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE from .jove import JoveIE
from .jukebox import JukeboxIE
from .jpopsukitv import JpopsukiIE from .jpopsukitv import JpopsukiIE
from .kaltura import KalturaIE from .kaltura import KalturaIE
from .kanalplay import KanalPlayIE from .kanalplay import KanalPlayIE

View File

@ -50,6 +50,7 @@
from .onionstudios import OnionStudiosIE from .onionstudios import OnionStudiosIE
from .snagfilms import SnagFilmsEmbedIE from .snagfilms import SnagFilmsEmbedIE
from .screenwavemedia import ScreenwaveMediaIE from .screenwavemedia import ScreenwaveMediaIE
from .ultimedia import UltimediaIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1029,6 +1030,21 @@ class GenericIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'cinemasnob', 'title': 'cinemasnob',
}, },
},
# Ultimedia embed
{
'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
'md5': '25551df6e7c7ab8096ceeeae048c5f64',
'info_dict': {
'id': 'r303r',
'ext': 'mp4',
'title': 'Kosheen - Pride (live)',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 293,
'upload_date': '20081103',
'timestamp': 1225733392,
'uploader_id': '33m03',
},
} }
] ]
@ -1751,6 +1767,11 @@ def _playlist_from_matches(matches, getter=None, ie=None):
if mobj is not None: if mobj is not None:
return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia')
# Look for Ultimedia embeds
ultimedia_url = UltimediaIE._extract_url(webpage)
if ultimedia_url:
return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia')
# Look for AdobeTVVideo embeds # Look for AdobeTVVideo embeds
mobj = re.search( mobj = re.search(
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',

View File

@ -1,59 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
RegexNotFoundError,
unescapeHTML,
)
class JukeboxIE(InfoExtractor):
    """Extractor for jukebox video clip pages.

    The clip page embeds an iframe; the iframe's HTML carries a player
    ``config`` entry whose ``file`` value is either a direct media URL or
    a YouTube watch URL. In the latter case extraction is handed over to
    the Youtube extractor.
    """

    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
    _TEST = {
        'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
        'info_dict': {
            'id': 'r303r',
            'ext': 'flv',
            'title': 'Kosheen-En Vivo Pride',
            'uploader': 'Kosheen',
        },
    }

    def _real_extract(self, url):
        """Extract the video described by *url*.

        Returns either an info dict (``id``, ``url``, ``title``,
        ``uploader``) for a directly hosted file, or a ``url_result``
        pointing at the Youtube extractor when the player config
        references a YouTube watch URL.

        Raises ExtractorError when the iframe page contains the
        ``jkb_waiting`` marker, which the site shows instead of a player.
        """
        video_id = self._match_id(url)

        html = self._download_webpage(url, video_id)
        iframe_url = unescapeHTML(self._search_regex(
            r'<iframe .*src="([^"]*)"', html, 'iframe url'))

        iframe_html = self._download_webpage(
            iframe_url, video_id, 'Downloading iframe')
        if re.search(r'class="jkb_waiting"', iframe_html) is not None:
            # An unavailable video is a condition of the site, not an
            # extractor bug, so flag the error as expected.
            raise ExtractorError(
                'Video is not available(in your country?)!', expected=True)

        self.report_extraction(video_id)

        # Keep the try body down to the one call that can raise
        # RegexNotFoundError.
        try:
            video_url = self._search_regex(
                r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
                iframe_html, 'video url')
        except RegexNotFoundError:
            # No directly hosted file: the config may point at YouTube.
            youtube_url = self._search_regex(
                r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
                iframe_html, 'youtube url')
            # The URL is JSON-escaped in the page ("\/"); undo that.
            youtube_url = unescapeHTML(youtube_url).replace('\\/', '/')
            self.to_screen('Youtube video detected')
            return self.url_result(youtube_url, ie='Youtube')

        # The URL is JSON-escaped in the page ("\/"); undo that.
        video_url = unescapeHTML(video_url).replace('\\/', '/')

        title = self._html_search_regex(
            r'<h1 class="inline">([^<]+)</h1>', html, 'title')
        artist = self._html_search_regex(
            r'<span id="infos_article_artist">([^<]+)</span>', html, 'artist')

        return {
            'id': video_id,
            'url': video_url,
            'title': artist + '-' + title,
            'uploader': artist,
        }

View File

@ -4,102 +4,83 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse from ..utils import int_or_none
from ..utils import (
ExtractorError,
qualities,
unified_strdate,
clean_html,
)
class UltimediaIE(InfoExtractor): class UltimediaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/deliver/(?P<type>generic|musique)(?:/[^/]+)*/(?:src|article)/(?P<id>[\d+a-z]+)'
_TESTS = [{ _TESTS = [{
# news # news
'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', 'url': 'https://www.ultimedia.com/deliver/generic/iframe/mdtk/01601930/zone/1/src/s8uk0r/autoplay/yes/ad/no/width/714/height/435',
'md5': '276a0e49de58c7e85d32b057837952a2', 'md5': '276a0e49de58c7e85d32b057837952a2',
'info_dict': { 'info_dict': {
'id': 's8uk0r', 'id': 's8uk0r',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
'thumbnail': 're:^https?://.*\.jpg', 'thumbnail': 're:^https?://.*\.jpg',
'duration': 74,
'upload_date': '20150317', 'upload_date': '20150317',
'timestamp': 1426604939,
'uploader_id': '3fszv',
}, },
}, { }, {
# music # music
'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', 'url': 'https://www.ultimedia.com/deliver/musique/iframe/mdtk/01601930/zone/1/article/xvpfp8/autoplay/yes/ad/no/width/714/height/435',
'md5': '2ea3513813cf230605c7e2ffe7eca61c', 'md5': '2ea3513813cf230605c7e2ffe7eca61c',
'info_dict': { 'info_dict': {
'id': 'xvpfp8', 'id': 'xvpfp8',
'ext': 'mp4', 'ext': 'mp4',
'title': "Two - C'est la vie (Clip)", 'title': 'Two - C\'est La Vie (clip)',
'description': 'Two',
'thumbnail': 're:^https?://.*\.jpg', 'thumbnail': 're:^https?://.*\.jpg',
'duration': 233,
'upload_date': '20150224', 'upload_date': '20150224',
'timestamp': 1424760500,
'uploader_id': '3rfzk',
}, },
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_type, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
deliver_url = self._proto_relative_url(self._search_regex( deliver_info = self._download_json(
r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type),
webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
deliver_page = self._download_webpage(
deliver_url, video_id, 'Downloading iframe page')
if '>This video is currently not available' in deliver_page:
raise ExtractorError(
'Video %s is currently not available' % video_id, expected=True)
player = self._parse_json(
self._search_regex(
r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
deliver_page, 'player'),
video_id) video_id)
quality = qualities(['flash', 'html5']) yt_id = deliver_info.get('yt_id')
if yt_id:
return self.url_result(yt_id, 'Youtube')
jwconf = deliver_info['jwconf']
formats = [] formats = []
for mode in player['modes']: for source in jwconf['playlist'][0]['sources']:
video_url = mode.get('config', {}).get('file')
if not video_url:
continue
if re.match(r'https?://www\.youtube\.com/.+?', video_url):
return self.url_result(video_url, 'Youtube')
formats.append({ formats.append({
'url': video_url, 'url': source['file'],
'format_id': mode.get('type'), 'format_id': source.get('label'),
'quality': quality(mode.get('type')),
}) })
self._sort_formats(formats) self._sort_formats(formats)
thumbnail = player.get('image') title = deliver_info['title']
thumbnail = jwconf.get('image')
title = clean_html(( duration = int_or_none(deliver_info.get('duration'))
self._html_search_regex( timestamp = int_or_none(deliver_info.get('release_time'))
r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', uploader_id = deliver_info.get('owner_id')
webpage, 'title', default=None) or
self._search_regex(
r"var\s+nameVideo\s*=\s*'([^']+)'",
deliver_page, 'title')))
description = clean_html(self._html_search_regex(
r'(?s)<span>Description</span>(.+?)</p>', webpage,
'description', fatal=False))
upload_date = unified_strdate(self._search_regex(
r'Ajouté le\s*<span>([^<]+)', webpage,
'upload date', fatal=False))
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'upload_date': upload_date, 'duration': duration,
'timestamp': timestamp,
'uploader_id': uploader_id,
'formats': formats, 'formats': formats,
} }