[extractors] Use new framework for existing embeds (#4307)

`Brightcove` is difficult to migrate because its subclasses may depend
on the signatures of the current functions, so it is left as-is for now.

Note: Tests have not been migrated
This commit is contained in:
pukkandan 2022-08-01 06:53:25 +05:30
parent 1e8fe57e5c
commit bfd973ece3
138 changed files with 499 additions and 1909 deletions

View File

@ -446,7 +446,7 @@
DWIE, DWIE,
DWArticleIE, DWArticleIE,
) )
from .eagleplatform import EaglePlatformIE from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
from .ebaumsworld import EbaumsWorldIE from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE from .echomsk import EchoMskIE
from .egghead import ( from .egghead import (
@ -1555,6 +1555,7 @@
SharedIE, SharedIE,
VivoIE, VivoIE,
) )
from .sharevideos import ShareVideosEmbedIE
from .shemaroome import ShemarooMeIE from .shemaroome import ShemarooMeIE
from .showroomlive import ShowRoomLiveIE from .showroomlive import ShowRoomLiveIE
from .simplecast import ( from .simplecast import (

View File

@ -232,6 +232,7 @@ def _real_extract(self, url):
class AdobeTVVideoIE(AdobeTVBaseIE): class AdobeTVVideoIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:video' IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']
_TEST = { _TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners

View File

@ -1,4 +1,3 @@
import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
@ -7,7 +6,6 @@
ExtractorError, ExtractorError,
determine_ext, determine_ext,
scale_thumbnails_to_max_format_width, scale_thumbnails_to_max_format_width,
unescapeHTML,
) )
@ -91,7 +89,7 @@ def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
if not embed_urls: if not embed_urls:
raise ExtractorError('no videos found for %s' % video_id, expected=True) raise ExtractorError('no videos found for %s' % video_id, expected=True)
return self.playlist_from_matches( return self.playlist_from_matches(
@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
IE_DESC = 'ant1news.gr embedded videos' IE_DESC = 'ant1news.gr embedded videos'
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer' _API_PATH = '/news/templates/data/jsonPlayer'
_TESTS = [{ _TESTS = [{
@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
}, },
}] }]
@classmethod
def _extract_urls(cls, webpage):
_EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
_EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
for mobj in re.finditer(_EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if not cls.suitable(url):
continue
yield url
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -340,30 +340,16 @@ def _get_anvato_videos(self, access_key, video_id):
'subtitles': subtitles, 'subtitles': subtitles,
} }
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield one ``anvato:<access_key>:<video_id>`` result per Anvato player found in ``webpage``.

    Every match of ``cls._ANVP_RE`` is expected to carry HTML-escaped JSON in
    its ``anvp`` group.  A player is skipped (never fatal) when its JSON cannot
    be parsed, when ``video`` is not a string of digits, or when no access key
    can be determined from ``accessKey`` or, failing that, from the ``mcp``
    entry looked up in ``cls._MCP_TO_ACCESS_KEY_TABLE``.
    """
    for mobj in re.finditer(cls._ANVP_RE, webpage):
        try:
            # Unescape the raw attribute text first and parse afterwards;
            # the unescaping helper works on text, not on the parsed object
            anvplayer_data = json.loads(unescapeHTML(mobj.group('anvp')))
        except ValueError:
            # One malformed player must not abort the scan of the whole page
            continue
        if not isinstance(anvplayer_data, dict):
            continue

        video_id = anvplayer_data.get('video')
        if not isinstance(video_id, str) or not video_id.isdigit():
            continue

        access_key = anvplayer_data.get('accessKey')
        if not access_key:
            mcp = anvplayer_data.get('mcp')
            if isinstance(mcp, str):
                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get(mcp.lower())
        if not access_key:
            continue

        yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)
def _extract_anvato_videos(self, webpage, video_id): def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json( anvplayer_data = self._parse_json(

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -10,6 +8,7 @@
class APAIE(InfoExtractor): class APAIE(InfoExtractor):
_VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'md5': '2b12292faeb0a7d930c778c7a5b4759b',
@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id, base_url = mobj.group('id', 'base_url') video_id, base_url = mobj.group('id', 'base_url')

View File

@ -10,6 +10,7 @@
class AparatIE(InfoExtractor): class AparatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"']
_TESTS = [{ _TESTS = [{
'url': 'http://www.aparat.com/v/wP8On', 'url': 'http://www.aparat.com/v/wP8On',

View File

@ -70,8 +70,8 @@ class ArcPublishingIE(InfoExtractor):
], 'video-api-cdn.%s.arcpublishing.com/api'), ], 'video-api-cdn.%s.arcpublishing.com/api'),
] ]
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
entries = [] entries = []
# https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
@ -19,6 +17,8 @@ class ArkenaIE(InfoExtractor):
play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
) )
''' '''
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
'md5': '97f117754e5f3c020f5f26da4a44ebaf', 'md5': '97f117754e5f3c020f5f26da4a44ebaf',
@ -50,15 +50,6 @@ class ArkenaIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id = mobj.group('id') video_id = mobj.group('id')

View File

@ -204,6 +204,7 @@ def _real_extract(self, url):
class ArteTVEmbedIE(InfoExtractor): class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': { 'info_dict': {
@ -219,12 +220,6 @@ class ArteTVEmbedIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
qs = parse_qs(url) qs = parse_qs(url)
json_url = qs['json_url'][0] json_url = qs['json_url'][0]

View File

@ -22,6 +22,7 @@
class BandcampIE(InfoExtractor): class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
_TESTS = [{ _TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439', 'md5': 'c557841d5e50261777a6585648adf439',

View File

@ -46,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
) )
(?P<id>%s)(?!/(?:episodes|broadcasts|clips)) (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX ''' % _ID_REGEX
_EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
_LOGIN_URL = 'https://account.bbc.com/signin' _LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc' _NETRC_MACHINE = 'bbc'

View File

@ -13,6 +13,7 @@
class BitChuteIE(InfoExtractor): class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{ _TESTS = [{
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
'md5': '7e427d7ed7af5a75b5855705ec750e2b', 'md5': '7e427d7ed7af5a75b5855705ec750e2b',
@ -33,14 +34,6 @@ class BitChuteIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -1,5 +1,3 @@
import re
from ..utils import ( from ..utils import (
mimetype2ext, mimetype2ext,
parse_duration, parse_duration,
@ -13,7 +11,7 @@
class BloggerIE(InfoExtractor): class BloggerIE(InfoExtractor):
IE_NAME = 'blogger.com' IE_NAME = 'blogger.com'
_VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
_VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' _EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''']
_TESTS = [{ _TESTS = [{
'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
@ -26,10 +24,6 @@ class BloggerIE(InfoExtractor):
} }
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(BloggerIE._VALID_EMBED, webpage)
def _real_extract(self, url): def _real_extract(self, url):
token_id = self._match_id(url) token_id = self._match_id(url)
webpage = self._download_webpage(url, token_id) webpage = self._download_webpage(url, token_id)

View File

@ -81,7 +81,7 @@ def _real_extract(self, url):
continue continue
entries.append(self.url_result(video['url'])) entries.append(self.url_result(video['url']))
facebook_urls = FacebookIE._extract_urls(webpage) facebook_urls = FacebookIE._extract_embed_urls(url, webpage)
entries.extend([ entries.extend([
self.url_result(facebook_url) self.url_result(facebook_url)
for facebook_url in facebook_urls]) for facebook_url in facebook_urls])

View File

@ -14,6 +14,7 @@ class Channel9IE(InfoExtractor):
IE_DESC = 'Channel 9' IE_DESC = 'Channel 9'
IE_NAME = 'channel9' IE_NAME = 'channel9'
_VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b']
_TESTS = [{ _TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
@ -78,12 +79,6 @@ class Channel9IE(InfoExtractor):
_RSS_URL = 'http://channel9.msdn.com/%s/RSS' _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
webpage)
def _extract_list(self, video_id, rss_url=None): def _extract_list(self, video_id, rss_url=None):
if not rss_url: if not rss_url:
rss_url = self._RSS_URL % video_id rss_url = self._RSS_URL % video_id

View File

@ -7,6 +7,8 @@
class CinchcastIE(InfoExtractor): class CinchcastIE(InfoExtractor):
_VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
'info_dict': { 'info_dict': {

View File

@ -1,5 +1,4 @@
import base64 import base64
import re
from .common import InfoExtractor from .common import InfoExtractor
@ -16,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor):
) )
(?P<id>%s) (?P<id>%s)
''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
_EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
'info_dict': { 'info_dict': {
@ -37,14 +37,6 @@ class CloudflareStreamIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE),
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'

View File

@ -3882,6 +3882,11 @@ def _extract_embed_urls(cls, url, webpage):
class StopExtraction(Exception): class StopExtraction(Exception):
pass pass
@classmethod
def _extract_url(cls, webpage): # TODO: Remove
"""Only for compatibility with some older extractors"""
return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
class SearchInfoExtractor(InfoExtractor): class SearchInfoExtractor(InfoExtractor):
""" """

View File

@ -58,7 +58,10 @@ class CondeNastIE(InfoExtractor):
)''' % '|'.join(_SITES.keys()) )''' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) _EMBED_REGEX = [r'''(?x)
<(?:iframe|script)[^>]+?src=(["\'])(?P<url>
(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
)\1''' % '|'.join(_SITES.keys())]
_TESTS = [{ _TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',

View File

@ -7,6 +7,8 @@
class CrooksAndLiarsIE(InfoExtractor): class CrooksAndLiarsIE(InfoExtractor):
_VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)' _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
_EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi', 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
'info_dict': { 'info_dict': {

View File

@ -163,7 +163,7 @@ def add_referer(formats):
video_id = m.group('id') video_id = m.group('id')
video_type = 'program' if m.group('type') == 'prog' else 'clip' video_type = 'program' if m.group('type') == 'prog' else 'clip'
else: else:
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) senate_isvp_url = SenateISVPIE._extract_url(webpage)
if senate_isvp_url: if senate_isvp_url:
title = self._og_search_title(webpage) title = self._og_search_title(webpage)
surl = smuggle_url(senate_isvp_url, {'force_title': title}) surl = smuggle_url(senate_isvp_url, {'force_title': title})

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import ( from ..utils import (
@ -12,6 +10,7 @@
class DailyMailIE(InfoExtractor): class DailyMailIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)']
_TESTS = [{ _TESTS = [{
'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
'md5': 'f6129624562251f628296c3a9ffde124', 'md5': 'f6129624562251f628296c3a9ffde124',
@ -26,12 +25,6 @@ class DailyMailIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)

View File

@ -99,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
[/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
''' '''
IE_NAME = 'dailymotion' IE_NAME = 'dailymotion'
_EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
'md5': '074b95bdee76b9e3654137aee9c79dfe', 'md5': '074b95bdee76b9e3654137aee9c79dfe',
@ -208,18 +209,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
} }
xid''' xid'''
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Yield the URLs of Dailymotion players embedded in ``webpage``.

    First yields whatever the inherited ``_extract_embed_urls`` finds, then
    one ``https://www.dailymotion.com/embed/video/<id>`` URL for every
    ``DM.player(...)`` JavaScript call whose options contain a ``video`` id.
    """
    # https://developer.dailymotion.com/player#player-parameters
    yield from super()._extract_embed_urls(url, webpage)
    for mobj in re.finditer(
            r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
        # Plain ``yield``: ``yield from`` on a str would emit it one character at a time
        yield 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url) url, smuggled_data = unsmuggle_url(url)
@ -378,6 +374,15 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
}] }]
_OBJECT_TYPE = 'collection' _OBJECT_TYPE = 'collection'
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Yield a ``//dailymotion.com/playlist/<id>`` URL for each playlist named in an embedded jukebox widget."""
    # Look for embedded Dailymotion playlist player (#3822)
    jukebox_iframe_re = r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1'
    for iframe in re.finditer(jukebox_iframe_re, webpage):
        # The src attribute is unescaped before its list[] entries are read
        widget_url = unescapeHTML(iframe.group('url'))
        yield from (
            '//dailymotion.com/playlist/%s' % playlist_id
            for playlist_id in re.findall(r'list\[\]=/playlist/([^/]+)/', widget_url))
class DailymotionUserIE(DailymotionPlaylistBaseIE): class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user' IE_NAME = 'dailymotion:user'

View File

@ -1,10 +1,9 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
class DBTVIE(InfoExtractor): class DBTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/',
'md5': 'b8f850ba1860adbda668d367f9b77699', 'md5': 'b8f850ba1860adbda668d367f9b77699',
@ -28,12 +27,6 @@ class DBTVIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
display_id, video_id = self._match_valid_url(url).groups() display_id, video_id = self._match_valid_url(url).groups()
info = { info = {

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none from ..utils import int_or_none
@ -25,6 +23,7 @@ class DigitekaIE(InfoExtractor):
) )
/id /id
)/(?P<id>[\d+a-z]+)''' )/(?P<id>[\d+a-z]+)'''
_EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)']
_TESTS = [{ _TESTS = [{
# news # news
'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
@ -58,14 +57,6 @@ class DigitekaIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id = mobj.group('id') video_id = mobj.group('id')

View File

@ -11,6 +11,7 @@
class DrTuberIE(InfoExtractor): class DrTuberIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?'
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)']
_TESTS = [{ _TESTS = [{
'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf',
'md5': '93e680cf2536ad0dfb7e74d94a89facd', 'md5': '93e680cf2536ad0dfb7e74d94a89facd',
@ -33,12 +34,6 @@ class DrTuberIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id = mobj.group('id') video_id = mobj.group('id')

View File

@ -1,3 +1,4 @@
import functools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -5,6 +6,7 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
smuggle_url,
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
) )
@ -18,6 +20,7 @@ class EaglePlatformIE(InfoExtractor):
) )
(?P<id>\d+) (?P<id>\d+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1']
_TESTS = [{ _TESTS = [{
# http://lenta.ru/news/2015/03/06/navalny/ # http://lenta.ru/news/2015/03/06/navalny/
'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
@ -52,14 +55,14 @@ class EaglePlatformIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod @classmethod
def _extract_url(webpage): def _extract_embed_urls(cls, url, webpage):
# Regular iframe embedding add_referer = functools.partial(smuggle_url, data={'referrer': url})
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', res = tuple(super()._extract_embed_urls(url, webpage))
webpage) if res:
if mobj is not None: return map(add_referer, res)
return mobj.group('url')
PLAYER_JS_RE = r''' PLAYER_JS_RE = r'''
<script[^>]+ <script[^>]+
src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
@ -74,7 +77,7 @@ def _extract_url(webpage):
data-id=["\'](?P<id>\d+) data-id=["\'](?P<id>\d+)
''' % PLAYER_JS_RE, webpage) ''' % PLAYER_JS_RE, webpage)
if mobj is not None: if mobj is not None:
return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]
# Generalization of "Javascript code usage", "Combined usage" and # Generalization of "Javascript code usage", "Combined usage" and
# "Usage without attaching to DOM" embeddings (see # "Usage without attaching to DOM" embeddings (see
# http://dultonmedia.github.io/eplayer/) # http://dultonmedia.github.io/eplayer/)
@ -95,7 +98,7 @@ def _extract_url(webpage):
</script> </script>
''' % PLAYER_JS_RE, webpage) ''' % PLAYER_JS_RE, webpage)
if mobj is not None: if mobj is not None:
return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]
@staticmethod @staticmethod
def _handle_error(response): def _handle_error(response):
@ -201,3 +204,14 @@ def _real_extract(self, url):
'age_limit': age_limit, 'age_limit': age_limit,
'formats': formats, 'formats': formats,
} }
class ClipYouEmbedIE(InfoExtractor):
    """Detects media.clipyou.ru player iframes and hands them over as ``eagleplatform:`` URLs."""

    # NOTE(review): presumably marks an embed-only extractor that never matches
    # a URL directly - confirm against InfoExtractor
    _VALID_URL = False

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield an ``eagleplatform:<host>:<record_id>`` URL for the first ClipYou iframe in ``webpage``.

        The URL of the embedding page is smuggled along as ``referrer``.
        """
        # [^"]* (not .*) keeps the match inside the src attribute, so a
        # record_id elsewhere on the same line cannot be picked up by mistake
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?[^"]*\brecord_id=(?P<id>\d+)[^"]*"', webpage)
        if mobj is not None:
            yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url})

View File

@ -1,3 +1,5 @@
import re
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote from ..compat import compat_urllib_parse_unquote
@ -9,5 +11,14 @@ class EmbedlyIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@classmethod
def _extract_embed_urls(cls, url, webpage):
# Bypass suitable check
for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage):
yield mobj.group('url')
for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage):
yield urllib.parse.unquote(mobj.group('url'))
def _real_extract(self, url): def _real_extract(self, url):
return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))

View File

@ -15,7 +15,6 @@
parse_iso8601, parse_iso8601,
str_or_none, str_or_none,
try_get, try_get,
unescapeHTML,
url_or_none, url_or_none,
variadic, variadic,
) )
@ -275,6 +274,7 @@ class ERTWebtvEmbedIE(InfoExtractor):
IE_DESC = 'ert.gr webtv embedded videos' IE_DESC = 'ert.gr webtv embedded videos'
_BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
_VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_TESTS = [{ _TESTS = [{
'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
@ -287,17 +287,6 @@ class ERTWebtvEmbedIE(InfoExtractor):
}, },
}] }]
@classmethod
def _extract_urls(cls, webpage):
EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
for mobj in re.finditer(EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if not cls.suitable(url):
continue
yield url
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
formats, subs = self._extract_m3u8_formats_and_subtitles( formats, subs = self._extract_m3u8_formats_and_subtitles(

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -17,6 +15,7 @@ class ExpressenIE(InfoExtractor):
tv/(?:[^/]+/)* tv/(?:[^/]+/)*
(?P<id>[^/?#&]+) (?P<id>[^/?#&]+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e',
@ -45,13 +44,6 @@ class ExpressenIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url') for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)

View File

@ -57,6 +57,13 @@ class FacebookIE(InfoExtractor):
) )
(?P<id>[0-9]+) (?P<id>[0-9]+)
''' '''
_EMBED_REGEX = [
r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
# Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
]
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook' _NETRC_MACHINE = 'facebook'
@ -311,21 +318,6 @@ class FacebookIE(InfoExtractor):
'graphURI': '/api/graphql/' 'graphURI': '/api/graphql/'
} }
@staticmethod
def _extract_urls(webpage):
urls = []
for mobj in re.finditer(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
webpage):
urls.append(mobj.group('url'))
# Facebook API embed
# see https://developers.facebook.com/docs/plugins/embedded-video-player
for mobj in re.finditer(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
urls.append(mobj.group('url'))
return urls
def _perform_login(self, username, password): def _perform_login(self, username, password):
login_page_req = sanitized_Request(self._LOGIN_URL) login_page_req = sanitized_Request(self._LOGIN_URL)
self._set_cookie('facebook.com', 'locale', 'en_US') self._set_cookie('facebook.com', 'locale', 'en_US')

View File

@ -56,8 +56,8 @@ class FoxNewsIE(AMPIE):
}, },
] ]
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
return [ return [
f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}'
for mobj in re.finditer( for mobj in re.finditer(
@ -125,4 +125,4 @@ def _real_extract(self, url):
'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key())
return self.url_result( return self.url_result(
FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) FoxNewsIE._extract_embed_urls(url, webpage)[0], FoxNewsIE.ie_key())

View File

@ -32,6 +32,7 @@ class FranceTVIE(InfoExtractor):
(?P<id>[^@]+)(?:@(?P<catalog>.+))? (?P<id>[^@]+)(?:@(?P<catalog>.+))?
) )
''' '''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
_TESTS = [{ _TESTS = [{
# without catalog # without catalog
@ -370,7 +371,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
dailymotion_urls = DailymotionIE._extract_urls(webpage) dailymotion_urls = DailymotionIE._extract_embed_urls(url, webpage)
if dailymotion_urls: if dailymotion_urls:
return self.playlist_result([ return self.playlist_result([
self.url_result(dailymotion_url, DailymotionIE.ie_key()) self.url_result(dailymotion_url, DailymotionIE.ie_key())

View File

@ -11,7 +11,7 @@
class GediDigitalIE(InfoExtractor): class GediDigitalIE(InfoExtractor):
_VALID_URL = r'''(?x:(?P<url>(?:https?:)//video\. _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\.
(?: (?:
(?: (?:
(?:espresso\.)?repubblica (?:espresso\.)?repubblica
@ -34,6 +34,12 @@ class GediDigitalIE(InfoExtractor):
|lasentinella |lasentinella
)\.gelocal )\.gelocal
)\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))''' )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))'''
_EMBED_REGEX = [rf'''(?x)
(?:
data-frame-src=|
<iframe[^\n]+src=
)
(["'])(?P<url>{_VALID_URL})\1''']
_TESTS = [{ _TESTS = [{
'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
'md5': '84658d7fb9e55a6e57ecc77b73137494', 'md5': '84658d7fb9e55a6e57ecc77b73137494',
@ -109,22 +115,9 @@ def _sanitize_urls(urls):
urls[i] = urljoin(base_url(e), url_basename(e)) urls[i] = urljoin(base_url(e), url_basename(e))
return urls return urls
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
entries = [ return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage)))
mobj.group('eurl')
for mobj in re.finditer(r'''(?x)
(?:
data-frame-src=|
<iframe[^\n]+src=
)
(["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
return GediDigitalIE._sanitize_urls(entries)
@staticmethod
def _extract_url(webpage):
urls = GediDigitalIE._extract_urls(webpage)
return urls[0] if urls else None
@staticmethod @staticmethod
def _clean_formats(formats): def _clean_formats(formats):
@ -139,8 +132,7 @@ def _clean_formats(formats):
formats[:] = clean_formats formats[:] = clean_formats
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id, url = self._match_valid_url(url).group('id', 'base_url')
url = self._match_valid_url(url).group('url')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_meta( title = self._html_search_meta(
['twitter:title', 'og:title'], webpage, fatal=True) ['twitter:title', 'og:title'], webpage, fatal=True)

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -11,6 +9,7 @@
class GfycatIE(InfoExtractor): class GfycatIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
_EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
_TESTS = [{ _TESTS = [{
'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
'info_dict': { 'info_dict': {
@ -82,14 +81,6 @@ class GfycatIE(InfoExtractor):
'only_matching': True 'only_matching': True
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL,
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -174,7 +174,7 @@ def build_player_url(cls, video_id, integration, origin_url=None):
return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
@classmethod @classmethod
def _extract_urls(cls, webpage, origin_url): def _extract_embed_urls(cls, url, webpage):
# https://docs.glomex.com/publisher/video-player-integration/javascript-api/ # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
quot_re = r'["\']' quot_re = r'["\']'
@ -183,9 +183,9 @@ def _extract_urls(cls, webpage, origin_url):
(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
)(?P=q)''' )(?P=q)'''
for mobj in re.finditer(regex, webpage): for mobj in re.finditer(regex, webpage):
url = unescapeHTML(mobj.group('url')) embed_url = unescapeHTML(mobj.group('url'))
if cls.suitable(url): if cls.suitable(embed_url):
yield cls._smuggle_origin_url(url, origin_url) yield cls._smuggle_origin_url(embed_url, url)
regex = fr'''(?x) regex = fr'''(?x)
<glomex-player [^>]+?>| <glomex-player [^>]+?>|
@ -193,7 +193,7 @@ def _extract_urls(cls, webpage, origin_url):
for mobj in re.finditer(regex, webpage): for mobj in re.finditer(regex, webpage):
attrs = extract_attributes(mobj.group(0)) attrs = extract_attributes(mobj.group(0))
if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url)
# naive parsing of inline scripts for hard-coded integration parameters # naive parsing of inline scripts for hard-coded integration parameters
regex = fr'''(?x) regex = fr'''(?x)
@ -206,7 +206,7 @@ def _extract_urls(cls, webpage, origin_url):
continue continue
playlist_id = re.search(regex % 'playlistId', script) playlist_id = re.search(regex % 'playlistId', script)
if playlist_id: if playlist_id:
yield cls.build_player_url(playlist_id, integration_id, origin_url) yield cls.build_player_url(playlist_id, integration_id, url)
def _real_extract(self, url): def _real_extract(self, url):
url, origin_url = self._unsmuggle_origin_url(url) url, origin_url = self._unsmuggle_origin_url(url)

View File

@ -77,13 +77,13 @@ class GoogleDriveIE(InfoExtractor):
_caption_formats_ext = [] _caption_formats_ext = []
_captions_xml = None _captions_xml = None
@staticmethod @classmethod
def _extract_url(webpage): def _extract_embed_urls(cls, url, webpage):
mobj = re.search( mobj = re.search(
r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
webpage) webpage)
if mobj: if mobj:
return 'https://drive.google.com/file/d/%s' % mobj.group('id') yield 'https://drive.google.com/file/d/%s' % mobj.group('id')
def _download_subtitles_xml(self, video_id, subtitles_id, hl): def _download_subtitles_xml(self, video_id, subtitles_id, hl):
if self._captions_xml: if self._captions_xml:

View File

@ -121,7 +121,7 @@ def _make_kaltura_result(kaltura_url):
if kaltura_id: if kaltura_id:
return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id)
yt_urls = YoutubeIE._extract_urls(webpage) yt_urls = YoutubeIE._extract_embed_urls(url, webpage)
if yt_urls: if yt_urls:
return self.playlist_from_matches( return self.playlist_from_matches(
yt_urls, video_id, title, ie=YoutubeIE.ie_key()) yt_urls, video_id, title, ie=YoutubeIE.ie_key())

View File

@ -17,6 +17,7 @@ class HuffPostIE(InfoExtractor):
HPLEmbedPlayer/\?segmentId= HPLEmbedPlayer/\?segmentId=
) )
(?P<id>[0-9a-f]+)''' (?P<id>[0-9a-f]+)'''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1']
_TEST = { _TEST = {
'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import ( from ..utils import (
@ -12,6 +10,14 @@
class IndavideoEmbedIE(InfoExtractor): class IndavideoEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
# Some example URLs covered by generic extractor:
# http://indavideo.hu/video/Vicces_cica_1
# http://index.indavideo.hu/video/2015_0728_beregszasz
# http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
# http://erotika.indavideo.hu/video/Amator_tini_punci
# http://film.indavideo.hu/video/f_hrom_nagymamm_volt
# http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)']
_TESTS = [{ _TESTS = [{
'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
@ -37,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
# Some example URLs covered by generic extractor:
# http://indavideo.hu/video/Vicces_cica_1
# http://index.indavideo.hu/video/2015_0728_beregszasz
# http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
# http://erotika.indavideo.hu/video/Amator_tini_punci
# http://film.indavideo.hu/video/f_hrom_nagymamm_volt
# http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -243,6 +243,7 @@ def _real_extract(self, url):
class InstagramIE(InstagramBaseIE): class InstagramIE(InstagramBaseIE):
_VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516', 'md5': '0d2da106a9d2631273e192b372806516',
@ -346,23 +347,16 @@ class InstagramIE(InstagramBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod @classmethod
def _extract_embed_url(webpage): def _extract_embed_urls(cls, url, webpage):
mobj = re.search( res = tuple(super()._extract_embed_urls(url, webpage))
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', if res:
webpage) return res
if mobj:
return mobj.group('url')
blockquote_el = get_element_by_attribute( mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1',
'class', 'instagram-media', webpage) get_element_by_attribute('class', 'instagram-media', webpage) or '')
if blockquote_el is None:
return
mobj = re.search(
r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
if mobj: if mobj:
return mobj.group('link') return [mobj.group('link')]
def _real_extract(self, url): def _real_extract(self, url):
video_id, url = self._match_valid_url(url).group('id', 'url') video_id, url = self._match_valid_url(url).group('id', 'url')

View File

@ -13,6 +13,7 @@ class IviIE(InfoExtractor):
IE_DESC = 'ivi.ru' IE_DESC = 'ivi.ru'
IE_NAME = 'ivi' IE_NAME = 'ivi'
_VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)'
_EMBED_REGEX = [r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1']
_GEO_BYPASS = False _GEO_BYPASS = False
_GEO_COUNTRIES = ['RU'] _GEO_COUNTRIES = ['RU']
_LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c'

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import ( from ..utils import (
@ -18,6 +16,7 @@ class JojIE(InfoExtractor):
) )
(?P<id>[^/?#^]+) (?P<id>[^/?#^]+)
''' '''
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
'info_dict': { 'info_dict': {
@ -38,14 +37,6 @@ class JojIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -22,13 +22,8 @@ class JWPlatformIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod @classmethod
def _extract_url(webpage): def _extract_embed_urls(cls, url, webpage):
urls = JWPlatformIE._extract_urls(webpage)
return urls[0] if urls else None
@staticmethod
def _extract_urls(webpage):
for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')):
# <input value=URL> is used by hyland.com # <input value=URL> is used by hyland.com
# if we find <iframe>, dont look for <input> # if we find <iframe>, dont look for <input>

View File

@ -111,13 +111,8 @@ class KalturaIE(InfoExtractor):
} }
] ]
@staticmethod @classmethod
def _extract_url(webpage): def _extract_embed_urls(cls, url, webpage):
urls = KalturaIE._extract_urls(webpage)
return urls[0] if urls else None
@staticmethod
def _extract_urls(webpage):
# Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
finditer = ( finditer = (
list(re.finditer( list(re.finditer(
@ -159,14 +154,14 @@ def _extract_urls(webpage):
for k, v in embed_info.items(): for k, v in embed_info.items():
if v: if v:
embed_info[k] = v.strip() embed_info[k] = v.strip()
url = 'kaltura:%(partner_id)s:%(id)s' % embed_info embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
escaped_pid = re.escape(embed_info['partner_id']) escaped_pid = re.escape(embed_info['partner_id'])
service_mobj = re.search( service_mobj = re.search(
r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
webpage) webpage)
if service_mobj: if service_mobj:
url = smuggle_url(url, {'service_url': service_mobj.group('id')}) embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')})
urls.append(url) urls.append(embed_url)
return urls return urls
def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
@ -10,8 +8,6 @@
parse_iso8601, parse_iso8601,
strip_or_none, strip_or_none,
try_get, try_get,
unescapeHTML,
urljoin,
) )
@ -55,6 +51,7 @@ class KinjaEmbedIE(InfoExtractor):
vine| vine|
youtube-(?:list|video) youtube-(?:list|video)
)-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)
_EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
'only_matching': True, 'only_matching': True,
@ -119,12 +116,6 @@ class KinjaEmbedIE(InfoExtractor):
'youtube-video': ('youtube.com/embed/', 'Youtube'), 'youtube-video': ('youtube.com/embed/', 'Youtube'),
} }
@staticmethod
def _extract_urls(webpage, url):
return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer(
r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX),
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_type, video_id = self._match_valid_url(url).groups() video_type, video_id = self._match_valid_url(url).groups()

View File

@ -10,6 +10,7 @@
class LibsynIE(InfoExtractor): class LibsynIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',

View File

@ -17,7 +17,7 @@ class LimelightBaseIE(InfoExtractor):
_PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
@classmethod @classmethod
def _extract_urls(cls, webpage, source_url): def _extract_embed_urls(cls, url, webpage):
lm = { lm = {
'Media': 'media', 'Media': 'media',
'Channel': 'channel', 'Channel': 'channel',
@ -25,7 +25,7 @@ def _extract_urls(cls, webpage, source_url):
} }
def smuggle(url): def smuggle(url):
return smuggle_url(url, {'source_url': source_url}) return smuggle_url(url, {'source_url': url})
entries = [] entries = []
for kind, video_id in re.findall( for kind, video_id in re.findall(

View File

@ -23,6 +23,8 @@
class LivestreamIE(InfoExtractor): class LivestreamIE(InfoExtractor):
IE_NAME = 'livestream' IE_NAME = 'livestream'
_VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
_EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"']
_TESTS = [{ _TESTS = [{
'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
'md5': '53274c76ba7754fb0e8d072716f2292b', 'md5': '53274c76ba7754fb0e8d072716f2292b',

View File

@ -14,6 +14,7 @@
class MainStreamingIE(InfoExtractor): class MainStreamingIE(InfoExtractor):
_VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?']
IE_DESC = 'MainStreaming Player' IE_DESC = 'MainStreaming Player'
_TESTS = [ _TESTS = [
@ -102,13 +103,6 @@ class MainStreamingIE(InfoExtractor):
} }
] ]
@staticmethod
def _extract_urls(webpage):
mobj = re.findall(
r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
if mobj:
return [group[0] for group in mobj]
def _playlist_entries(self, host, playlist_content): def _playlist_entries(self, host, playlist_content):
for entry in playlist_content: for entry in playlist_content:
content_id = entry.get('contentID') content_id = entry.get('contentID')

View File

@ -3,11 +3,29 @@
compat_b64decode, compat_b64decode,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
) )
from ..utils import int_or_none from ..utils import classproperty, int_or_none
class MangomoloBaseIE(InfoExtractor): class MangomoloBaseIE(InfoExtractor):
_BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' _BASE_REGEX = r'(?:https?:)?//(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)'
_SLUG = None
@classproperty
def _VALID_URL(cls):
return f'{cls._BASE_REGEX}{cls._SLUG}'
@classproperty
def _EMBED_REGEX(cls):
return [rf'<iframe[^>]+src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1']
def _extract_from_webpage(self, url, webpage):
for res in super()._extract_from_webpage(url, webpage):
yield {
**res,
'_type': 'url_transparent',
'id': self._search_regex(self._SLUG, res['url'], 'id', group='id'),
'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'),
}
def _get_real_id(self, page_id): def _get_real_id(self, page_id):
return page_id return page_id
@ -41,14 +59,15 @@ def _real_extract(self, url):
class MangomoloVideoIE(MangomoloBaseIE): class MangomoloVideoIE(MangomoloBaseIE):
_TYPE = 'video' _TYPE = 'video'
IE_NAME = 'mangomolo:' + _TYPE IE_NAME = 'mangomolo:' + _TYPE
_VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)' _SLUG = r'video\?.*?\bid=(?P<id>\d+)'
_IS_LIVE = False _IS_LIVE = False
class MangomoloLiveIE(MangomoloBaseIE): class MangomoloLiveIE(MangomoloBaseIE):
_TYPE = 'live' _TYPE = 'live'
IE_NAME = 'mangomolo:' + _TYPE IE_NAME = 'mangomolo:' + _TYPE
_VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' _SLUG = r'(?:live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)'
_IS_LIVE = True _IS_LIVE = True
def _get_real_id(self, page_id): def _get_real_id(self, page_id):

View File

@ -69,8 +69,8 @@ class MedialaanIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
entries = [] entries = []
for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage): for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage):
mychannels_id = extract_attributes(element).get('data-mychannels-id') mychannels_id = extract_attributes(element).get('data-mychannels-id')

View File

@ -167,8 +167,7 @@ class MediasetIE(ThePlatformBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod def _extract_from_webpage(self, url, webpage):
def _extract_urls(ie, webpage):
def _qs(url): def _qs(url):
return parse_qs(url) return parse_qs(url)
@ -188,8 +187,7 @@ def _program_guid(qs):
video_id = embed_qs.get('id', [None])[0] video_id = embed_qs.get('id', [None])[0]
if not video_id: if not video_id:
continue continue
urlh = ie._request_webpage( urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect')
embed_url, video_id, note='Following embed URL redirect')
embed_url = urlh.geturl() embed_url = urlh.geturl()
program_guid = _program_guid(_qs(embed_url)) program_guid = _program_guid(_qs(embed_url))
if program_guid: if program_guid:

View File

@ -13,7 +13,7 @@
str_or_none, str_or_none,
try_call, try_call,
try_get, try_get,
unescapeHTML, smuggle_url,
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
urljoin, urljoin,
@ -25,6 +25,7 @@
class MediasiteIE(InfoExtractor): class MediasiteIE(InfoExtractor):
_VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
_EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE]
_TESTS = [ _TESTS = [
{ {
'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
@ -112,13 +113,10 @@ class MediasiteIE(InfoExtractor):
5: 'video3', 5: 'video3',
} }
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
return [ for embed_url in super()._extract_embed_urls(url, webpage):
unescapeHTML(mobj.group('url')) yield smuggle_url(embed_url, {'UrlReferrer': url})
for mobj in re.finditer(
r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE,
webpage)]
def __extract_slides(self, *, stream_id, snum, Stream, duration, images): def __extract_slides(self, *, stream_id, snum, Stream, duration, images):
slide_base_url = Stream['SlideBaseUrl'] slide_base_url = Stream['SlideBaseUrl']

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import js_to_json
@ -8,6 +6,7 @@ class MegaphoneIE(InfoExtractor):
IE_NAME = 'megaphone.fm' IE_NAME = 'megaphone.fm'
IE_DESC = 'megaphone.fm embedded players' IE_DESC = 'megaphone.fm embedded players'
_VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
_EMBED_REGEX = [rf'<iframe[^>]*?\ssrc=["\'](?P<url>{_VALID_URL})']
_TEST = { _TEST = {
'url': 'https://player.megaphone.fm/GLT9749789991?"', 'url': 'https://player.megaphone.fm/GLT9749789991?"',
'md5': '4816a0de523eb3e972dc0dda2c191f96', 'md5': '4816a0de523eb3e972dc0dda2c191f96',
@ -45,8 +44,3 @@ def _real_extract(self, url):
'duration': episode_data['duration'], 'duration': episode_data['duration'],
'formats': formats, 'formats': formats,
} }
@classmethod
def _extract_urls(cls, webpage):
return [m[0] for m in re.findall(
r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]

View File

@ -104,7 +104,7 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom:embed' IE_NAME = 'megatvcom:embed'
IE_DESC = 'megatv.com embedded videos' IE_DESC = 'megatv.com embedded videos'
_VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)' _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
_EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''']
_TESTS = [{ _TESTS = [{
'url': 'https://www.megatv.com/embed/?p=2020520979', 'url': 'https://www.megatv.com/embed/?p=2020520979',
@ -134,11 +134,6 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
}, },
}] }]
@classmethod
def _extract_urls(cls, webpage):
for mobj in cls._EMBED_RE.finditer(webpage):
yield unescapeHTML(mobj.group('url'))
def _match_canonical_url(self, webpage): def _match_canonical_url(self, webpage):
LINK_RE = r'''(?x) LINK_RE = r'''(?x)
<link(?: <link(?:

View File

@ -92,6 +92,10 @@ class MLBIE(MLBBaseIE):
(?P<id>\d+) (?P<id>\d+)
) )
''' '''
_EMBED_REGEX = [
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)',
]
_TESTS = [ _TESTS = [
{ {
'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -59,17 +57,12 @@ def _real_extract(self, url):
class MofosexEmbedIE(InfoExtractor): class MofosexEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)']
_TESTS = [{ _TESTS = [{
'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM',
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
return self.url_result( return self.url_result(

View File

@ -331,6 +331,7 @@ def _real_extract(self, url):
class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvservices:embedded' IE_NAME = 'mtvservices:embedded'
_VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1']
_TEST = { _TEST = {
# From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
@ -346,13 +347,6 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
}, },
} }
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
if mobj:
return mobj.group('url')
def _get_feed_url(self, uri, url=None): def _get_feed_url(self, uri, url=None):
video_id = self._id_from_uri(uri) video_id = self._id_from_uri(uri)
config = self._download_json( config = self._download_json(

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from .vimple import SprutoBaseIE from .vimple import SprutoBaseIE
@ -26,6 +24,7 @@ class MyviIE(SprutoBaseIE):
) )
(?P<id>[\da-zA-Z_-]+) (?P<id>[\da-zA-Z_-]+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
@ -56,13 +55,6 @@ class MyviIE(SprutoBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -184,6 +184,7 @@ def _real_extract(self, url):
class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsVPlayerIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
_VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
_EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % _VALID_URL_BASE]
_TESTS = [{ _TESTS = [{
'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
@ -207,13 +208,6 @@ class NBCSportsVPlayerIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
video_urls = re.search(
r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
if video_urls:
return video_urls.group('url')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
@ -317,6 +311,7 @@ def _real_extract(self, url):
class NBCNewsIE(ThePlatformIE): class NBCNewsIE(ThePlatformIE):
_VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1']
_TESTS = [ _TESTS = [
{ {

View File

@ -114,8 +114,8 @@ def _extract_domain_id(webpage):
webpage) webpage)
return mobj.group('id') if mobj else None return mobj.group('id') if mobj else None
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
# Reference: # Reference:
# 1. https://nx-s.akamaized.net/files/201510/44.pdf # 1. https://nx-s.akamaized.net/files/201510/44.pdf
@ -135,10 +135,6 @@ def _extract_urls(webpage):
return entries return entries
@staticmethod
def _extract_url(webpage):
return NexxIE._extract_urls(webpage)[0]
def _handle_error(self, response): def _handle_error(self, response):
if traverse_obj(response, ('metadata', 'notice'), expected_type=str): if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice'])) self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice']))
@ -498,6 +494,8 @@ def find_video(result):
class NexxEmbedIE(InfoExtractor): class NexxEmbedIE(InfoExtractor):
_VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)' _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
# Reference. https://nx-s.akamaized.net/files/201510/44.pdf
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
'md5': '16746bfc28c42049492385c989b26c4a', 'md5': '16746bfc28c42049492385c989b26c4a',
@ -521,16 +519,6 @@ class NexxEmbedIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
# Reference:
# 1. https://nx-s.akamaized.net/files/201510/44.pdf
# iFrame Embed Integration
return [mobj.group('url') for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
embed_id = self._match_id(url) embed_id = self._match_id(url)

View File

@ -103,6 +103,7 @@ def get_file_size(file_size):
class NYTimesIE(NYTimesBaseIE): class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
_TESTS = [{ _TESTS = [{
'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring, compat_etree_fromstring,
@ -31,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor):
) )
(?P<id>[\d-]+) (?P<id>[\d-]+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
_TESTS = [{ _TESTS = [{
'note': 'Coub embedded', 'note': 'Coub embedded',
'url': 'http://ok.ru/video/1484130554189', 'url': 'http://ok.ru/video/1484130554189',
@ -161,13 +160,6 @@ class OdnoklassnikiIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
try: try:
return self._extract_desktop(url) return self._extract_desktop(url)

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import js_to_json from ..utils import js_to_json
@ -7,6 +5,7 @@
class OnionStudiosIE(InfoExtractor): class OnionStudiosIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)' _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)'
_EMBED_REGEX = [r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1']
_TESTS = [{ _TESTS = [{
'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
@ -29,13 +28,6 @@ class OnionStudiosIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -10,6 +10,7 @@
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none, int_or_none,
smuggle_url,
try_get, try_get,
unsmuggle_url, unsmuggle_url,
) )
@ -151,6 +152,29 @@ class OoyalaIE(OoyalaBaseIE):
} }
] ]
def _extract_from_webpage(self, url, webpage):
mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)
or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)
or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)
or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)
or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
embed_token = self._search_regex(
r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
webpage, 'ooyala embed token', default=None)
yield self._build_url_result(smuggle_url(
mobj.group('ec'), {
'domain': url,
'embed_token': embed_token,
}))
return
# Look for multiple Ooyala embeds on SBN network websites
mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
if mobj is not None:
for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []:
yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url}))
@staticmethod @staticmethod
def _url_for_embed_code(embed_code): def _url_for_embed_code(embed_code):
return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code

View File

@ -1,4 +1,3 @@
import re
import calendar import calendar
import json import json
import functools import functools
@ -73,15 +72,10 @@ def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs):
def _parse_fragment(url): def _parse_fragment(url):
return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()}
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE,
webpage)]
class PanoptoIE(PanoptoBaseIE): class PanoptoIE(PanoptoBaseIE):
_VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)'
_EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{PanoptoBaseIE.BASE_URL_RE}/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)']
_TESTS = [ _TESTS = [
{ {
'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',

View File

@ -1057,6 +1057,7 @@ class PeerTubeIE(InfoExtractor):
) )
(?P<id>%s) (?P<id>%s)
''' % (_INSTANCES_RE, _UUID_RE) ''' % (_INSTANCES_RE, _UUID_RE)
# Must be an f-string so the instance and UUID sub-patterns are interpolated;
# `cls` does not exist in the class body, so the names are referenced directly.
_EMBED_REGEX = [rf'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{_UUID_RE})''']
_TESTS = [{ _TESTS = [{
'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
'md5': '8563064d245a4be5705bddb22bb00a28', 'md5': '8563064d245a4be5705bddb22bb00a28',
@ -1158,16 +1159,15 @@ def _extract_peertube_url(webpage, source_url):
'>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
return 'peertube:%s:%s' % mobj.group('host', 'id') return 'peertube:%s:%s' % mobj.group('host', 'id')
@staticmethod @classmethod
def _extract_urls(webpage, source_url): def _extract_embed_urls(cls, url, webpage):
entries = re.findall( embeds = tuple(super()._extract_embed_urls(url, webpage))
r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' if embeds:
% (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) return embeds
if not entries:
peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) peertube_url = cls._extract_peertube_url(webpage, url)
if peertube_url: if peertube_url:
entries = [peertube_url] return [peertube_url]
return entries
def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True):
return self._download_json( return self._download_json(

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -67,6 +65,7 @@ class PeriscopeIE(PeriscopeBaseIE):
IE_DESC = 'Periscope' IE_DESC = 'Periscope'
IE_NAME = 'periscope' IE_NAME = 'periscope'
_VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
_EMBED_REGEX = [r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1']
# Alive example URLs can be found here https://www.periscope.tv/ # Alive example URLs can be found here https://www.periscope.tv/
_TESTS = [{ _TESTS = [{
'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
@ -92,13 +91,6 @@ class PeriscopeIE(PeriscopeBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
token = self._match_id(url) token = self._match_id(url)

View File

@ -30,6 +30,7 @@ class PikselIE(InfoExtractor):
)\.jp| )\.jp|
vidego\.baltimorecity\.gov vidego\.baltimorecity\.gov
)/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)'''
_EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)']
_TESTS = [ _TESTS = [
{ {
'url': 'http://player.piksel.com/v/ums2867l', 'url': 'http://player.piksel.com/v/ums2867l',
@ -62,14 +63,6 @@ class PikselIE(InfoExtractor):
} }
] ]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
webpage)
if mobj:
return mobj.group('url')
def _call_api(self, app_token, resource, display_id, query, fatal=True): def _call_api(self, app_token, resource, display_id, query, fatal=True):
response = (self._download_json( response = (self._download_json(
'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -24,6 +22,7 @@ class PladformIE(InfoExtractor):
) )
(?P<id>\d+) (?P<id>\d+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282', 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282',
'info_dict': { 'info_dict': {
@ -61,13 +60,6 @@ class PladformIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -7,6 +7,8 @@
class PlaywireIE(InfoExtractor): class PlaywireIE(InfoExtractor):
_VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)'
_EMBED_REGEX = [r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json',
'md5': 'e6398701e3595888125729eaa2329ed9', 'md5': 'e6398701e3595888125729eaa2329ed9',

View File

@ -128,6 +128,7 @@ class PornHubIE(PornHubBaseIE):
) )
(?P<id>[\da-z]+) (?P<id>[\da-z]+)
''' % PornHubBaseIE._PORNHUB_HOST_RE ''' % PornHubBaseIE._PORNHUB_HOST_RE
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
_TESTS = [{ _TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': 'a6391306d050e4547f62b3f485dd9ba9', 'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@ -257,12 +258,6 @@ class PornHubIE(PornHubBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)',
webpage)
def _extract_count(self, pattern, webpage, name): def _extract_count(self, pattern, webpage, name):
return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))

View File

@ -281,6 +281,20 @@ class RCSEmbedsIE(RCSBaseIE):
(?:gazzanet\.)?gazzetta (?:gazzanet\.)?gazzetta
)\.it) )\.it)
/video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)'''
_EMBED_REGEX = [r'''(?x)
(?:
data-frame-src=|
<iframe[^\n]+src=
)
(["'])
(?P<url>(?:https?:)?//video\.
(?:
rcs|
(?:corriere\w+\.)?corriere|
(?:gazzanet\.)?gazzetta
)
\.it/video-embed/.+?)
\1''']
_TESTS = [{ _TESTS = [{
'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
'md5': '623ecc8ffe7299b2d0c1046d8331a9df', 'md5': '623ecc8ffe7299b2d0c1046d8331a9df',
@ -321,30 +335,9 @@ def _sanitize_urls(urls):
urls[i] = urljoin(base_url(e), url_basename(e)) urls[i] = urljoin(base_url(e), url_basename(e))
return urls return urls
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_embed_urls(cls, url, webpage):
entries = [ return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage)))
mobj.group('url')
for mobj in re.finditer(r'''(?x)
(?:
data-frame-src=|
<iframe[^\n]+src=
)
(["'])
(?P<url>(?:https?:)?//video\.
(?:
rcs|
(?:corriere\w+\.)?corriere|
(?:gazzanet\.)?gazzetta
)
\.it/video-embed/.+?)
\1''', webpage)]
return RCSEmbedsIE._sanitize_urls(entries)
@staticmethod
def _extract_url(webpage):
urls = RCSEmbedsIE._extract_urls(webpage)
return urls[0] if urls else None
class RCSIE(RCSBaseIE): class RCSIE(RCSBaseIE):

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -14,6 +12,7 @@
class RedTubeIE(InfoExtractor): class RedTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)']
_TESTS = [{ _TESTS = [{
'url': 'https://www.redtube.com/38864951', 'url': 'https://www.redtube.com/38864951',
'md5': '4fba70cbca3aefd25767ab4b523c9878', 'md5': '4fba70cbca3aefd25767ab4b523c9878',
@ -37,12 +36,6 @@ class RedTubeIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( webpage = self._download_webpage(

View File

@ -8,6 +8,7 @@
class RtlNlIE(InfoExtractor): class RtlNlIE(InfoExtractor):
IE_NAME = 'rtl.nl' IE_NAME = 'rtl.nl'
IE_DESC = 'rtl.nl and rtlxl.nl' IE_DESC = 'rtl.nl and rtlxl.nl'
_EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)']
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?://(?:(?:www|static)\.)? https?://(?:(?:www|static)\.)?
(?: (?:

View File

@ -15,6 +15,7 @@
class RumbleEmbedIE(InfoExtractor): class RumbleEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
_EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})']
_TESTS = [{ _TESTS = [{
'url': 'https://rumble.com/embed/v5pv5f', 'url': 'https://rumble.com/embed/v5pv5f',
'md5': '36a18a049856720189f30977ccbb2c34', 'md5': '36a18a049856720189f30977ccbb2c34',
@ -51,11 +52,10 @@ class RumbleEmbedIE(InfoExtractor):
}] }]
@classmethod @classmethod
def _extract_urls(cls, webpage): def _extract_embed_urls(cls, url, webpage):
embeds = tuple(re.finditer( embeds = tuple(super()._extract_embed_urls(url, webpage))
fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{cls._VALID_URL})', webpage))
if embeds: if embeds:
return [mobj.group('url') for mobj in embeds] return embeds
return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]

View File

@ -1,4 +1,3 @@
import re
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
@ -94,6 +93,7 @@ class RutubeIE(RutubeBaseIE):
IE_NAME = 'rutube' IE_NAME = 'rutube'
IE_DESC = 'Rutube videos' IE_DESC = 'Rutube videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
@ -128,12 +128,6 @@ class RutubeIE(RutubeBaseIE):
def suitable(cls, url): def suitable(cls, url):
return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
@staticmethod
def _extract_urls(webpage):
return [mobj.group('url') for mobj in re.finditer(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
info = self._download_and_extract_info(video_id) info = self._download_and_extract_info(video_id)

View File

@ -20,6 +20,10 @@ class RUTVIE(InfoExtractor):
) )
(?P<id>\d+) (?P<id>\d+)
''' '''
# Patterns used to discover embedded players on third-party pages.
# The attribute must be named `_EMBED_REGEX` like in the other extractors;
# each pattern exposes the embed address through the named group `url`.
_EMBED_REGEX = [
    # <iframe> player embeds
    r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1',
    # og:video flash container; the closing quote (\2) stays outside the `url` group
    r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?)\2',
]
_TESTS = [ _TESTS = [
{ {
@ -107,19 +111,6 @@ class RUTVIE(InfoExtractor):
}, },
] ]
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
if mobj:
return mobj.group('url')
mobj = re.search(
r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id = mobj.group('id') video_id = mobj.group('id')

View File

@ -135,7 +135,7 @@ class RuutuIE(InfoExtractor):
_API_BASE = 'https://gatling.nelonenmedia.fi' _API_BASE = 'https://gatling.nelonenmedia.fi'
@classmethod @classmethod
def _extract_urls(cls, webpage): def _extract_embed_urls(cls, url, webpage):
# nelonen.fi # nelonen.fi
settings = try_call( settings = try_call(
lambda: json.loads(re.search( lambda: json.loads(re.search(

View File

@ -15,6 +15,12 @@ class SBSIE(InfoExtractor):
.*?\bplay=|/watch/ .*?\bplay=|/watch/
)|news/(?:embeds/)?video/ )|news/(?:embeds/)?video/
)(?P<id>[0-9]+)''' )(?P<id>[0-9]+)'''
# Verbose pattern locating SBS on-demand players referenced either from an
# og:video <meta> tag or an <iframe>; the address is captured in group `url`.
_EMBED_REGEX = [r'''(?x)
        (?:
            <meta\s+property="og:video"\s+content=|
            <iframe[^>]+?src=
        )
        (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''']
_TESTS = [{ _TESTS = [{
# Original URL is handled by the generic IE which finds the iframe: # Original URL is handled by the generic IE which finds the iframe:

View File

@ -49,6 +49,7 @@
class SenateISVPIE(InfoExtractor): class SenateISVPIE(InfoExtractor):
_IE_NAME = 'senate.gov:isvp' _IE_NAME = 'senate.gov:isvp'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"]
_TESTS = [{ _TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
@ -87,14 +88,6 @@ class SenateISVPIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})

View File

@ -43,14 +43,14 @@ class SendtoNewsIE(InfoExtractor):
_URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'
@classmethod @classmethod
def _extract_url(cls, webpage): def _extract_embed_urls(cls, url, webpage):
mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
(?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
.*\bSC=(?P<SC>[0-9a-zA-Z-]+).* .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
\1>''', webpage) \1>''', webpage)
if mobj: if mobj:
sc = mobj.group('SC') sc = mobj.group('SC')
return cls._URL_TEMPLATE % sc yield cls._URL_TEMPLATE % sc
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
@ -20,6 +18,7 @@ def _raw_id(src_url):
class SeznamZpravyIE(InfoExtractor): class SeznamZpravyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc='
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy',
'info_dict': { 'info_dict': {
@ -48,13 +47,6 @@ class SeznamZpravyIE(InfoExtractor):
}, },
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url') for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1',
webpage)]
def _extract_sdn_formats(self, sdn_url, video_id): def _extract_sdn_formats(self, sdn_url, video_id):
sdn_data = self._download_json(sdn_url, video_id) sdn_data = self._download_json(sdn_url, video_id)
@ -162,5 +154,5 @@ def _real_extract(self, url):
return self.playlist_result([ return self.playlist_result([
self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) self.url_result(entry_url, ie=SeznamZpravyIE.ie_key())
for entry_url in SeznamZpravyIE._extract_urls(webpage)], for entry_url in SeznamZpravyIE._extract_embed_urls(url, webpage)],
article_id, title, description) article_id, title, description)

View File

@ -0,0 +1,6 @@
from .common import InfoExtractor
class ShareVideosEmbedIE(InfoExtractor):
_VALID_URL = False
_EMBED_REGEX = [r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1']

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_podcast_url, clean_podcast_url,
@ -68,6 +66,11 @@ def _parse_episode(self, episode):
class SimplecastIE(SimplecastBaseIE): class SimplecastIE(SimplecastBaseIE):
IE_NAME = 'simplecast' IE_NAME = 'simplecast'
_VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
_EMBED_REGEX = [rf'''(?x)<iframe[^>]+src=["\']
(?P<url>https?://(?:
embed\.simplecast\.com/[0-9a-f]{8}|
player\.simplecast\.com/{SimplecastBaseIE._UUID_REGEX}
))''']
_COMMON_TEST_INFO = { _COMMON_TEST_INFO = {
'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
@ -94,15 +97,6 @@ class SimplecastIE(SimplecastBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'''(?x)<iframe[^>]+src=["\']
(
https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
player\.simplecast\.com/%s
))''' % SimplecastBaseIE._UUID_REGEX, webpage)
def _real_extract(self, url): def _real_extract(self, url):
episode_id = self._match_id(url) episode_id = self._match_id(url)
episode = self._call_api('episodes/%s', episode_id) episode = self._call_api('episodes/%s', episode_id)

View File

@ -33,18 +33,13 @@
class SoundcloudEmbedIE(InfoExtractor): class SoundcloudEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1']
_TEST = { _TEST = {
# from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
'only_matching': True, 'only_matching': True,
} }
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
query = parse_qs(url) query = parse_qs(url)
api_url = query['url'][0] api_url = query['url'][0]

View File

@ -21,6 +21,7 @@ class SpankwireIE(InfoExtractor):
) )
(?P<id>\d+) (?P<id>\d+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)']
_TESTS = [{ _TESTS = [{
# download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
@ -65,12 +66,6 @@ class SpankwireIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -11,6 +9,7 @@
class SportBoxIE(InfoExtractor): class SportBoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"']
_TESTS = [{ _TESTS = [{
'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
'info_dict': { 'info_dict': {
@ -42,12 +41,6 @@ class SportBoxIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -23,6 +23,7 @@ class SpotifyBaseIE(InfoExtractor):
'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d',
} }
_VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)' _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)'
_EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://open\.spotify.com/embed/[^"]+)"']
def _real_initialize(self): def _real_initialize(self):
self._ACCESS_TOKEN = self._download_json( self._ACCESS_TOKEN = self._download_json(
@ -97,12 +98,6 @@ def _extract_episode(self, episode, series):
'series': series, 'series': series,
} }
@classmethod
def _extract_urls(cls, webpage):
return re.findall(
r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
webpage)
class SpotifyIE(SpotifyBaseIE): class SpotifyIE(SpotifyBaseIE):
IE_NAME = 'spotify' IE_NAME = 'spotify'

View File

@ -21,6 +21,7 @@ class SpringboardPlatformIE(InfoExtractor):
xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+)
) )
''' '''
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1']
_TESTS = [{ _TESTS = [{
'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1',
'md5': '5c3cb7b5c55740d482561099e920f192', 'md5': '5c3cb7b5c55740d482561099e920f192',
@ -45,14 +46,6 @@ class SpringboardPlatformIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('id_2') video_id = mobj.group('id') or mobj.group('id_2')

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
@ -12,6 +10,7 @@
class StreamableIE(InfoExtractor): class StreamableIE(InfoExtractor):
_VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//streamable\.com/.+?)(?P=q1)']
_TESTS = [ _TESTS = [
{ {
'url': 'https://streamable.com/dnd1', 'url': 'https://streamable.com/dnd1',
@ -53,14 +52,6 @@ class StreamableIE(InfoExtractor):
} }
] ]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)',
webpage)
if mobj:
return mobj.group('src')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -46,14 +46,15 @@ class SubstackIE(InfoExtractor):
}] }]
@classmethod @classmethod
def _extract_url(cls, webpage, url): def _extract_embed_urls(cls, url, webpage):
if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage):
return return
mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage) mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage)
if mobj: if mobj:
parsed = urllib.parse.urlparse(url) parsed = urllib.parse.urlparse(url)
return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()
raise cls.StopExtraction()
def _extract_video_formats(self, video_id, username): def _extract_video_formats(self, video_id, username):
formats, subtitles = [], {} formats, subtitles = [], {}

View File

@ -101,6 +101,7 @@ def _extract_video(self, video_info, video_id):
class SVTIE(SVTBaseIE): class SVTIE(SVTBaseIE):
_VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
_EMBED_REGEX = [r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % _VALID_URL]
_TEST = { _TEST = {
'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', 'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
'md5': '33e9a5d8f646523ce0868ecfb0eed77d', 'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
@ -113,13 +114,6 @@ class SVTIE(SVTBaseIE):
}, },
} }
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
widget_id = mobj.group('widget_id') widget_id = mobj.group('widget_id')

View File

@ -140,12 +140,12 @@ def _is_teachable(webpage):
r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
webpage) webpage)
@staticmethod @classmethod
def _extract_url(webpage, source_url): def _extract_embed_urls(cls, url, webpage):
if not TeachableIE._is_teachable(webpage): if cls._is_teachable(webpage):
return if re.match(r'https?://[^/]+/(?:courses|p)', url):
if re.match(r'https?://[^/]+/(?:courses|p)', source_url): yield f'{cls._URL_PREFIX}{url}'
return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) raise cls.StopExtraction()
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
@ -160,7 +160,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
wistia_urls = WistiaIE._extract_urls(webpage) wistia_urls = WistiaIE._extract_embed_urls(url, webpage)
if not wistia_urls: if not wistia_urls:
if any(re.search(p, webpage) for p in ( if any(re.search(p, webpage) for p in (
r'class=["\']lecture-contents-locked', r'class=["\']lecture-contents-locked',

View File

@ -215,6 +215,7 @@ def _real_extract(self, url):
class TedEmbedIE(InfoExtractor): class TedEmbedIE(InfoExtractor):
_VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/' _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1']
_TESTS = [{ _TESTS = [{
'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace', 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
@ -233,10 +234,5 @@ class TedEmbedIE(InfoExtractor):
}, },
}] }]
@classmethod
def _extract_urls(cls, webpage):
return [mobj.group('url') for mobj in re.finditer(
fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)]
def _real_extract(self, url): def _real_extract(self, url):
return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key()) return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key())

View File

@ -123,6 +123,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
(?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)''' |theplatform:)(?P<id>[^/\?&]+)'''
_EMBED_REGEX = [
r'''(?x)
<meta\s+
property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1'
]
_TESTS = [{ _TESTS = [{
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@ -192,22 +199,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
}] }]
@classmethod @classmethod
def _extract_urls(cls, webpage): def _extract_embed_urls(cls, url, webpage):
m = re.search(
r'''(?x)
<meta\s+
property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
''', webpage)
if m:
return [m.group('url')]
# Are whitespaces ignored in URLs? # Are whitespaces ignored in URLs?
# https://github.com/ytdl-org/youtube-dl/issues/12044 # https://github.com/ytdl-org/youtube-dl/issues/12044
matches = re.findall( for embed_url in super()._extract_embed_urls(url, webpage):
r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) yield re.sub(r'\s', '', embed_url)
if matches:
return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
@staticmethod @staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):

View File

@ -1,5 +1,3 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTTPError from ..compat import compat_HTTPError
from ..utils import ( from ..utils import (
@ -16,6 +14,7 @@ class ThreeQSDNIE(InfoExtractor):
IE_NAME = '3qsdn' IE_NAME = '3qsdn'
IE_DESC = '3Q SDN' IE_DESC = '3Q SDN'
_VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_REGEX = [r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % _VALID_URL]
_TESTS = [{ _TESTS = [{
# https://player.3qsdn.com/demo.html # https://player.3qsdn.com/demo.html
'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be',
@ -76,12 +75,13 @@ class ThreeQSDNIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod def _extract_from_webpage(self, url, webpage):
def _extract_url(webpage): for res in super()._extract_from_webpage(url, webpage):
mobj = re.search( yield {
r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) **res,
if mobj: '_type': 'url_transparent',
return mobj.group('url') 'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'),
}
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -1,7 +1,6 @@
import itertools import itertools
import json import json
import random import random
import re
import string import string
import time import time
@ -379,6 +378,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url):
class TikTokIE(TikTokBaseIE): class TikTokIE(TikTokBaseIE):
_VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{ _TESTS = [{
'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
@ -529,11 +529,6 @@ class TikTokIE(TikTokBaseIE):
'only_matching': True 'only_matching': True
}] }]
@classmethod
def _extract_urls(cls, webpage):
return [mobj.group('url') for mobj in re.finditer(
rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{cls._VALID_URL})', webpage)]
def _extract_aweme_app(self, aweme_id): def _extract_aweme_app(self, aweme_id):
try: try:
aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,

Some files were not shown because too many files have changed in this diff Show More