[lazy_extractors] Fix bug in 2c4aaaddc9

SearchIEs must not inherit from extractors that have a _VALID_URL defined
This commit is contained in:
pukkandan 2021-12-03 03:16:08 +05:30
parent 43b2290658
commit 63ccf4ff1a
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698
2 changed files with 173 additions and 172 deletions

View File

@ -662,11 +662,32 @@ def pagefunc(pagenum):
} }
NicovideoSearchIE_NAME = 'nicovideo:search' class NicovideoSearchBaseIE(InfoExtractor):
def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'):
query = query or {}
pages = [query['page']] if 'page' in query else itertools.count(1)
for page_num in pages:
query['page'] = str(page_num)
webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num})
results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', webpage)
for item in results:
yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item)
if not results:
break
class NicovideoSearchURLIE(InfoExtractor): class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor):
IE_NAME = f'{NicovideoSearchIE_NAME}_url' IE_DESC = 'Nico video search'
IE_NAME = 'nicovideo:search'
_SEARCH_KEY = 'nicosearch'
def _search_results(self, query):
return self._entries(
self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
class NicovideoSearchURLIE(NicovideoSearchBaseIE):
IE_NAME = f'{NicovideoSearchIE.IE_NAME}_url'
IE_DESC = 'Nico video search URLs' IE_DESC = 'Nico video search URLs'
_VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?'
_TESTS = [{ _TESTS = [{
@ -685,37 +706,14 @@ class NicovideoSearchURLIE(InfoExtractor):
'playlist_count': 31, 'playlist_count': 31,
}] }]
def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'):
query = query or {}
pages = [query['page']] if 'page' in query else itertools.count(1)
for page_num in pages:
query['page'] = str(page_num)
webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num})
results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', webpage)
for item in results:
yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item)
if not results:
break
def _real_extract(self, url): def _real_extract(self, url):
query = self._match_id(url) query = self._match_id(url)
return self.playlist_result(self._entries(url, query), query, query) return self.playlist_result(self._entries(url, query), query, query)
class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE): class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor):
IE_DESC = 'Nico video search'
IE_NAME = NicovideoSearchIE_NAME
_SEARCH_KEY = 'nicosearch'
_TESTS = []
def _search_results(self, query):
return self._entries(
self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
class NicovideoSearchDateIE(NicovideoSearchIE):
IE_DESC = 'Nico video search, newest first' IE_DESC = 'Nico video search, newest first'
IE_NAME = f'{NicovideoSearchIE_NAME}:date' IE_NAME = f'{NicovideoSearchIE.IE_NAME}:date'
_SEARCH_KEY = 'nicosearchdate' _SEARCH_KEY = 'nicosearchdate'
_TESTS = [{ _TESTS = [{
'url': 'nicosearchdateall:a', 'url': 'nicosearchdateall:a',
@ -756,7 +754,7 @@ def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_nu
if page_num: if page_num:
query['page'] = str(page_num) query['page'] = str(page_num)
yield from NicovideoSearchURLIE._entries(self, url, item_id, query=query, note=note) yield from super()._entries(url, item_id, query=query, note=note)
class NiconicoUserIE(InfoExtractor): class NiconicoUserIE(InfoExtractor):

View File

@ -58,7 +58,149 @@ def _real_extract(self, url):
return self.url_result(api_url) return self.url_result(api_url)
class SoundcloudIE(InfoExtractor): class SoundcloudBaseIE(InfoExtractor):
_API_V2_BASE = 'https://api-v2.soundcloud.com/'
_BASE_URL = 'https://soundcloud.com/'
def _store_client_id(self, client_id):
self._downloader.cache.store('soundcloud', 'client_id', client_id)
def _update_client_id(self):
webpage = self._download_webpage('https://soundcloud.com/', None)
for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
script = self._download_webpage(src, None, fatal=False)
if script:
client_id = self._search_regex(
r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
script, 'client id', default=None)
if client_id:
self._CLIENT_ID = client_id
self._store_client_id(client_id)
return
raise ExtractorError('Unable to extract client id')
def _download_json(self, *args, **kwargs):
non_fatal = kwargs.get('fatal') is False
if non_fatal:
del kwargs['fatal']
query = kwargs.get('query', {}).copy()
for _ in range(2):
query['client_id'] = self._CLIENT_ID
kwargs['query'] = query
try:
return super()._download_json(*args, **compat_kwargs(kwargs))
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
self._store_client_id(None)
self._update_client_id()
continue
elif non_fatal:
self.report_warning(error_to_compat_str(e))
return False
raise
def _real_initialize(self):
self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
self._login()
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
_API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
_API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
_API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
_access_token = None
_HEADERS = {}
_NETRC_MACHINE = 'soundcloud'
def _login(self):
username, password = self._get_login_info()
if username is None:
return
if username == 'oauth' and password is not None:
self._access_token = password
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
payload = {'session': {'access_token': self._access_token}}
token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
if response is not False:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
self.report_login()
else:
self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
elif username is not None:
self.report_warning(
'Login using username and password is not currently supported. '
'Use "--user oauth --password <oauth_token>" to login using an oauth token')
r'''
def genDevId():
def genNumBlock():
return ''.join([str(random.randrange(10)) for i in range(6)])
return '-'.join([genNumBlock() for i in range(4)])
payload = {
'client_id': self._CLIENT_ID,
'recaptcha_pubkey': 'null',
'recaptcha_response': 'null',
'credentials': {
'identifier': username,
'password': password
},
'signature': self.sign(username, password, self._CLIENT_ID),
'device_id': genDevId(),
'user_agent': self._USER_AGENT
}
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(login, None)
self._access_token = response.get('session').get('access_token')
if not self._access_token:
self.report_warning('Unable to get access token, login may has failed')
else:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
'''
# signature generation
def sign(self, user, pw, clid):
a = 33
i = 1
s = 440123
w = 117
u = 1800000
l = 1042
b = 37
k = 37
c = 5
n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
y = '8' # _REV
r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
e = user # _USERNAME
t = clid # _CLIENT_ID
d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
p = n + y + d + r + e + t + d + n
h = p
m = 8011470
f = 0
for f in range(f, len(h)):
m = (m >> 1) + ((1 & m) << 23)
m += ord(h[f])
m &= 16777215
# c is not even needed
out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
return out
@classmethod
def _resolv_url(cls, url):
return cls._API_V2_BASE + 'resolve?url=' + url
class SoundcloudIE(SoundcloudBaseIE):
"""Information extractor for soundcloud.com """Information extractor for soundcloud.com
To access the media, the uid of the song and a stream token To access the media, the uid of the song and a stream token
must be extracted from the page source and the script must make must be extracted from the page source and the script must make
@ -250,8 +392,6 @@ class SoundcloudIE(InfoExtractor):
}, },
] ]
_API_V2_BASE = 'https://api-v2.soundcloud.com/'
_BASE_URL = 'https://soundcloud.com/'
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
_ARTWORK_MAP = { _ARTWORK_MAP = {
@ -267,143 +407,6 @@ class SoundcloudIE(InfoExtractor):
'original': 0, 'original': 0,
} }
def _store_client_id(self, client_id):
self._downloader.cache.store('soundcloud', 'client_id', client_id)
def _update_client_id(self):
webpage = self._download_webpage('https://soundcloud.com/', None)
for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
script = self._download_webpage(src, None, fatal=False)
if script:
client_id = self._search_regex(
r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
script, 'client id', default=None)
if client_id:
self._CLIENT_ID = client_id
self._store_client_id(client_id)
return
raise ExtractorError('Unable to extract client id')
def _download_json(self, *args, **kwargs):
non_fatal = kwargs.get('fatal') is False
if non_fatal:
del kwargs['fatal']
query = kwargs.get('query', {}).copy()
for _ in range(2):
query['client_id'] = self._CLIENT_ID
kwargs['query'] = query
try:
return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs))
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
self._store_client_id(None)
self._update_client_id()
continue
elif non_fatal:
self.report_warning(error_to_compat_str(e))
return False
raise
def _real_initialize(self):
self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
self._login()
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
_API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
_API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
_API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
_access_token = None
_HEADERS = {}
_NETRC_MACHINE = 'soundcloud'
def _login(self):
username, password = self._get_login_info()
if username is None:
return
if username == 'oauth' and password is not None:
self._access_token = password
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
payload = {'session': {'access_token': self._access_token}}
token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
if response is not False:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
self.report_login()
else:
self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
elif username is not None:
self.report_warning(
'Login using username and password is not currently supported. '
'Use "--user oauth --password <oauth_token>" to login using an oauth token')
r'''
def genDevId():
def genNumBlock():
return ''.join([str(random.randrange(10)) for i in range(6)])
return '-'.join([genNumBlock() for i in range(4)])
payload = {
'client_id': self._CLIENT_ID,
'recaptcha_pubkey': 'null',
'recaptcha_response': 'null',
'credentials': {
'identifier': username,
'password': password
},
'signature': self.sign(username, password, self._CLIENT_ID),
'device_id': genDevId(),
'user_agent': self._USER_AGENT
}
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(login, None)
self._access_token = response.get('session').get('access_token')
if not self._access_token:
self.report_warning('Unable to get access token, login may has failed')
else:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
'''
# signature generation
def sign(self, user, pw, clid):
a = 33
i = 1
s = 440123
w = 117
u = 1800000
l = 1042
b = 37
k = 37
c = 5
n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
y = '8' # _REV
r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
e = user # _USERNAME
t = clid # _CLIENT_ID
d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
p = n + y + d + r + e + t + d + n
h = p
m = 8011470
f = 0
for f in range(f, len(h)):
m = (m >> 1) + ((1 & m) << 23)
m += ord(h[f])
m &= 16777215
# c is not even needed
out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
return out
@classmethod
def _resolv_url(cls, url):
return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url
def _extract_info_dict(self, info, full_title=None, secret_token=None): def _extract_info_dict(self, info, full_title=None, secret_token=None):
track_id = compat_str(info['id']) track_id = compat_str(info['id'])
title = info['title'] title = info['title']
@ -581,7 +584,7 @@ def _real_extract(self, url):
return self._extract_info_dict(info, full_title, token) return self._extract_info_dict(info, full_title, token)
class SoundcloudPlaylistBaseIE(SoundcloudIE): class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
def _extract_set(self, playlist, token=None): def _extract_set(self, playlist, token=None):
playlist_id = compat_str(playlist['id']) playlist_id = compat_str(playlist['id'])
tracks = playlist.get('tracks') or [] tracks = playlist.get('tracks') or []
@ -654,7 +657,7 @@ def _real_extract(self, url):
return self._extract_set(info, token) return self._extract_set(info, token)
class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
def _extract_playlist(self, base_url, playlist_id, playlist_title): def _extract_playlist(self, base_url, playlist_id, playlist_title):
return { return {
'_type': 'playlist', '_type': 'playlist',
@ -853,7 +856,7 @@ def _real_extract(self, url):
return self._extract_set(data, token) return self._extract_set(data, token)
class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor):
IE_NAME = 'soundcloud:search' IE_NAME = 'soundcloud:search'
IE_DESC = 'Soundcloud search' IE_DESC = 'Soundcloud search'
_SEARCH_KEY = 'scsearch' _SEARCH_KEY = 'scsearch'