[youtube] More metadata extraction for channels/playlists

This commit is contained in:
pukkandan 2021-02-02 21:51:32 +05:30
parent 18590cecdb
commit b60419c51a
2 changed files with 53 additions and 24 deletions

View File

@ -336,9 +336,8 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification. object, each element of which is a valid dictionary by this specification.
Additionally, playlists can have "id", "title", "description", "uploader", Additionally, playlists can have "id", "title", and any other relevant
"uploader_id", "uploader_url", "duration" attributes with the same semantics attributes with the same semantics as videos (see above).
as videos (see above).
_type "multi_video" indicates that there are multiple videos that _type "multi_video" indicates that there are multiple videos that
@ -967,10 +966,11 @@ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None,
urls, playlist_id=playlist_id, playlist_title=playlist_title) urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod @staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
"""Returns a playlist""" """Returns a playlist"""
video_info = {'_type': 'playlist', video_info = {'_type': 'playlist',
'entries': entries} 'entries': entries}
video_info.update(kwargs)
if playlist_id: if playlist_id:
video_info['id'] = playlist_id video_info['id'] = playlist_id
if playlist_title: if playlist_title:

View File

@ -31,6 +31,7 @@
clean_html, clean_html,
error_to_compat_str, error_to_compat_str,
ExtractorError, ExtractorError,
format_field,
float_or_none, float_or_none,
get_element_by_id, get_element_by_id,
int_or_none, int_or_none,
@ -2675,6 +2676,7 @@ def decrypt_sig(mobj):
'uploader': video_uploader, 'uploader': video_uploader,
'uploader_id': video_uploader_id, 'uploader_id': video_uploader_id,
'uploader_url': video_uploader_url, 'uploader_url': video_uploader_url,
'channel': video_uploader,
'channel_id': channel_id, 'channel_id': channel_id,
'channel_url': channel_url, 'channel_url': channel_url,
'upload_date': upload_date, 'upload_date': upload_date,
@ -3402,44 +3404,71 @@ def _extract_uploader(data):
uploader['uploader_url'] = urljoin( uploader['uploader_url'] = urljoin(
'https://www.youtube.com/', 'https://www.youtube.com/',
try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
return uploader return {k:v for k, v in uploader.items() if v is not None}
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
playlist_id = title = description = channel_url = channel_name = channel_id = None
thumbnails_list = tags = []
selected_tab = self._extract_selected_tab(tabs) selected_tab = self._extract_selected_tab(tabs)
renderer = try_get( renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict) data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
playlist_id = title = description = None
if renderer: if renderer:
channel_title = renderer.get('title') or item_id channel_name = renderer.get('title')
tab_title = selected_tab.get('title') channel_url = renderer.get('channelUrl')
title = channel_title or item_id channel_id = renderer.get('externalId')
if tab_title:
title += ' - %s' % tab_title
description = renderer.get('description')
playlist_id = renderer.get('externalId')
# this has thumbnails, but there is currently no thumbnail field for playlists
# sidebar.playlistSidebarRenderer has even more data, but its structure is more complex
renderer = try_get(
data, lambda x: x['microformat']['microformatDataRenderer'], dict)
if not renderer: if not renderer:
renderer = try_get( renderer = try_get(
data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
if renderer: if renderer:
title = renderer.get('title') title = renderer.get('title')
description = renderer.get('description') description = renderer.get('description')
playlist_id = item_id playlist_id = channel_id
tags = renderer.get('keywords', '').split()
thumbnails_list = (
try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
or data['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails']
or [])
thumbnails = []
for t in thumbnails_list:
if not isinstance(t, dict):
continue
thumbnail_url = url_or_none(t.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
'width': int_or_none(t.get('width')),
'height': int_or_none(t.get('height')),
})
if playlist_id is None: if playlist_id is None:
playlist_id = item_id playlist_id = item_id
if title is None: if title is None:
title = "Youtube " + playlist_id.title() title = playlist_id
playlist = self.playlist_result( title += format_field(selected_tab, 'title', ' - %s')
metadata = {
'playlist_id': playlist_id,
'playlist_title': title,
'playlist_description': description,
'uploader': channel_name,
'uploader_id': channel_id,
'uploader_url': channel_url,
'thumbnails': thumbnails,
'tags': tags,
}
if not channel_id:
metadata.update(self._extract_uploader(data))
metadata.update({
'channel': metadata['uploader'],
'channel_id': metadata['uploader_id'],
'channel_url': metadata['uploader_url']})
return self.playlist_result(
self._entries(selected_tab, identity_token), self._entries(selected_tab, identity_token),
playlist_id=playlist_id, playlist_title=title, **metadata)
playlist_description=description)
playlist.update(self._extract_uploader(data))
return playlist
def _extract_from_playlist(self, item_id, url, data, playlist): def _extract_from_playlist(self, item_id, url, data, playlist):
title = playlist.get('title') or try_get( title = playlist.get('title') or try_get(