mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-27 02:42:30 +00:00
[extractor/rutube] Extract chapters from description (#6345)
Authored by: mushbite
This commit is contained in:
parent
08ff6d59f9
commit
22ccd5420b
@ -3649,6 +3649,38 @@ def _generic_title(self, url='', webpage='', *, default=None):
|
|||||||
or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
|
or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
|
||||||
or default)
|
or default)
|
||||||
|
|
||||||
|
def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
    """Build a validated list of chapter dicts from raw chapter entries.

    @param chapter_list    Iterable of raw chapter entries; None is treated as empty
    @param start_function  Callable mapping a raw entry to its start time
                           (a number, or None when it cannot be determined)
    @param title_function  Callable mapping a raw entry to its title
    @param duration        Total duration used as the upper bound for start times.
                           When falsy, no chapters are extracted and None is returned
    @param strict          When True, entries are processed in the given order and
                           out-of-order entries are rejected with a warning.
                           When False, entries are sorted by start time first
    @returns               List of {'start_time': ..., 'title': ...} dicts,
                           or None if duration is falsy
    """
    if not duration:
        return
    chapter_list = [{
        'start_time': start_function(chapter),
        'title': title_function(chapter),
    } for chapter in chapter_list or []]
    if not strict:
        # Entries without a start time sort as 0; they are reported below
        chapter_list.sort(key=lambda c: c['start_time'] or 0)

    # Sentinel entry so that the first real chapter is compared against 0
    chapters = [{'start_time': 0}]
    for idx, chapter in enumerate(chapter_list):
        start_time = chapter['start_time']
        if start_time is None:
            self.report_warning(f'Incomplete chapter {idx}')
        elif chapters[-1]['start_time'] <= start_time <= duration:
            chapters.append(chapter)
        elif chapter not in chapters:
            # Entries identical to an already accepted chapter are skipped silently.
            # Report the comparison that actually failed instead of always
            # claiming that the chapter starts before the previous one
            issue = (f'{start_time} > {duration}' if start_time > duration
                     else f'{start_time} < {chapters[-1]["start_time"]}')
            self.report_warning(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
    # Drop the sentinel
    return chapters[1:]
|
||||||
|
|
||||||
|
def _extract_chapters_from_description(self, description, duration):
    """Extract chapters from timestamped lines in a description.

    Lines of the form "<timestamp> <title>" are tried first; if that yields
    no chapters, lines of the form "<title> <timestamp>" are tried instead.
    A description of None is treated as an empty string.
    """
    duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
    text = description or ''

    # Timestamp first, title second
    found = self._extract_chapters_helper(
        re.findall(sep_re % (duration_re, r'.+?'), text),
        start_function=lambda x: parse_duration(x[0]),
        title_function=lambda x: x[1],
        duration=duration, strict=False)
    if found:
        return found

    # Fallback: title first, timestamp second
    return self._extract_chapters_helper(
        re.findall(sep_re % (r'.+?', duration_re), text),
        start_function=lambda x: parse_duration(x[1]),
        title_function=lambda x: x[0],
        duration=duration, strict=False)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
|
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
|
||||||
all_known = all(map(
|
all_known = all(map(
|
||||||
|
@ -25,8 +25,7 @@ def _download_api_info(self, video_id, query=None):
|
|||||||
video_id, 'Downloading video JSON',
|
video_id, 'Downloading video JSON',
|
||||||
'Unable to download video JSON', query=query)
|
'Unable to download video JSON', query=query)
|
||||||
|
|
||||||
@staticmethod
|
def _extract_info(self, video, video_id=None, require_title=True):
|
||||||
def _extract_info(video, video_id=None, require_title=True):
|
|
||||||
title = video['title'] if require_title else video.get('title')
|
title = video['title'] if require_title else video.get('title')
|
||||||
|
|
||||||
age_limit = video.get('is_adult')
|
age_limit = video.get('is_adult')
|
||||||
@ -35,13 +34,15 @@ def _extract_info(video, video_id=None, require_title=True):
|
|||||||
|
|
||||||
uploader_id = try_get(video, lambda x: x['author']['id'])
|
uploader_id = try_get(video, lambda x: x['author']['id'])
|
||||||
category = try_get(video, lambda x: x['category']['name'])
|
category = try_get(video, lambda x: x['category']['name'])
|
||||||
|
description = video.get('description')
|
||||||
|
duration = int_or_none(video.get('duration'))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video.get('id') or video_id if video_id else video['id'],
|
'id': video.get('id') or video_id if video_id else video['id'],
|
||||||
'title': title,
|
'title': title,
|
||||||
'description': video.get('description'),
|
'description': description,
|
||||||
'thumbnail': video.get('thumbnail_url'),
|
'thumbnail': video.get('thumbnail_url'),
|
||||||
'duration': int_or_none(video.get('duration')),
|
'duration': duration,
|
||||||
'uploader': try_get(video, lambda x: x['author']['name']),
|
'uploader': try_get(video, lambda x: x['author']['name']),
|
||||||
'uploader_id': compat_str(uploader_id) if uploader_id else None,
|
'uploader_id': compat_str(uploader_id) if uploader_id else None,
|
||||||
'timestamp': unified_timestamp(video.get('created_ts')),
|
'timestamp': unified_timestamp(video.get('created_ts')),
|
||||||
@ -50,6 +51,7 @@ def _extract_info(video, video_id=None, require_title=True):
|
|||||||
'view_count': int_or_none(video.get('hits')),
|
'view_count': int_or_none(video.get('hits')),
|
||||||
'comment_count': int_or_none(video.get('comments_count')),
|
'comment_count': int_or_none(video.get('comments_count')),
|
||||||
'is_live': bool_or_none(video.get('is_livestream')),
|
'is_live': bool_or_none(video.get('is_livestream')),
|
||||||
|
'chapters': self._extract_chapters_from_description(description, duration),
|
||||||
}
|
}
|
||||||
|
|
||||||
def _download_and_extract_info(self, video_id, query=None):
|
def _download_and_extract_info(self, video_id, query=None):
|
||||||
@ -111,8 +113,9 @@ class RutubeIE(RutubeBaseIE):
|
|||||||
'view_count': int,
|
'view_count': int,
|
||||||
'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
|
'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
|
||||||
'category': ['Новости и СМИ'],
|
'category': ['Новости и СМИ'],
|
||||||
|
'chapters': [],
|
||||||
},
|
},
|
||||||
|
'expected_warnings': ['Unable to download f4m'],
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
|
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
@ -142,7 +145,28 @@ class RutubeIE(RutubeBaseIE):
|
|||||||
'view_count': int,
|
'view_count': int,
|
||||||
'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
|
'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
|
||||||
'category': ['Видеоигры'],
|
'category': ['Видеоигры'],
|
||||||
|
'chapters': [],
|
||||||
},
|
},
|
||||||
|
'expected_warnings': ['Unable to download f4m'],
|
||||||
|
}, {
|
||||||
|
'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'c65b465ad0c98c89f3b25cb03dcc87c6',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'chapters': 'count:4',
|
||||||
|
'category': ['Бизнес и предпринимательство'],
|
||||||
|
'description': 'md5:252feac1305257d8c1bab215cedde75d',
|
||||||
|
'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
|
||||||
|
'duration': 782,
|
||||||
|
'age_limit': 0,
|
||||||
|
'uploader_id': '23491359',
|
||||||
|
'timestamp': 1677153329,
|
||||||
|
'view_count': int,
|
||||||
|
'upload_date': '20230223',
|
||||||
|
'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании',
|
||||||
|
'uploader': 'Стас Быков',
|
||||||
|
},
|
||||||
|
'expected_warnings': ['Unable to download f4m'],
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -3205,11 +3205,11 @@ def _extract_chapters_from_json(self, data, duration):
|
|||||||
'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
|
'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
|
||||||
), expected_type=list)
|
), expected_type=list)
|
||||||
|
|
||||||
return self._extract_chapters(
|
return self._extract_chapters_helper(
|
||||||
chapter_list,
|
chapter_list,
|
||||||
chapter_time=lambda chapter: float_or_none(
|
start_function=lambda chapter: float_or_none(
|
||||||
traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
|
traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
|
||||||
chapter_title=lambda chapter: traverse_obj(
|
title_function=lambda chapter: traverse_obj(
|
||||||
chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
|
chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
|
||||||
duration=duration)
|
duration=duration)
|
||||||
|
|
||||||
@ -3222,42 +3222,10 @@ def _extract_chapters_from_engagement_panel(self, data, duration):
|
|||||||
chapter_title = lambda chapter: self._get_text(chapter, 'title')
|
chapter_title = lambda chapter: self._get_text(chapter, 'title')
|
||||||
|
|
||||||
return next(filter(None, (
|
return next(filter(None, (
|
||||||
self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
|
self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
|
||||||
chapter_time, chapter_title, duration)
|
chapter_time, chapter_title, duration)
|
||||||
for contents in content_list)), [])
|
for contents in content_list)), [])
|
||||||
|
|
||||||
def _extract_chapters_from_description(self, description, duration):
|
|
||||||
duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
|
|
||||||
sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
|
|
||||||
return self._extract_chapters(
|
|
||||||
re.findall(sep_re % (duration_re, r'.+?'), description or ''),
|
|
||||||
chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
|
|
||||||
duration=duration, strict=False) or self._extract_chapters(
|
|
||||||
re.findall(sep_re % (r'.+?', duration_re), description or ''),
|
|
||||||
chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
|
|
||||||
duration=duration, strict=False)
|
|
||||||
|
|
||||||
def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
|
|
||||||
if not duration:
|
|
||||||
return
|
|
||||||
chapter_list = [{
|
|
||||||
'start_time': chapter_time(chapter),
|
|
||||||
'title': chapter_title(chapter),
|
|
||||||
} for chapter in chapter_list or []]
|
|
||||||
if not strict:
|
|
||||||
chapter_list.sort(key=lambda c: c['start_time'] or 0)
|
|
||||||
|
|
||||||
chapters = [{'start_time': 0}]
|
|
||||||
for idx, chapter in enumerate(chapter_list):
|
|
||||||
if chapter['start_time'] is None:
|
|
||||||
self.report_warning(f'Incomplete chapter {idx}')
|
|
||||||
elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
|
|
||||||
chapters.append(chapter)
|
|
||||||
elif chapter not in chapters:
|
|
||||||
self.report_warning(
|
|
||||||
f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
|
|
||||||
return chapters[1:]
|
|
||||||
|
|
||||||
def _extract_comment(self, comment_renderer, parent=None):
|
def _extract_comment(self, comment_renderer, parent=None):
|
||||||
comment_id = comment_renderer.get('commentId')
|
comment_id = comment_renderer.get('commentId')
|
||||||
if not comment_id:
|
if not comment_id:
|
||||||
|
Loading…
Reference in New Issue
Block a user