[extractor/cspan] Support of C-Span congress videos (#2295)

Authored by: Grabien
This commit is contained in:
Grabien 2022-02-16 21:21:05 +02:00 committed by GitHub
parent 85a0ad0117
commit edecb5f81f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 49 additions and 3 deletions

View File

@ -3,6 +3,7 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTMLParseError
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
@ -11,9 +12,11 @@
get_element_by_attribute, get_element_by_attribute,
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
join_nonempty,
js_to_json, js_to_json,
merge_dicts, merge_dicts,
parse_iso8601, parse_iso8601,
parse_qs,
smuggle_url, smuggle_url,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
@ -126,8 +129,12 @@ def add_referer(formats):
ext = 'vtt' ext = 'vtt'
subtitle['ext'] = ext subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={}) ld_info = self._search_json_ld(webpage, video_id, default={})
title = get_element_by_class('video-page-title', webpage) or \ try:
self._og_search_title(webpage) title = get_element_by_class('video-page-title', webpage)
except compat_HTMLParseError:
title = None
if title is None:
title = self._og_search_title(webpage)
description = get_element_by_attribute('itemprop', 'description', webpage) or \ description = get_element_by_attribute('itemprop', 'description', webpage) or \
self._html_search_meta(['og:description', 'description'], webpage) self._html_search_meta(['og:description', 'description'], webpage)
return merge_dicts(info, ld_info, { return merge_dicts(info, ld_info, {
@ -242,3 +249,42 @@ def get_text_attr(d, attr):
'title': title, 'title': title,
'id': 'c' + video_id if video_type == 'clip' else video_id, 'id': 'c' + video_id if video_type == 'clip' else video_id,
} }
class CSpanCongressIE(InfoExtractor):
    """Extractor for C-SPAN Congress pages (``c-span.org/congress/``).

    The video ID is built from the ``chamber`` query parameter (``senate``
    when absent) and the ``date`` query parameter, joined by ``_``.  When the
    URL carries no date, the date embedded in the page's
    ``jwsetup.clipprogdate`` assignment is appended instead, if present.
    Formats come from the ``jwsetup`` player configuration found in the page.
    """

    _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'
    _TESTS = [{
        'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',
        'info_dict': {
            'id': 'house_2017-12-13',
            'title': 'Congressional Chronicle - Members of Congress, Hearings and More',
            'description': 'md5:54c264b7a8f219937987610243305a84',
            'thumbnail': r're:https://ximage.c-spanvideo.org/.+',
            'ext': 'mp4'
        }
    }]

    def _real_extract(self, url):
        """Download the page at *url* and return its info dict.

        The result merges the parsed JW Player data with a cleaned-up title,
        a description (may be ``None``) and a fixed ``Referer`` header.
        """
        query = parse_qs(url)
        video_date = query.get('date', [None])[0]
        # join_nonempty drops the date part when the URL did not provide one.
        video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')
        webpage = self._download_webpage(url, video_id)
        if not video_date:
            # Fall back to the programme date embedded in the player setup.
            # The dot is escaped so it only matches the literal attribute
            # access, and whitespace around "=" is tolerated to stay in line
            # with the player-config pattern below.
            jwp_date = re.search(
                r'jwsetup\.clipprogdate\s*=\s*\'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)
            if jwp_date:
                video_id = f'{video_id}_{jwp_date.group("date")}'
        jwplayer_data = self._parse_json(
            self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
            video_id, transform_source=js_to_json)

        # Prefer the Open Graph title; otherwise use the raw <title> element.
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title'))
        description = (self._og_search_description(webpage, default=None)
                       or self._html_search_meta('description', webpage, 'description', default=None))

        return {
            # NOTE(review): the third positional argument presumably disables
            # the "title required" check of _parse_jwplayer_data, since the
            # title is supplied below - confirm against InfoExtractor.
            **self._parse_jwplayer_data(jwplayer_data, video_id, False),
            # Keep only the text before the first "|" and collapse whitespace runs.
            'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),
            'description': description,
            'http_headers': {'Referer': 'https://www.c-span.org/'},
        }

View File

@ -316,7 +316,7 @@
CrunchyrollBetaIE, CrunchyrollBetaIE,
CrunchyrollBetaShowIE, CrunchyrollBetaShowIE,
) )
from .cspan import CSpanIE from .cspan import CSpanIE, CSpanCongressIE
from .ctsnews import CtsNewsIE from .ctsnews import CtsNewsIE
from .ctv import CTVIE from .ctv import CTVIE
from .ctvnews import CTVNewsIE from .ctvnews import CTVNewsIE