From a2bac6b7adb7b0e955125838e20bb39eece630ce Mon Sep 17 00:00:00 2001 From: columndeeply <106948293+columndeeply@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:16:07 +0000 Subject: [PATCH] [ie/PrankCastPost] Add extractor (#8933) Authored by: columndeeply --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/prankcast.py | 73 ++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 82d3004ba..4c8604099 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1518,7 +1518,7 @@ PuhuTVSerieIE, ) from .pr0gramm import Pr0grammIE -from .prankcast import PrankCastIE +from .prankcast import PrankCastIE, PrankCastPostIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE from .projectveritas import ProjectVeritasIE diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py index b2ec5bbb8..562aca0ff 100644 --- a/yt_dlp/extractor/prankcast.py +++ b/yt_dlp/extractor/prankcast.py @@ -1,5 +1,8 @@ +import json + from .common import InfoExtractor -from ..utils import parse_iso8601, traverse_obj, try_call +from ..utils import float_or_none, parse_iso8601, str_or_none, try_call +from ..utils.traversal import traverse_obj class PrankCastIE(InfoExtractor): @@ -64,3 +67,71 @@ def _real_extract(self, url): 'categories': [json_info.get('broadcast_category')], 'tags': try_call(lambda: json_info['broadcast_tags'].split(',')) } + + +class PrankCastPostIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/posts/(?P\d+)-(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://prankcast.com/devonanustart/posts/6214-happy-national-rachel-day-', + 'info_dict': { + 'id': '6214', + 'ext': 'mp3', + 'title': 'Happy National Rachel Day!', + 'display_id': 'happy-national-rachel-day-', + 'timestamp': 1704333938, + 'uploader': 'Devonanustart', + 'channel_id': '4', + 'duration': 13175, + 'cast': ['Devonanustart'], + 'description': '', + 'categories': ['prank call'], + 'upload_date': '20240104' + } + }, { + 'url': 'https://prankcast.com/despicabledogs/posts/6217-jake-the-work-crow-', + 'info_dict': { + 'id': '6217', + 'ext': 'mp3', + 'title': 'Jake the Work Crow!', + 'display_id': 'jake-the-work-crow-', + 'timestamp': 1704346592, + 'uploader': 'despicabledogs', + 'channel_id': '957', + 'duration': 263.287, + 'cast': ['despicabledogs'], + 'description': 'https://imgur.com/a/vtxLvKU', + 'categories': [], + 'upload_date': '20240104' + } + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + + webpage = self._download_webpage(url, video_id) + post = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_posts'] + content = self._parse_json(post['post_contents_json'], video_id)[0] + + uploader = post.get('user_name') + guests_json = traverse_obj(content, ('guests_json', {json.loads}, {dict})) or {} + + return { + 'id': video_id, + 'title': post.get('post_title') or self._og_search_title(webpage), + 'display_id': display_id, + 'url': content.get('url'), + 'timestamp': parse_iso8601(content.get('start_date') or content.get('crdate'), ' '), + 'uploader': uploader, + 'channel_id': str_or_none(post.get('user_id')), + 'duration': float_or_none(content.get('duration')), + 'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))), + 'description': post.get('post_body'), + 'categories': list(filter(None, [content.get('category')])), + 'tags': try_call(lambda: list(filter('', post['post_tags'].split(',')))), + 'subtitles': { + 'live_chat': [{ + 'url': f'https://prankcast.com/api/private/chat/select-broadcast?id={post["content_id"]}&cache=', + 'ext': 'json', + }], + } if post.get('content_id') else None + }