From d0fb4bd16f191445ab577ae23be57fc55242a108 Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Sun, 13 Jun 2021 21:36:47 +0530
Subject: [PATCH] [pornhub] Extract `cast`

Closes #406, https://github.com/ytdl-org/youtube-dl/pull/27384
---
 yt_dlp/extractor/common.py  | 1 +
 yt_dlp/extractor/pornhub.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 1524fcb15..b14cf0fc9 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -290,6 +290,7 @@ class InfoExtractor(object):
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    cast:           A list of the video cast
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
     was_live:       True, False, or None (=unknown). Whether this video was
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index 031454600..cf407a813 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -14,6 +14,7 @@
 )
 from .openload import PhantomJSwrapper
 from ..utils import (
+    clean_html,
     determine_ext,
     ExtractorError,
     int_or_none,
@@ -145,6 +146,7 @@ class PornHubIE(PornHubBaseIE):
             'age_limit': 18,
             'tags': list,
             'categories': list,
+            'cast': list,
         },
     }, {
         # non-ASCII title
@@ -464,7 +466,7 @@ def extract_list(meta_key):
                 r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
                 % meta_key, webpage, meta_key, default=None)
             if div:
-                return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
 
         info = self._search_json_ld(webpage, video_id, default={})
         # description provided in JSON-LD is irrelevant
@@ -485,6 +487,7 @@ def extract_list(meta_key):
             'age_limit': 18,
             'tags': extract_list('tags'),
             'categories': extract_list('categories'),
+            'cast': extract_list('pornstars'),
             'subtitles': subtitles,
         }, info)