ArteTvIE: extract the video with the correct language

Some URLs from the French version of the page could download the German version of the video. Also, instead of extracting the JSON URL from the webpage, build it directly from the video id, which avoids downloading the webpage.
2024-11-27 02:42:30 +00:00 · 2013-07-02 17:34:40 +02:00 · 2013-07-02 17:34:40 +02:00 · 9826925a20
commit 9826925a20
parent 24a267b562
1 changed files with 15 additions and 5 deletions
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@ -16,7 +16,7 @@ class ArteTvIE(InfoExtractor):
    www.arte.tv/guide, the extraction process is different for each one.
    The videos expire in 7 days, so we can't add tests.
    """
-    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
    _LIVE_URL = r'index-[0-9]+\.html$'
@ -57,10 +57,11 @@ def _real_extract(self, url):
        mobj = re.match(self._EMISSION_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            # This is not a real id, it can be for example AJT for the news
            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
            video_id = mobj.group('id')
-            return self._extract_emission(url, video_id)
+            return self._extract_emission(url, video_id, lang)
        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
@ -72,10 +73,9 @@ def _real_extract(self, url):
            # self.extractLiveStream(url)
            # return
-    def _extract_emission(self, url, video_id):
+    def _extract_emission(self, url, video_id, lang):
        """Extract from www.arte.tv/guide"""
-        webpage = self._download_webpage(url, video_id)
+        json_url = 'http://org-www.arte.tv/papi/tvguide/videos/stream/player/F/%s_PLUS7-F/ALL/ALL.json' % video_id
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
@ -91,6 +91,16 @@ def _extract_emission(self, url, video_id):
                     }
        formats = player_info['VSR'].values()
        def _match_lang(f):
            """Tell whether the format f is in the language of the url.

            The language letter is 'F' when lang is 'fr' and 'A' when lang
            is 'de'. A format is accepted when its 'versionCode' starts with
            one of the patterns built from that letter.
            """
            if lang == 'fr':
                letter = 'F'
            elif lang == 'de':
                letter = 'A'
            # re.match only anchors at the beginning of the version code,
            # so anything following the matched prefix is accepted as well
            patterns = (r'VO?%s' % letter, r'V%s-ST.' % letter)
            for pattern in patterns:
                if re.match(pattern, f['versionCode']):
                    return True
            return False
        # Some formats may not be in the same language as the url
        formats = filter(_match_lang, formats)
        # We order the formats by quality
        formats = sorted(formats, key=lambda f: int(f['height']))
        # Pick the best quality