From 7577d849a62ecdcc52ede6dcf73edf2a717fc646 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 19 Jun 2016 02:25:34 +0700
Subject: [PATCH] [r7] Fix extraction and add support for articles (Closes #9826)

---
 youtube_dl/extractor/extractors.py |  5 +-
 youtube_dl/extractor/r7.py         | 95 +++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 36 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 2ff867651..b1b04f2fc 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -631,7 +631,10 @@ from .qqmusic import (
     QQMusicToplistIE,
     QQMusicPlaylistIE,
 )
-from .r7 import R7IE
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
 from .radiocanada import (
     RadioCanadaIE,
     RadioCanadaAudioVideoIE,
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index 976c8feec..069dbfaed 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -2,22 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    unescapeHTML,
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class R7IE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
+                        https?://
                         (?:
                             (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
                             noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
                             player\.r7\.com/video/i/
                         )
                         (?P<id>[\da-f]{24})
-                        '''
+                    '''
     _TESTS = [{
         'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
         'md5': '403c4e393617e8e8ddc748978ee8efde',
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
             'id': '54e7050b0cf2ff57e0279389',
             'ext': 'mp4',
             'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 98,
             'like_count': int,
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://player.r7.com/video/i/%s' % video_id, video_id)
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
 
-        item = self._parse_json(js_to_json(self._search_regex(
-            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
-        title = unescapeHTML(item['title'])
-        thumbnail = item.get('init', {}).get('thumbUri')
-        duration = None
-
-        statistics = item.get('statistics', {})
-        like_count = int_or_none(statistics.get('likes'))
-        view_count = int_or_none(statistics.get('views'))
+        title = video['title']
 
         formats = []
-        for format_key, format_dict in item['playlist'][0].items():
-            src = format_dict.get('src')
-            if not src:
-                continue
-            format_id = format_dict.get('format') or format_key
-            if duration is None:
-                duration = format_dict.get('duration')
-            if '.f4m' in src:
-                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
-            elif src.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
-            else:
-                formats.append({
-                    'url': src,
-                    'format_id': format_id,
-                })
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # m3u8 format always matches the http format, let's copy metadata from
+            # one to another
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                formats))
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
         self._sort_formats(formats)
 
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
         return {
             'id': video_id,
             'title': title,
+            'description': description,
             'thumbnail': thumbnail,
             'duration': duration,
             'like_count': like_count,
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class R7ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())