From 4457823dda410c5406f5ab5474b9b1f9325fa7ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Apr 2017 03:56:49 +0700
Subject: [PATCH] [extractor/common] Move censorship checks to a separate method and add check for just another ISP

---
 youtube_dl/extractor/common.py | 48 ++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 6c3c095f7..cdfa7000b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -547,6 +547,34 @@ class InfoExtractor(object):
 
         return encoding
 
+    def __check_blocked(self, content):
+        first_block = content[:512]
+        if ('<title>Access to this site is blocked</title>' in content and
+                'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += ' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+        if '<title>The URL you requested has been blocked</title>' in first_block:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+                'blocklist.rkn.gov.ru' in content):
+            raise ExtractorError(
+                'Access to this webpage has been blocked by decision of the Russian government. '
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
@@ -588,25 +616,7 @@ class InfoExtractor(object):
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')
 
-        if ('<title>Access to this site is blocked</title>' in content and
-                'Websense' in content[:512]):
-            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
-            blocked_iframe = self._html_search_regex(
-                r'<iframe src="([^"]+)"', content,
-                'Websense information URL', default=None)
-            if blocked_iframe:
-                msg += ' Visit %s for more details' % blocked_iframe
-            raise ExtractorError(msg, expected=True)
-        if '<title>The URL you requested has been blocked</title>' in content[:512]:
-            msg = (
-                'Access to this webpage has been blocked by Indian censorship. '
-                'Use a VPN or proxy server (with --proxy) to route around it.')
-            block_msg = self._html_search_regex(
-                r'</h1><p>(.*?)</p>',
-                content, 'block message', default=None)
-            if block_msg:
-                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
-            raise ExtractorError(msg, expected=True)
+        self.__check_blocked(content)
 
         return content
 