[extractor/common] remove duplicated formats and subtitles in smil manifests

This commit is contained in:
remitamine 2016-02-09 17:15:41 +01:00
parent 1bedf4de06
commit d413095f7e

View File

@@ -1186,6 +1186,7 @@ class InfoExtractor(object):
http_count = 0 http_count = 0
m3u8_count = 0 m3u8_count = 0
src_urls = []
videos = smil.findall(self._xpath_ns('.//video', namespace)) videos = smil.findall(self._xpath_ns('.//video', namespace))
for video in videos: for video in videos:
src = video.get('src') src = video.get('src')
@@ -1222,6 +1223,9 @@ class InfoExtractor(object):
continue continue
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
if src_url in src_urls:
continue
src_urls.append(src_url)
if proto == 'm3u8' or src_ext == 'm3u8': if proto == 'm3u8' or src_ext == 'm3u8':
m3u8_formats = self._extract_m3u8_formats( m3u8_formats = self._extract_m3u8_formats(
@@ -1267,11 +1271,13 @@ class InfoExtractor(object):
return formats return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
urls = []
subtitles = {} subtitles = {}
for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
src = textstream.get('src') src = textstream.get('src')
if not src: if not src or src in urls:
continue continue
urls.append(src)
ext = textstream.get('ext') or determine_ext(src) ext = textstream.get('ext') or determine_ext(src)
if not ext: if not ext:
type_ = textstream.get('type') type_ = textstream.get('type')