diff --git a/m3u8/parser.py b/m3u8/parser.py index f045bb81..cb16fe3d 100644 --- a/m3u8/parser.py +++ b/m3u8/parser.py @@ -81,193 +81,173 @@ def parse(content, strict=False, custom_tags_parser=None): if len(found_errors) > 0: raise Exception(found_errors) - lineno = 0 - for line in lines: - lineno += 1 + for lineno, line in enumerate(lines, 1): line = line.strip() + parse_kwargs = { + "line": line, + "lineno": lineno, + "data": data, + "state": state, + "strict": strict, + } # Call custom parser if needed if line.startswith("#") and callable(custom_tags_parser): go_to_next_line = custom_tags_parser(line, lineno, data, state) - # Do not try to parse other standard tags on this line if custom_tags_parser function returns 'True' + # Do not try to parse other standard tags on this line if custom_tags_parser + # function returns `True` if go_to_next_line: continue if line.startswith(protocol.ext_x_byterange): - _parse_byterange(line, state) - state["expect_segment"] = True + _parse_byterange(**parse_kwargs) continue - if line.startswith(protocol.ext_x_bitrate): - _parse_bitrate(line, state) + elif line.startswith(protocol.ext_x_bitrate): + _parse_bitrate(**parse_kwargs) elif line.startswith(protocol.ext_x_targetduration): - _parse_simple_parameter(line, data, float) + _parse_targetduration(**parse_kwargs) elif line.startswith(protocol.ext_x_media_sequence): - _parse_simple_parameter(line, data, int) + _parse_media_sequence(**parse_kwargs) elif line.startswith(protocol.ext_x_discontinuity_sequence): - _parse_simple_parameter(line, data, int) + _parse_discontinuity_sequence(**parse_kwargs) elif line.startswith(protocol.ext_x_program_date_time): - _, program_date_time = _parse_simple_parameter_raw_value( - line, cast_date_time - ) - if not data.get("program_date_time"): - data["program_date_time"] = program_date_time - state["current_program_date_time"] = program_date_time - state["program_date_time"] = program_date_time + _parse_program_date_time(**parse_kwargs) elif line.startswith(protocol.ext_x_discontinuity): - state["discontinuity"] = True + _parse_discontinuity(**parse_kwargs) elif line.startswith(protocol.ext_x_cue_out_cont): - _parse_cueout_cont(line, state) - state["cue_out"] = True + _parse_cueout_cont(**parse_kwargs) elif line.startswith(protocol.ext_x_cue_out): - _parse_cueout(line, state) - state["cue_out_start"] = True - state["cue_out"] = True - if "DURATION" in line.upper(): - state["cue_out_explicitly_duration"] = True + _parse_cueout(**parse_kwargs) elif line.startswith(f"{protocol.ext_oatcls_scte35}:"): - _parse_oatcls_scte35(line, state) + _parse_oatcls_scte35(**parse_kwargs) elif line.startswith(f"{protocol.ext_x_asset}:"): - _parse_asset(line, state) + _parse_asset(**parse_kwargs) elif line.startswith(protocol.ext_x_cue_in): - state["cue_in"] = True + _parse_cue_in(**parse_kwargs) elif line.startswith(protocol.ext_x_cue_span): - state["cue_out"] = True + _parse_cue_span(**parse_kwargs) elif line.startswith(protocol.ext_x_version): - _parse_simple_parameter(line, data, int) + _parse_version(**parse_kwargs) elif line.startswith(protocol.ext_x_allow_cache): - _parse_simple_parameter(line, data) + _parse_allow_cache(**parse_kwargs) elif line.startswith(protocol.ext_x_key): - key = _parse_key(line) - state["current_key"] = key - if key not in data["keys"]: - data["keys"].append(key) + _parse_key(**parse_kwargs) elif line.startswith(protocol.extinf): - _parse_extinf(line, data, state, lineno, strict) - state["expect_segment"] = True + _parse_extinf(**parse_kwargs) elif line.startswith(protocol.ext_x_stream_inf): - state["expect_playlist"] = True - _parse_stream_inf(line, data, state) + _parse_stream_inf(**parse_kwargs) elif line.startswith(protocol.ext_x_i_frame_stream_inf): - _parse_i_frame_stream_inf(line, data) + _parse_i_frame_stream_inf(**parse_kwargs) elif line.startswith(protocol.ext_x_media): - _parse_media(line, data, state) + _parse_media(**parse_kwargs) elif line.startswith(protocol.ext_x_playlist_type): - _parse_simple_parameter(line, data) + _parse_playlist_type(**parse_kwargs) elif line.startswith(protocol.ext_i_frames_only): - data["is_i_frames_only"] = True + _parse_i_frames_only(**parse_kwargs) elif line.startswith(protocol.ext_is_independent_segments): - data["is_independent_segments"] = True + _parse_is_independent_segments(**parse_kwargs) elif line.startswith(protocol.ext_x_endlist): - data["is_endlist"] = True + _parse_endlist(**parse_kwargs) elif line.startswith(protocol.ext_x_map): - quoted_parser = remove_quotes_parser("uri", "byterange") - segment_map_info = _parse_attribute_list( - protocol.ext_x_map, line, quoted_parser - ) - state["current_segment_map"] = segment_map_info - data["segment_map"].append(segment_map_info) + _parse_x_map(**parse_kwargs) elif line.startswith(protocol.ext_x_start): - attribute_parser = {"time_offset": lambda x: float(x)} - start_info = _parse_attribute_list( - protocol.ext_x_start, line, attribute_parser - ) - data["start"] = start_info + _parse_start(**parse_kwargs) elif line.startswith(protocol.ext_x_server_control): - _parse_server_control(line, data, state) + _parse_server_control(**parse_kwargs) elif line.startswith(protocol.ext_x_part_inf): - _parse_part_inf(line, data, state) + _parse_part_inf(**parse_kwargs) elif line.startswith(protocol.ext_x_rendition_report): - _parse_rendition_report(line, data, state) + _parse_rendition_report(**parse_kwargs) elif line.startswith(protocol.ext_x_part): - _parse_part(line, data, state) + _parse_part(**parse_kwargs) elif line.startswith(protocol.ext_x_skip): - _parse_skip(line, data, state) + _parse_skip(**parse_kwargs) elif line.startswith(protocol.ext_x_session_data): - _parse_session_data(line, data, state) + _parse_session_data(**parse_kwargs) elif line.startswith(protocol.ext_x_session_key): - _parse_session_key(line, data, state) + _parse_session_key(**parse_kwargs) elif line.startswith(protocol.ext_x_preload_hint): - _parse_preload_hint(line, data, state) + _parse_preload_hint(**parse_kwargs) elif line.startswith(protocol.ext_x_daterange): - _parse_daterange(line, data, state) + _parse_daterange(**parse_kwargs) elif line.startswith(protocol.ext_x_gap): - state["gap"] = True + _parse_gap(**parse_kwargs) elif line.startswith(protocol.ext_x_content_steering): - _parse_content_steering(line, data, state) + _parse_content_steering(**parse_kwargs) elif line.startswith(protocol.ext_x_image_stream_inf): - _parse_image_stream_inf(line, data) + _parse_image_stream_inf(**parse_kwargs) elif line.startswith(protocol.ext_x_images_only): - data["is_images_only"] = True - + _parse_is_images_only(**parse_kwargs) elif line.startswith(protocol.ext_x_tiles): - _parse_tiles(line, data, state) + _parse_tiles(**parse_kwargs) + # #EXTM3U should be present. elif line.startswith(protocol.ext_m3u): - # We don't parse #EXTM3U, it just should to be present pass + # Blank lines are ignored. elif line.strip() == "": - # blank lines are legal pass + # Lines that don't start with # are either segments or playlists. elif (not line.startswith("#")) and (state["expect_segment"]): - _parse_ts_chunk(line, data, state) - state["expect_segment"] = False + _parse_ts_chunk(**parse_kwargs) elif (not line.startswith("#")) and (state["expect_playlist"]): - _parse_variant_playlist(line, data, state) - state["expect_playlist"] = False + _parse_variant_playlist(**parse_kwargs) + # Lines that haven't been recognized by any of the parsers above are illegal + # in strict mode. elif strict: raise ParseError(lineno, line) - # there could be remaining partial segments + # Handle remaining partial segments. if "segment" in state: data["segments"].append(state.pop("segment")) return data -def _parse_key(line): +def _parse_key(line, data, state, **kwargs): params = ATTRIBUTELISTPATTERN.split(line.replace(protocol.ext_x_key + ":", ""))[ 1::2 ] @@ -275,10 +255,13 @@ def _parse_key(line): for param in params: name, value = param.split("=", 1) key[normalize_attribute(name)] = remove_quotes(value) - return key + + state["current_key"] = key + if key not in data["keys"]: + data["keys"].append(key) -def _parse_extinf(line, data, state, lineno, strict): +def _parse_extinf(line, state, lineno, strict, **kwargs): chunks = line.replace(protocol.extinf + ":", "").split(",", 1) if len(chunks) == 2: duration, title = chunks @@ -292,9 +275,10 @@ def _parse_extinf(line, data, state, lineno, strict): state["segment"] = {} state["segment"]["duration"] = float(duration) state["segment"]["title"] = title + state["expect_segment"] = True -def _parse_ts_chunk(line, data, state): +def _parse_ts_chunk(line, data, state, **kwargs): segment = state.pop("segment") if state.get("program_date_time"): segment["program_date_time"] = state.pop("program_date_time") @@ -328,6 +312,7 @@ def _parse_ts_chunk(line, data, state): segment["dateranges"] = state.pop("dateranges", None) segment["gap_tag"] = state.pop("gap", None) data["segments"].append(segment) + state["expect_segment"] = False def _parse_attribute_list(prefix, line, attribute_parser, default_parser=None): @@ -348,7 +333,8 @@ def _parse_attribute_list(prefix, line, attribute_parser, default_parser=None): return attributes -def _parse_stream_inf(line, data, state): +def _parse_stream_inf(line, data, state, **kwargs): + state["expect_playlist"] = True data["is_variant"] = True data["media_sequence"] = None attribute_parser = remove_quotes_parser( @@ -371,7 +357,7 @@ def _parse_stream_inf(line, data, state): ) -def _parse_i_frame_stream_inf(line, data): +def _parse_i_frame_stream_inf(line, data, **kwargs): attribute_parser = remove_quotes_parser( "codecs", "uri", "pathway_id", "stable_variant_id" ) @@ -391,7 +377,7 @@ def _parse_i_frame_stream_inf(line, data): data["iframe_playlists"].append(iframe_playlist) -def _parse_image_stream_inf(line, data): +def _parse_image_stream_inf(line, data, **kwargs): attribute_parser = remove_quotes_parser( "codecs", "uri", "pathway_id", "stable_variant_id" ) @@ -410,7 +396,11 @@ def _parse_image_stream_inf(line, data): data["image_playlists"].append(image_playlist) -def _parse_tiles(line, data, state): +def _parse_is_images_only(line, data, **kwargs): + data["is_images_only"] = True + + +def _parse_tiles(line, data, state, **kwargs): attribute_parser = remove_quotes_parser("uri") attribute_parser["resolution"] = str attribute_parser["layout"] = str @@ -419,7 +409,7 @@ def _parse_tiles(line, data, state): data["tiles"].append(tiles_info) -def _parse_media(line, data, state): +def _parse_media(line, data, **kwargs): quoted = remove_quotes_parser( "uri", "group_id", @@ -437,25 +427,89 @@ def _parse_media(line, data, state): data["media"].append(media) -def _parse_variant_playlist(line, data, state): +def _parse_variant_playlist(line, data, state, **kwargs): playlist = {"uri": line, "stream_info": state.pop("stream_info")} - data["playlists"].append(playlist) + state["expect_playlist"] = False -def _parse_bitrate(line, state): +def _parse_bitrate(line, state, **kwargs): if "segment" not in state: state["segment"] = {} state["segment"]["bitrate"] = line.replace(protocol.ext_x_bitrate + ":", "") -def _parse_byterange(line, state): +def _parse_byterange(line, state, **kwargs): if "segment" not in state: state["segment"] = {} state["segment"]["byterange"] = line.replace(protocol.ext_x_byterange + ":", "") + state["expect_segment"] = True + + +def _parse_targetduration(**parse_kwargs): + return _parse_simple_parameter(cast_to=float, **parse_kwargs) + + +def _parse_media_sequence(**parse_kwargs): + return _parse_simple_parameter(cast_to=int, **parse_kwargs) + + +def _parse_discontinuity_sequence(**parse_kwargs): + return _parse_simple_parameter(cast_to=int, **parse_kwargs) + + +def _parse_program_date_time(line, state, data, **parse_kwargs): + _, program_date_time = _parse_simple_parameter_raw_value( + line, cast_to=cast_date_time, **parse_kwargs + ) + if not data.get("program_date_time"): + data["program_date_time"] = program_date_time + state["current_program_date_time"] = program_date_time + state["program_date_time"] = program_date_time + + +def _parse_discontinuity(state, **parse_kwargs): + state["discontinuity"] = True + + +def _parse_cue_in(state, **parse_kwargs): + state["cue_in"] = True + + +def _parse_cue_span(state, **parse_kwargs): + state["cue_out"] = True -def _parse_simple_parameter_raw_value(line, cast_to=str, normalize=False): +def _parse_version(**parse_kwargs): + return _parse_simple_parameter(cast_to=int, **parse_kwargs) + + +def _parse_allow_cache(**parse_kwargs): + return _parse_simple_parameter(cast_to=str, **parse_kwargs) + + +def _parse_playlist_type(line, data, **kwargs): + return _parse_simple_parameter(line, data) + + +def _parse_x_map(line, data, state, **kwargs): + quoted_parser = remove_quotes_parser("uri", "byterange") + segment_map_info = _parse_attribute_list(protocol.ext_x_map, line, quoted_parser) + state["current_segment_map"] = segment_map_info + data["segment_map"].append(segment_map_info) + + +def _parse_start(line, data, **kwargs): + attribute_parser = {"time_offset": lambda x: float(x)} + start_info = _parse_attribute_list(protocol.ext_x_start, line, attribute_parser) + data["start"] = start_info + + +def _parse_gap(state, **kwargs): + state["gap"] = True + + +def _parse_simple_parameter_raw_value(line, cast_to=str, normalize=False, **kwargs): param, value = line.split(":", 1) param = normalize_attribute(param.replace("#EXT-X-", "")) if normalize: @@ -463,17 +517,33 @@ def _parse_simple_parameter_raw_value(line, cast_to=str, normalize=False): return param, cast_to(value) -def _parse_and_set_simple_parameter_raw_value(line, data, cast_to=str, normalize=False): +def _parse_and_set_simple_parameter_raw_value( + line, data, cast_to=str, normalize=False, **kwargs +): param, value = _parse_simple_parameter_raw_value(line, cast_to, normalize) data[param] = value return data[param] -def _parse_simple_parameter(line, data, cast_to=str): +def _parse_simple_parameter(line, data, cast_to=str, **kwargs): return _parse_and_set_simple_parameter_raw_value(line, data, cast_to, True) -def _parse_cueout_cont(line, state): +def _parse_i_frames_only(data, **kwargs): + data["is_i_frames_only"] = True + + +def _parse_is_independent_segments(data, **kwargs): + data["is_independent_segments"] = True + + +def _parse_endlist(data, **kwargs): + data["is_endlist"] = True + + +def _parse_cueout_cont(line, state, **kwargs): + state["cue_out"] = True + elements = line.split(":", 1) if len(elements) != 2: return @@ -539,7 +609,7 @@ def _cueout_simple(line): return (None, res.group(1)) -def _parse_cueout(line, state): +def _parse_cueout(line, state, **kwargs): _cueout_state = ( _cueout_no_duration(line) or _cueout_envivio(line, state) @@ -552,8 +622,13 @@ def _parse_cueout(line, state): state["current_cue_out_scte35"] = cue_out_scte35 or current_cue_out_scte35 state["current_cue_out_duration"] = cue_out_duration + state["cue_out_start"] = True + state["cue_out"] = True + if "DURATION" in line.upper(): + state["cue_out_explicitly_duration"] = True + -def _parse_server_control(line, data, state): +def _parse_server_control(line, data, **kwargs): attribute_parser = { "can_block_reload": str, "hold_back": lambda x: float(x), @@ -567,7 +642,7 @@ def _parse_server_control(line, data, state): ) -def _parse_part_inf(line, data, state): +def _parse_part_inf(line, data, **kwargs): attribute_parser = {"part_target": lambda x: float(x)} data["part_inf"] = _parse_attribute_list( @@ -575,7 +650,7 @@ def _parse_part_inf(line, data, state): ) -def _parse_rendition_report(line, data, state): +def _parse_rendition_report(line, data, **kwargs): attribute_parser = remove_quotes_parser("uri") attribute_parser["last_msn"] = int attribute_parser["last_part"] = int @@ -587,7 +662,7 @@ def _parse_rendition_report(line, data, state): data["rendition_reports"].append(rendition_report) -def _parse_part(line, data, state): +def _parse_part(line, state, **kwargs): attribute_parser = remove_quotes_parser("uri") attribute_parser["duration"] = lambda x: float(x) attribute_parser["independent"] = str @@ -613,20 +688,20 @@ def _parse_part(line, data, state): segment["parts"].append(part) -def _parse_skip(line, data, state): +def _parse_skip(line, data, **parse_kwargs): attribute_parser = remove_quotes_parser("recently_removed_dateranges") attribute_parser["skipped_segments"] = int data["skip"] = _parse_attribute_list(protocol.ext_x_skip, line, attribute_parser) -def _parse_session_data(line, data, state): +def _parse_session_data(line, data, **kwargs): quoted = remove_quotes_parser("data_id", "value", "uri", "language") session_data = _parse_attribute_list(protocol.ext_x_session_data, line, quoted) data["session_data"].append(session_data) -def _parse_session_key(line, data, state): +def _parse_session_key(line, data, **kwargs): params = ATTRIBUTELISTPATTERN.split( line.replace(protocol.ext_x_session_key + ":", "") )[1::2] @@ -637,7 +712,7 @@ def _parse_session_key(line, data, state): data["session_keys"].append(key) -def _parse_preload_hint(line, data, state): +def _parse_preload_hint(line, data, **kwargs): attribute_parser = remove_quotes_parser("uri") attribute_parser["type"] = str attribute_parser["byterange_start"] = int @@ -648,7 +723,7 @@ def _parse_preload_hint(line, data, state): ) -def _parse_daterange(line, date, state): +def _parse_daterange(line, state, **kwargs): attribute_parser = remove_quotes_parser("id", "class", "start_date", "end_date") attribute_parser["duration"] = float attribute_parser["planned_duration"] = float @@ -665,7 +740,7 @@ def _parse_daterange(line, date, state): state["dateranges"].append(parsed) -def _parse_content_steering(line, data, state): +def _parse_content_steering(line, data, **kwargs): attribute_parser = remove_quotes_parser("server_uri", "pathway_id") data["content_steering"] = _parse_attribute_list( @@ -673,13 +748,13 @@ def _parse_content_steering(line, data, state): ) -def _parse_oatcls_scte35(line, state): +def _parse_oatcls_scte35(line, state, **kwargs): scte35_cue = line.split(":", 1)[1] state["current_cue_out_oatcls_scte35"] = scte35_cue state["current_cue_out_scte35"] = scte35_cue -def _parse_asset(line, state): +def _parse_asset(line, state, **kwargs): # EXT-X-ASSET attribute values may or may not be quoted, and need to be URL-encoded. # They are preserved as-is here to prevent loss of information. state["asset_metadata"] = _parse_attribute_list(