Skip to content

Commit

Permalink
Merge pull request microsoft#588 from bmerkle/fix#587
Browse files Browse the repository at this point in the history
Fix #587: list index out of range in transcript_enrich_bucket.py
  • Loading branch information
koreyspace authored Sep 17, 2024
2 parents 2cffd5b + 7a1030c commit 3290089
Showing 1 changed file with 14 additions and 11 deletions.
25 changes: 14 additions & 11 deletions 08-building-search-applications/scripts/transcript_enrich_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,20 +172,23 @@ def parse_json_vtt_transcript(vtt, metadata):

# Append the last text segment to the last segment in the segments list
if seg_begin_seconds and text != "":
previous_segment_tokens = len(tokenizer.encode(segments[-1]["text"]))
current_segment_tokens = len(tokenizer.encode(text))

if previous_segment_tokens + current_segment_tokens < MAX_TOKENS:
segments[-1]["text"] += text
if segments:
previous_segment_tokens = len(tokenizer.encode(segments[-1]["text"]))
current_segment_tokens = len(tokenizer.encode(text))

if previous_segment_tokens + current_segment_tokens < MAX_TOKENS:
segments[-1]["text"] += text
else:
if not first_segment:
# append PERCENTAGE_OVERLAP text to the previous segment
# to smooth context transition
append_text_to_previous_segment(text)
first_segment = False
add_new_segment(metadata, text, seg_begin_seconds)
else:
if not first_segment:
# append PERCENTAGE_OVERLAP text to the previous segment
# to smooth context transition
append_text_to_previous_segment(text)
first_segment = False
# If segments list is empty, add the text as a new segment
add_new_segment(metadata, text, seg_begin_seconds)


def get_transcript(metadata):
"""get the transcript from the .vtt file"""
global total_files
Expand Down

0 comments on commit 3290089

Please sign in to comment.