utils_podcasts.py
67 lines (49 loc) · 2.41 KB
import json
import os
import pandas as pd
import numpy as np
import string
from urlextract import URLExtract

import utils_general


def get_file(show_filename_prefix, episode_filename_prefix):
    dir1 = show_filename_prefix.split("_")[1][0].upper()
    dir2 = show_filename_prefix.split("_")[1][1].upper()
    filepath = os.path.join(utils_general.PATH_TO_TEXT_DIR, dir1, dir2, show_filename_prefix, episode_filename_prefix + ".txt")
    text = utils_general.read_file(filepath)
    return text, filepath
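
# A minimal usage sketch of get_file. The prefix values below are placeholders,
# not real dataset entries; only the directory scheme (two characters taken from
# the show prefix) follows from the code above.
#
#   text, filepath = get_file("show_7gOz", "episode_abc")
#   # reads PATH_TO_TEXT_DIR/7/G/show_7gOz/episode_abc.txt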


def clean_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    for url in urls:
        text = text.replace(url, "")
    return text
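
# Illustrative example of clean_urls on a made-up string (not dataset content):
#
#   clean_urls("Subscribe at https://example.com/feed for more episodes")
#   # -> "Subscribe at  for more episodes"
#
# Only the URL text itself is removed; the surrounding words and whitespace are
# left in place.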


def get_description_for_episode_id(episode_id, testset_or_trainset):
    # get the description from the appropriate dataframe
    if testset_or_trainset == "testset":
        description_series = TESTSET_METADATA_DF.loc[TESTSET_METADATA_DF["episode_filename_prefix"] == episode_id, "episode_description"]
    else:  # testset_or_trainset == "trainset"
        description_series = TRAINSET_METADATA_DF.loc[TRAINSET_METADATA_DF["episode_filename_prefix"] == episode_id, "episode_description"]
    description = str(description_series.values[0])
    # clean the descriptions the same way as the transcripts
    description = clean_urls(description)
    description = description.encode("ascii", "ignore").decode()
    return description
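
# TESTSET_METADATA_DF and TRAINSET_METADATA_DF are module-level DataFrames that
# are assumed to be defined elsewhere; they are not created in this file. A
# hedged sketch of how they might be loaded, assuming tab-separated metadata
# files that contain at least the "episode_filename_prefix" and
# "episode_description" columns used above (the filenames are placeholders):
#
#   TRAINSET_METADATA_DF = pd.read_csv("metadata-trainset.tsv", sep="\t")
#   TESTSET_METADATA_DF = pd.read_csv("metadata-testset.tsv", sep="\t")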


# modified from https://github.com/potsawee/podcast_trec2020/blob/main/data/processor.py
# also performs some basic cleaning
def get_transcript_text_from_json_asr_file(json_asr_file):
    transcript_list = []
    with open(json_asr_file) as f:
        transcript_dict = json.loads(f.read())
    results_list = [r for r in transcript_dict["results"]]
    last_result = results_list[-1]
    for word_dict in last_result["alternatives"][0]["words"]:
        transcript_list.append(word_dict["word"])
    transcript_string = " ".join(transcript_list)
    # clean the transcripts the same way as the descriptions
    transcript_string = clean_urls(transcript_string)
    transcript_string = transcript_string.encode("ascii", "ignore").decode()
    # guard against an empty transcript before indexing the last character
    if transcript_string and transcript_string[-1] not in string.punctuation:
        transcript_string += "."
    return transcript_string
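

# A small end-to-end sketch of how these helpers are typically combined. The
# show/episode identifiers below are placeholders, not real dataset entries,
# and the block only executes when the module is run directly with the data
# (and the metadata DataFrames) actually in place.
if __name__ == "__main__":
    example_show = "show_7gOz"        # hypothetical show prefix
    example_episode = "episode_abc"   # hypothetical episode prefix

    transcript, path = get_file(example_show, example_episode)
    print(f"Loaded transcript from {path} ({len(transcript)} characters)")

    description = get_description_for_episode_id(example_episode, "trainset")
    print(f"Description: {description}")

    # If only the raw ASR JSON is available instead of a cleaned .txt file
    # (path is a placeholder):
    # transcript = get_transcript_text_from_json_asr_file("path/to/episode_abc.json")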