-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_splitter_manifest.py
64 lines (51 loc) · 2.06 KB
/
generate_splitter_manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import csv, os
import string
from collections import defaultdict
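"""
Layout of the word_data mapping built by this script:

personname: {
    videoid: {
        word: [time_start, length],
        word: [time_start, length],
        ...
    }
}

The isFirstWord column of the input CSV only decides which occurrence of a
word is kept (a first-word occurrence replaces any earlier record); it is not
stored in the mapping.
"""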
# Change these as needed
people = ['TED']

# The csv module needs binary-mode files on Python 2 but text-mode files opened
# with newline='' on Python 3. `str is bytes` is true only on Python 2.
_PY2 = str is bytes
_CSV_READ_MODE = 'rb' if _PY2 else 'r'
_CSV_WRITE_MODE = 'wb' if _PY2 else 'w'
_CSV_OPEN_KWARGS = {} if _PY2 else {'newline': ''}


def _strip_punctuation(text):
    """Return text with every character listed in string.punctuation removed.

    Stands in for the Python 2-only call
    ``text.translate(None, string.punctuation)``, which raises TypeError on
    Python 3.
    """
    return ''.join(ch for ch in text if ch not in string.punctuation)


def load_word_data(person_names, base_dir=os.curdir):
    """Read every ``<person>/word_timing/*.csv`` file into a nested mapping.

    Args:
        person_names: iterable of directory names (one per person) located
            under base_dir.
        base_dir: directory that contains the person directories; defaults
            to the current working directory.

    Returns:
        A mapping ``{person: {videoid: {word: [row[1], row[2]]}}}`` where
        videoid is the CSV file name without its ``.csv`` extension and word
        is the first CSV column, lower-cased with punctuation removed.
        (Assumes columns 1 and 2 are start time and length, as the layout
        note in this file says -- TODO confirm against the CSV producer.)

    Raises:
        OSError: if a person's ``word_timing`` directory cannot be listed.
        IndexError: if a non-blank CSV row has fewer than four columns.
    """
    word_data = defaultdict(lambda: defaultdict(dict))
    for person in person_names:
        word_timing_directory = os.path.join(base_dir, person, 'word_timing')
        for filename in os.listdir(word_timing_directory):
            # Require a real ".csv" extension; a substring test would also
            # pick up files such as "video.csv.bak".
            videoid, extension = os.path.splitext(filename)
            if extension != '.csv':
                continue
            filepath = os.path.join(word_timing_directory, filename)
            if not os.path.isfile(filepath):
                continue
            with open(filepath, _CSV_READ_MODE, **_CSV_OPEN_KWARGS) as timing_file:
                for row in csv.reader(timing_file, delimiter=','):
                    if not row:
                        # csv.reader yields [] for blank lines.
                        continue
                    word = _strip_punctuation(row[0]).lower()
                    if not word:
                        # Punctuation-only token: it would otherwise end up
                        # as a clip literally named ".mp4".
                        continue
                    timing = [row[1], row[2]]
                    is_first_word = row[3] == 'True'
                    video_words = word_data[person][videoid]
                    # Keep a record when (1) this occurrence is flagged as a
                    # first word -- it replaces whatever was stored, including
                    # an earlier first-word record -- or (2) nothing has been
                    # stored for this word yet.
                    # NOTE(review): the original comment suggested (1) should
                    # only replace non-first-word records; behaviour is kept
                    # as the code had it -- confirm which is intended.
                    if is_first_word or word not in video_words:
                        video_words[word] = timing
    return word_data


def write_manifests(word_data, base_dir=os.curdir):
    """Write one ``<person>/splitter_manifest/<videoid>.csv`` per video.

    Each output row is ``[timing[0], timing[1], word + '.mp4']``.

    Args:
        word_data: mapping shaped like the return value of load_word_data.
        base_dir: directory that contains the person directories; defaults
            to the current working directory.
    """
    for personname in word_data:
        manifest_directory = os.path.join(base_dir, personname, 'splitter_manifest')
        # Checked once per person rather than once per video.
        if not os.path.isdir(manifest_directory):
            os.makedirs(manifest_directory)
        for videoid in word_data[personname]:
            rows = [
                timing[0:2] + [word + '.mp4']
                for word, timing in word_data[personname][videoid].items()
            ]
            # NOTE(review): this orders rows by the second timing column and
            # compares the values as strings (so '10.0' sorts before '9.0').
            # Kept exactly as the original did it -- confirm whether a
            # numeric sort, or a sort on the first column, was intended.
            rows.sort(key=lambda manifest_row: manifest_row[1])
            output_filepath = os.path.join(manifest_directory, videoid + '.csv')
            with open(output_filepath, _CSV_WRITE_MODE, **_CSV_OPEN_KWARGS) as csvfile:
                csv.writer(csvfile, delimiter=',').writerows(rows)


def main():
    """Generate splitter manifests for every person listed in ``people``."""
    write_manifests(load_word_data(people))


if __name__ == '__main__':
    main()