-
Notifications
You must be signed in to change notification settings - Fork 0
/
text2chapters.py
executable file
·145 lines (125 loc) · 5.51 KB
/
text2chapters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
""" usage: doit [--offset=OFFSET] [(--override CHAPTER CHUNKID)...] [CHAPTERDATA]
with --override you can force a certain chapter to be on a particular chunkid,
e.g. for fixup, chunkid is an integer and coverts to the respective chunkname
--offset lets you configure the offset for the begin (e.g. when adding the intro)
"""
import time
import urllib
from pydub import AudioSegment, silence
import speech_recognition as sr
from pathlib import PurePath
import os
from docopt import docopt
import json
from collections import OrderedDict
from dataclasses import dataclass
from typing import Union,Any
from datetime import timedelta
import logging
# Root logger is configured at INFO, so the log.info()/log.warning() progress
# messages of this script are shown while log.debug() output stays hidden.
logging.basicConfig(level=logging.INFO)
# Module-wide logger used by all classes and functions in this script.
log = logging.getLogger("text2chapters")
@dataclass
class Chapter:
    """Search state for a single chapter of the show."""

    # Chapter title; ChapterMarks also uses it as the key of its chapter table.
    name: str
    # Chunk names in which one of `options` was found, in discovery order.
    seen: list
    # Substrings that are searched for in the lower-cased recognizer text.
    options: list
    # Stored flag; nothing in this file reads it yet.
    mandatory: bool
    # Chunk name finally chosen for this chapter, None while undecided.
    confirmed: Union[str, None]
    # True for the chapter whose start defines time zero of the chapter marks.
    is_begin: bool
    # Start of the chapter relative to the begin chapter, None until finalized.
    time: Union[int, None]
class ChapterMarks:
    """Ordered table of chapters plus the logic to locate them in transcribed chunks.

    Typical flow: register chapters with add(), feed every chunk through
    find_text(), optionally pin chapters with override_chapters(), resolve
    the final positions with finalize_chapters() and print them with render().
    """

    # begin time (Halli Hallo); stays 0 when no begin chapter could be confirmed
    begin: int
    # 'begin_speech' of the most recently confirmed chapter, None before the first one
    current: Union[int, None]
    # chapters in the order they are expected to occur in the show
    chapters: OrderedDict[str, Chapter]

    def __init__(self):
        self.chapters = OrderedDict()
        # added to every rendered timestamp (interpreted as seconds by render())
        self.offset = 0
        # Initialise the running state here so that finalize_chapters() never
        # hits an unset attribute when the begin chapter is missing or is not
        # the first chapter in the table.
        self.begin = 0
        self.current = None

    def add(self, name: str, options: list, mandatory: bool = True, is_begin: bool = False):
        """Register a chapter.

        name: chapter title, also used as key in the chapter table.
        options: substrings that identify the chapter in lower-cased text.
        mandatory: stored on the chapter as-is.
        is_begin: marks the chapter whose start becomes time zero.
        """
        self.chapters[name] = Chapter(
            name=name,
            seen=[],
            options=options,
            mandatory=mandatory,
            confirmed=None,
            is_begin=is_begin,
            time=None,
        )

    def find_text(self, chunk: str, chunkdata: dict[str, Any]):
        """Record `chunk` as a candidate for every chapter whose options occur in it.

        chunkdata['text'] is iterated as a mapping of recognizer name to the
        recognized text; the comparison is done on the lower-cased text.
        """
        for chap, v in self.chapters.items():
            for o in v.options:
                for recognizer, text in chunkdata['text'].items():
                    if o not in text.lower():
                        continue
                    log.info(f'Found possible match for chapter {chap} in chunk {chunk} as it matched text "{o}" in {recognizer}')
                    # a chunk is listed once even if several options/recognizers hit
                    if chunk not in v.seen:
                        v.seen.append(chunk)

    def override_chapters(self, override: dict[str, Any]):
        """Force chapters onto the given chunk names.

        override maps chapter name to chunk name.
        Raises KeyError when a chapter name is not registered.
        """
        for k, v in override.items():
            if k not in self.chapters:
                raise KeyError(f"cannot override unknown chapter {k!r}, known chapters: {list(self.chapters)}")
            self.chapters[k].confirmed = v

    def finalize_chapters(self, data: dict[str, Any]):
        """Pick one chunk per chapter and compute the chapter start times.

        data maps chunk name to a dict that provides 'begin_speech'.
        Chapters are processed in registration order; a non-begin chapter is
        only accepted at a chunk that starts after the previously confirmed
        chapter. Chapters without an acceptable chunk keep time=None.
        """
        for k, chap in self.chapters.items():
            # step 1: confirm the active chapter
            if chap.confirmed:
                log.info(f"Chapter {k} already confirmed at {chap.confirmed}")
            elif chap.seen:
                log.debug(f"Chapter {k} seen at the following locations: {chap.seen}")
                if chap.is_begin:
                    chap.confirmed = chap.seen[0]
                else:
                    for chunk in chap.seen:
                        begin_speech = data[chunk]['begin_speech']
                        # before any chapter is confirmed every candidate is acceptable
                        if self.current is not None and begin_speech <= self.current:
                            log.debug(f"Chapter {k} was found before the previous chapter, ignoring")
                            continue
                        chap.confirmed = chunk
                        log.info(f"Confirmed chunk {chunk} for chapter {k}")
                        break

            # step 2: calculates the next times
            if not chap.confirmed:
                if chap.is_begin:
                    # previously this case crashed on data[None]; keep begin at
                    # its default so the remaining chapters can still be timed
                    log.warning(f"Begin chapter {k} was not found, chapter times are not shifted")
                continue
            self.current = data[chap.confirmed]['begin_speech']
            if chap.is_begin:
                self.begin = self.current
            chap.time = self.current - self.begin

    def render(self):
        """Print one '<timestamp> <chapter name>' line per chapter that has a time.

        The timestamp is str(timedelta) of chapter time plus offset, cut or
        padded to exactly three fractional digits.
        """
        for k, chap in self.chapters.items():
            if chap.time is None:
                log.warning(f"skipping chapter {k}, no plausible chunk found")
                continue
            t = str(timedelta(seconds=chap.time) + timedelta(seconds=self.offset))
            # str(timedelta) only contains a fractional part when microseconds != 0
            front, dot, back = t.partition(".")
            t = f"{front}.{back[:3]}" if dot else f"{t}.000"
            print(f"{t} {k}")
# Chapter table of the show, in the order the chapters are expected to occur.
# The option strings are lower-case fragments, including typical
# misrecognitions produced by the speech recognizers.
m = ChapterMarks()
m.add(
    "Halli Hallo und Herzlich Willkommen",
    ["halli hallo", "herzlich willkommen"],
    mandatory=True,
    is_begin=True,
)
m.add(
    "Blast from the Past",
    ["blast", "platz von der past"],
    mandatory=False,
)
m.add(
    "Toter der Woche",
    ["toter", "tote der woche", "toten der woche", "tote woche"],
    mandatory=False,
)
m.add(
    "Untoter der Woche",
    ["untoten der woche", "untote woche"],
    mandatory=False,
)
m.add(
    "AI der Woche",
    ["ei der woche", "ai der woche", " e der woche"],
    mandatory=False,
)
# this one is tricky
m.add("News", ["news"], mandatory=True)
# no news last time
m.add("Themen", ["thema", "themen"], mandatory=False)
m.add(
    "Mimimi der Woche",
    ["mimimi", "mini-me", "menü der woche"],
    mandatory=False,
)
m.add(
    "Lesefoo",
    ["lesen.to", "lasershow", "lese vor"],
    mandatory=False,
)
m.add(
    "Picks",
    ["picks", "pigs", "dick picks", "pics pics pics", "pickx", "pix"],
    mandatory=True,
)
m.add(
    "Ende",
    [
        "immer eine frohe zeit",
        "passt auf euch auf",
        "habt spaß",
        "bis zum nächsten mal",
        "ciao ciao",
    ],
    mandatory=True,
)
def guess_sendungsnummer():
    """Return the last path segment of the URL the pad start page resolves to.

    Opens https://pad.binaergewitter.de/ and takes everything after the final
    "/" of the URL that was actually retrieved (presumably the pad of the
    current show after a redirect - verify against the pad server).

    Raises urllib.error.URLError when the pad cannot be reached.
    """
    # A plain `import urllib` does not load the `request` submodule; import it
    # explicitly so this function does not depend on another library having
    # imported it as a side effect.
    import urllib.request

    log.info("Retrieving current bgt show number")
    url = "https://pad.binaergewitter.de/"
    # close the HTTP response deterministically instead of leaking it
    with urllib.request.urlopen(url) as ret:
        return ret.geturl().split("/")[-1]
def main():
    """Command line entry point: load chunk data, locate chapters, print chapter marks.

    Reads the JSON file given as CHAPTERDATA (or "<show number>.json" when it
    is omitted), applies --override and --offset to the module-level chapter
    table `m`, and prints the resulting chapter marks.
    """
    args = docopt(__doc__)

    path = args['CHAPTERDATA'] or guess_sendungsnummer() + ".json"
    # JSON is UTF-8 by specification; also make sure the handle gets closed
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    # CHAPTER and CHUNKID come from the same repeated option group, so they pair up 1:1
    override = {}
    for k, chunkid in zip(args['CHAPTER'], args['CHUNKID']):
        override[k] = f"chunk{int(chunkid):04d}.wav"
        log.debug(f"Overriding {k} with {override[k]}")

    offset = args['--offset']
    if offset:
        # NOTE(review): the message says "ms" but ChapterMarks.render() adds
        # the offset via timedelta(seconds=...) - confirm the intended unit.
        log.info(f"Adding offset of {offset}ms to chaptermarks")
        m.offset = int(offset)

    for chunk, chunkdata in data.items():
        m.find_text(chunk, chunkdata)
    m.override_chapters(override)
    m.finalize_chapters(data)
    m.render()


if __name__ == "__main__":
    main()