# Reconstructed from a patch that added the following files:
#   tvd/algorithms/__init__.py            (empty)
#   tvd/algorithms/alignment/__init__.py  (empty)
#   tvd/algorithms/alignment/ctm.py       (CTMAligner, below)
#   tvd/parser/__init__.py                (empty)
#   tvd/parser/ctm.py                     (CTMParser, below)
# The patch also listed binary `._ctm.py` entries next to both ctm.py files.
#
# NOTE(review): the patch text had lost its line breaks and indentation; the
# nesting below was rebuilt from the statement order and should be checked
# against the upstream files before merging.

import codecs
import re

import networkx as nx

from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd


# ---------------------------------------------------------------------------
# tvd/algorithms/alignment/ctm.py
# ---------------------------------------------------------------------------

class CTMAligner(object):
    """Merge a word-level CTM graph into a manual transcript graph.

    Parameters
    ----------
    punctuation : bool, optional
        When False, punctuation characters (``. ! , ; ? " :``) are removed
        from the manual sentences before they are compared with CTM words.
    """

    def __init__(self, punctuation=True):
        super(CTMAligner, self).__init__()
        self.punctuation = punctuation

    def clean_sentence(self, sentenceWords):
        """Return `sentenceWords` normalized for word-by-word comparison.

        The following steps are applied in order:

        1. remove non-empty ``(...)`` and ``[...]`` spans;
        2. remove punctuation at the very start of the string;
        3. glue together punctuation runs separated only by spaces;
        4. drop spaces before a punctuation run and put one space after it;
        5. collapse repeated spaces and strip both ends.
        """
        sentenceClean = re.sub(r'\([^\)]+\)', '', sentenceWords)
        sentenceClean = re.sub(r'\[[^\]]+\]', '', sentenceClean)

        sentenceClean = re.sub(r'^[\.!,;?":]+', '', sentenceClean)

        # Raw replacement strings: '\g' is not a valid escape in a normal
        # string literal and triggers a warning on recent Python versions.
        sentenceClean = re.sub(r'([\.!,;?":]+)[ ]+([\.!,;?":]+)',
                               r'\g<1>\g<2>', sentenceClean)
        sentenceClean = re.sub(r'[ ]*([\.!,;?":]+)', r'\g<1> ', sentenceClean)

        sentenceClean = re.sub(r' +', ' ', sentenceClean)
        return sentenceClean.strip()

    def merge_ctm_with_manual_transcript(self, ctmGraph, manualTranscriptGraph):
        """Copy the word edges of `ctmGraph` into `manualTranscriptGraph`.

        Every edge of the manual transcript that carries a ``'speech'`` entry
        is matched against consecutive CTM words: words are accumulated (and
        normalized with `clean_sentence`) until they equal the normalized
        manual sentence. The CTM edges visited on the way are added to the
        manual graph, tagged with the sentence's ``'speaker'``.

        Parameters
        ----------
        ctmGraph : graph of CTM words (edges carry ``'speech'`` data)
        manualTranscriptGraph : graph of manual speech turns (edges carry
            ``'speech'`` and ``'speaker'`` data)

        Returns
        -------
        The relabelled manual transcript graph enriched with the CTM edges,
        or None when a non-empty sentence is reached after all CTM nodes
        have been consumed (a message is printed in that case).
        """
        TFloating.reset()
        manualTranscriptGraph, mapping = manualTranscriptGraph.relabel_floating_nodes()
        ctmGraph, _ = ctmGraph.relabel_floating_nodes()
        manualTranscriptGraph, mapping = manualTranscriptGraph.relabel_floating_nodes(mapping=mapping)

        # topological_sort returns a list in NetworkX 1.x but an iterator in
        # 2.x; the code below needs indexing and len().
        nodesWords = list(nx.topological_sort(ctmGraph))

        lastIndexNode = 0
        if nodesWords and nodesWords[lastIndexNode] == TStart():
            lastIndexNode += 1

        # End points of the CTM edge holding the first word of the *next*
        # sentence; -1 means "nothing pending".
        pending_src = -1
        pending_dst = -1

        first_node = None

        for t1, t2, data in manualTranscriptGraph.ordered_edges_iter(data=True):
            if 'speech' not in data:
                continue

            speaker = data['speaker']
            sentenceClean = self.clean_sentence(data['speech'])
            if not self.punctuation:
                sentenceClean = re.sub(r'[\.!,;?":]+', '', sentenceClean)

            if sentenceClean == "":
                continue

            if lastIndexNode >= len(nodesWords):
                # No CTM node left for this sentence.
                print("Unable to align '%s' !" % (sentenceClean))
                return None

            if first_node is None and t1 != TStart():
                first_node = t1
                manualTranscriptGraph.add_annotation(first_node, nodesWords[lastIndexNode])

            # NOTE(review): this instance was never used in the original
            # either; the call is kept in case the constructor has side
            # effects (e.g. advancing a label counter) -- confirm and remove.
            TFloating()

            sentenceWords = ""
            remainingData = None
            # NOTE(review): the `> 0` test is kept from the original and
            # relies on how node objects compare with ints; confirm before
            # porting to Python 3.
            if pending_src > 0 and pending_dst > 0:
                for key in ctmGraph[pending_src][pending_dst]:
                    dataWord = ctmGraph[pending_src][pending_dst][key]
                    if 'speech' in dataWord:
                        remainingData = dataWord
                        sentenceWords = self.clean_sentence(remainingData['speech'])
            pending_src = -1
            pending_dst = -1

            if remainingData is not None:
                remainingData['speaker'] = speaker
                manualTranscriptGraph.add_annotation(
                    t1, nodesWords[lastIndexNode], data=remainingData)
                if sentenceWords == sentenceClean:
                    # The pending word is the whole sentence: close the turn.
                    manualTranscriptGraph.add_annotation(nodesWords[lastIndexNode], t2)
                    continue

            if not manualTranscriptGraph.has_edge(t1, nodesWords[lastIndexNode]):
                manualTranscriptGraph.add_annotation(t1, nodesWords[lastIndexNode])

            node_end = ""
            previousNode = None
            end = False
            while not end and lastIndexNode < len(nodesWords):
                node = nodesWords[lastIndexNode]
                for node2 in sorted(ctmGraph.successors(node)):
                    node_end = node2

                    if previousNode is not None:
                        if (not manualTranscriptGraph.has_edge(previousNode, node)
                                and previousNode != node):
                            manualTranscriptGraph.add_annotation(previousNode, node)

                    for key in ctmGraph[node][node2]:
                        dataWord = ctmGraph[node][node2][key]
                        has_speech = 'speech' in dataWord
                        if has_speech:
                            dataWord['speaker'] = speaker
                        manualTranscriptGraph.add_annotation(node, node_end, data=dataWord)

                        if not has_speech:
                            continue

                        if sentenceWords == "":
                            sentenceWords = dataWord['speech']
                        else:
                            sentenceWords += " " + dataWord['speech']
                        sentenceWords = self.clean_sentence(sentenceWords)

                        if sentenceWords != sentenceClean:
                            continue

                        if re.search(r'[\.!,;?":]$', sentenceClean):
                            # Have to add the next anchored node just before
                            # the end of the speech turn.
                            lastIndexNode += 2
                            if lastIndexNode < len(nodesWords):
                                # A dedicated name is used so that `node`,
                                # still needed by the enclosing loops, is
                                # not overwritten.
                                lookahead = nodesWords[lastIndexNode]
                                if lookahead.is_anchored:
                                    manualTranscriptGraph.add_annotation(node_end, lookahead)
                                    node_end = lookahead
                                    lastIndexNode -= 1
                                else:
                                    lastIndexNode -= 2
                        end = True
                previousNode = node_end
                lastIndexNode += 1

            # Remember the edge holding the first word of the next sentence.
            if lastIndexNode + 1 < len(nodesWords):
                pending_src = nodesWords[lastIndexNode]
                pending_dst = nodesWords[lastIndexNode + 1]
            lastIndexNode += 1

            manualTranscriptGraph.add_annotation(node_end, t2)

        return manualTranscriptGraph


# ---------------------------------------------------------------------------
# tvd/parser/ctm.py
# ---------------------------------------------------------------------------

class CTMParser(object):
    """Build an annotation graph from a CTM file.

    Parameters
    ----------
    punctuation : bool, optional
        When False, punctuation characters (``. ! , ; ? " :``) are replaced
        by spaces in each word, and words reduced to nothing are skipped.
    """

    def __init__(self, punctuation=True):
        super(CTMParser, self).__init__()
        self.punctuation = punctuation

    def _insert_floating_word(self, g, arc, arc_data, node_end, word_data):
        """Splice a zero-length word into the last timed arc of `g`.

        `arc` starts as ``[start, end]`` of the last timed word and grows
        with the floating nodes inserted so far; it is modified in place.
        Returns the node the caller must use as the end of the current word:
        ``arc[1]`` when the word was spliced in, `node_end` unchanged when
        there is no timed arc yet (nothing is added to `g` in that case).
        """
        if len(arc) == 2:
            # First zero-length word after a timed word: split the timed
            # edge into timed word -> new word -> (empty) -> original end.
            g.remove_edge(arc[0], arc[1])
            g.add_annotation(arc[0], node_end, arc_data)
            node_inter = TFloating()
            g.add_annotation(node_end, node_inter, data=word_data)
            g.add_annotation(node_inter, arc[1])
            arc.append(node_end)
            arc.append(node_inter)
            return arc[1]

        if len(arc) > 2:
            # Further zero-length words: extend the floating chain that
            # currently ends at arc[-1] and reconnect it to the original end.
            g.remove_edge(arc[-1], arc[1])
            g.add_annotation(arc[-1], node_end, data=word_data)
            g.add_annotation(node_end, arc[1])
            arc.append(node_end)
            return arc[1]

        return node_end

    def get_graph(self, path2ctm):
        """Parse the CTM file at `path2ctm` and return an AnnotationGraph.

        Lines starting with ``;;`` and blank lines are skipped. For the
        other lines, whitespace-separated fields 2, 3, 4 and 5 (0-based)
        are read as start time, duration, word and confidence. Each word
        becomes an edge carrying ``{'speech': word, 'confidence': ...}``;
        words whose start and end coincide are attached to floating nodes
        spliced into the previous timed edge.
        """
        g = AnnotationGraph()

        TFloating.reset()
        previousNode = TStart()

        arc = []
        arc_data = None

        with codecs.open(path2ctm, "rt", encoding='utf8') as f:
            for line in f:
                if line.startswith(';;'):
                    continue

                fields = line.strip().split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue

                start = round(float(fields[2]), 3)
                duration = round(float(fields[3]), 3)
                end = round(start + duration, 3)

                word = fields[4]
                if not self.punctuation:
                    word = re.sub(r'[\.!,;?":]+', ' ', word)
                    word = re.sub(r' +', ' ', word)

                if word == "" or word == ' ':
                    continue

                confidence = fields[5]

                if duration == 0:
                    node_end = self._insert_floating_word(
                        g, arc, arc_data, TFloating(),
                        {'speech': word, 'confidence': confidence})
                else:
                    addEdge = True
                    node_start = TAnchored(start)
                    node_end = TAnchored(end)

                    if previousNode.is_floating:
                        if not g.has_edge(previousNode, node_start):
                            g.add_annotation(previousNode, node_start)
                    elif node_start.T < previousNode.T:
                        # Word starts before the previous one ended: start
                        # it at the previous node instead.
                        node_start = previousNode
                    elif node_start.T > previousNode.T:
                        g.add_annotation(previousNode, node_start)

                    if node_start.is_anchored and node_end.is_anchored:
                        if node_start.T == node_end.T:
                            # Start and end coincide after adjustment:
                            # handle it like a zero-length word.
                            addEdge = False
                            node_end = self._insert_floating_word(
                                g, arc, arc_data, TFloating(),
                                {'speech': word, 'confidence': confidence})
                        else:
                            arc = [node_start, node_end]
                            arc_data = {'speech': word, 'confidence': confidence}

                    if addEdge:
                        g.add_annotation(
                            node_start, node_end,
                            data={'speech': word, 'confidence': confidence})

                previousNode = node_end

        g.add_annotation(previousNode, TEnd())
        return g