Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #5

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added tvd/algorithms/__init__.py
Empty file.
Binary file added tvd/algorithms/alignment/._ctm.py
Binary file not shown.
Empty file.
148 changes: 148 additions & 0 deletions tvd/algorithms/alignment/ctm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
import re
import networkx as nx

class CTMAligner(object):
"""docstring for CTMParser"""
def __init__(self, punctuation=True):
super(CTMAligner, self).__init__()
self.punctuation = punctuation

def clean_sentence(self, sentenceWords):
    """Return a normalised copy of a transcript sentence.

    The following rewrites are applied in order:

    1. non-empty ``(...)`` and ``[...]`` spans are removed;
    2. punctuation at the very start of the string is removed;
    3. runs of punctuation separated only by spaces are glued together;
    4. spaces before a punctuation run are removed and exactly one space
       is placed after it;
    5. repeated spaces are collapsed and the result is stripped.

    sentenceWords -- the raw sentence text.

    Returns the cleaned string (possibly empty).
    """
    # One run of the punctuation marks handled by the aligner.
    punct = r'[\.!,;?":]+'

    cleaned = re.sub(r'\([^\)]+\)', '', sentenceWords)
    cleaned = re.sub(r'\[[^\]]+\]', '', cleaned)

    # Only punctuation at index 0 is dropped; a leading space protects it.
    cleaned = re.sub(r'^' + punct, '', cleaned)

    # Replacement templates are raw strings: '\g' is not a valid escape
    # in a normal string literal and newer interpreters warn about it.
    cleaned = re.sub(r'(' + punct + r')[ ]+(' + punct + r')',
                     r'\g<1>\g<2>', cleaned)
    cleaned = re.sub(r'[ ]*(' + punct + r')', r'\g<1> ', cleaned)

    cleaned = re.sub(r' +', ' ', cleaned)
    return cleaned.strip()

def merge_ctm_with_manual_transcript(self, ctmGraph, manualTranscriptGraph):
# Merge the word edges of `ctmGraph` into `manualTranscriptGraph`.
#
# For every edge of the manual transcript that carries a 'speech' entry,
# the CTM nodes are walked in topological order; each CTM word edge is
# copied into the manual graph (tagged with the turn's 'speaker') while the
# words are accumulated, until the cleaned accumulation equals the cleaned
# manual sentence.
#
# Returns the relabelled manual graph on success. When a non-empty sentence
# is met after the CTM nodes are exhausted, a message is printed and None
# is returned.
#
# NOTE(review): this is Python 2 code (`print` statement below).
# NOTE(review): the locals `map` and `next` shadow builtins.
# Index of the next CTM node to consume in `nodesWords`.
lastIndexNode=0

# Set to True once the words gathered for the current turn match it.
end = False

TFloating.reset()
# Relabel floating nodes of both graphs; the manual graph is relabelled a
# second time, passing the mapping returned by the first call.
# `map2` is not read again in this method.
manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes()
ctmGraph, map2 = ctmGraph.relabel_floating_nodes()

manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes(mapping=map)

# NOTE(review): the result is indexed and passed to len(), so it must be a
# list; networkx >= 2.0 returns a generator here — confirm the pinned version.
nodesWords = nx.topological_sort(ctmGraph)
# Skip the graph's start node if it comes first.
if nodesWords[lastIndexNode] == TStart():
lastIndexNode += 1

# -1 is the "nothing pending" sentinel for the pair of CTM nodes remembered
# at the end of a turn (see the assignment near the end of the loop).
last = -1
next = -1

first_node = None

# `first` is not read again in this method.
first = -1
for t1, t2, data in manualTranscriptGraph.ordered_edges_iter(data=True):
if 'speech' in data:
sentence = data['speech']
speaker = data['speaker']
sentenceClean = self.clean_sentence(sentence)
if not self.punctuation:
sentenceClean = re.sub(r'[\.!,;?":]+','', sentenceClean)

if sentenceClean != "":

# Words gathered so far for the current manual sentence.
sentenceWords = ""

if lastIndexNode < len(nodesWords):

# Link the first manual node that is not TStart() to the current CTM node.
if first_node is None and t1 != TStart():
first_node = t1
manualTranscriptGraph.add_annotation(first_node, nodesWords[lastIndexNode])

node_manual_trs_start = t1
node_manual_trs_end = t2

# `node_float` is not read again; the call still creates a TFloating.
node_float = TFloating()
remainingData = None
# NOTE(review): once assigned, `last`/`next` hold graph nodes, so this
# compares nodes with the integer 0 — confirm the intended ordering.
if last > 0 and next > 0:
for key in ctmGraph[last][next]:
dataWord = ctmGraph[last][next][key]
if 'speech' in dataWord:
# Seed this turn with the word found on the remembered CTM edge.
remainingData = dataWord
sentenceWords = remainingData['speech']
sentenceWords = self.clean_sentence(sentenceWords)
last = -1
next = -1

bAlreadyAdded = False

if(remainingData is not None):
if 'speech' in remainingData:
remainingData['speaker']=speaker
manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode], data=remainingData)
# The seeded word alone may already be the whole sentence.
if sentenceWords == sentenceClean:
manualTranscriptGraph.add_annotation(nodesWords[lastIndexNode], node_manual_trs_end)
bAlreadyAdded = True

if not bAlreadyAdded:
if not manualTranscriptGraph.has_edge(node_manual_trs_start, nodesWords[lastIndexNode]):
manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode])

node_end = ""
previousNode = None
# Consume CTM nodes until the gathered words match the manual sentence
# or the CTM nodes run out.
while not end and lastIndexNode < len(nodesWords):
node = nodesWords[lastIndexNode]
for node2 in sorted(ctmGraph.successors(node)):

node_start = node
node_end = node2

# Keep the copied edges chained: link the previous end node to this
# start node when they differ and no edge exists yet.
if previousNode is not None:
if not manualTranscriptGraph.has_edge(previousNode, node_start) and previousNode != node_start :
manualTranscriptGraph.add_annotation(previousNode, node_start)

for key in ctmGraph[node][node2]:
dataWord = ctmGraph[node][node2][key]
if 'speech' in dataWord:
dataWord['speaker']=speaker
manualTranscriptGraph.add_annotation(node_start, node_end, data=dataWord)

if 'speech' in dataWord:
if sentenceWords == "":
sentenceWords = dataWord['speech']
else:
sentenceWords += " " + dataWord['speech']
sentenceWords = self.clean_sentence(sentenceWords)
if sentenceWords == sentenceClean:
if re.search(r'[\.!,;?":]$', sentenceClean):
# The sentence ends with punctuation: look two nodes ahead and, if that
# node is anchored, attach it just before the end of the speech turn.
#Have to add the next anchored just before the end of the speech turn ...
lastIndexNode+= 2
if lastIndexNode < len(nodesWords):
node = nodesWords[lastIndexNode]
if node.is_anchored:
manualTranscriptGraph.add_annotation(node_end, node)
node_end = node
lastIndexNode -= 1
else:
lastIndexNode -= 2
end = True
previousNode = node_end
lastIndexNode+=1

# Remember the next pair of CTM nodes; it is examined at the start of the
# following speech turn (the `last > 0 and next > 0` test above).
if lastIndexNode+1 < len(nodesWords):
last = nodesWords[lastIndexNode]
next = nodesWords[lastIndexNode+1]

#print "%s -> %s" % (node_end, node_manual_trs_end)
lastIndexNode+=1

# Close the turn: link the last copied CTM node to the manual end node.
manualTranscriptGraph.add_annotation(node_end, node_manual_trs_end)

end = False
elif sentenceClean != "":
# Presumably pairs with the `lastIndexNode < len(nodesWords)` test, i.e.
# the CTM nodes ran out before this sentence — TODO confirm the nesting.
print "Unable to align '%s' !" % (sentenceClean)
return None

return manualTranscriptGraph
Binary file added tvd/parser/._ctm.py
Binary file not shown.
Empty file added tvd/parser/__init__.py
Empty file.
101 changes: 101 additions & 0 deletions tvd/parser/ctm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@

from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
import codecs
import re

class CTMParser(object):
"""docstring for CTMParser"""
def __init__(self, punctuation=True):
super(CTMParser, self).__init__()
self.punctuation = punctuation

def get_graph(self, path2ctm):
# Parse the CTM file at `path2ctm` and return it as an AnnotationGraph.
#
# Lines starting with ';;' are skipped. Every other line is split on
# whitespace and these columns are read: [2] start time, [3] duration,
# [4] word, [5] confidence (kept as a string). Columns 0 and 1 are unused.
#
# A word with a non-zero duration becomes an edge between two TAnchored
# nodes carrying {'speech', 'confidence'}. A zero-length word is inserted
# through TFloating nodes inside the most recently recorded anchored arc.
# The last node reached is finally linked to TEnd().
g = AnnotationGraph()

TFloating.reset()
# Node at which the previously handled word ended; starts at TStart().
previousNode = TStart()


# `arc` records the latest anchored word arc: arc[0] and arc[1] are its
# start and end nodes, any further entries are floating nodes inserted in
# it so far. `arc_data` (set with it below) is that word's edge data.
arc = []

with codecs.open(path2ctm, "rt", encoding='utf8') as f:
for line in f:
if not re.search(r'^;;', line):
fields = line.strip().split()
# Times are rounded to 3 decimals.
start = round(float(fields[2]), 3)
duration = round(float(fields[3]), 3)
end = float(start)+float(duration)

end = round(end, 3)

word = fields[4]

if not self.punctuation:
# Punctuation is replaced by a space (not removed), then spaces are
# collapsed; the result is not stripped.
word = re.sub(r'[\.!,;?":]+',' ', word)
word = re.sub(r' +',' ', word)

# Skip words that were nothing but punctuation.
if word != "" and word != ' ':
confidence = fields[5]
if duration == 0:
# Zero-length word: it cannot get two distinct anchored nodes.
# NOTE(review): when `arc` is still empty neither branch below runs, so
# the word does not appear to be added to the graph — confirm.
node_start = previousNode
node_end = TFloating()
if len(arc) == 2:
# First insertion into this arc: split arc[0]->arc[1] into
# arc[0] -(original word)-> node_end -(this word)-> node_inter -> arc[1].
g.remove_edge(arc[0], arc[1])
g.add_annotation(arc[0], node_end, arc_data)
node_inter = TFloating()
g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence})
g.add_annotation(node_inter, arc[1])
arc.append(node_end)
arc.append(node_inter)
node_end=arc[1]
elif len(arc) > 2:
# Later insertions: replace the closing edge from the last floating node
# with last -(this word)-> node_end -> arc[1].
# `node_anc_start` is assigned but not read.
node_anc_start = arc[0]
node_anc_end = arc[1]
g.remove_edge(arc[len(arc)-1], node_anc_end)
g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence})
g.add_annotation(node_end, node_anc_end)
arc.append(node_end)
node_end=arc[1]
else:
addEdge = True
node_start = TAnchored(start)
node_end = TAnchored(end)
if previousNode.is_floating:
if not g.has_edge(previousNode, node_start):
g.add_annotation(previousNode, node_start)
else:
# Start earlier than the previous end: reuse the previous node as start.
# Start later than the previous end: bridge the gap with an empty edge.
if node_start.T < previousNode.T:
node_start = previousNode
elif node_start.T > previousNode.T:
g.add_annotation(previousNode, node_start)
if node_start.is_anchored and node_end.is_anchored:
if node_start.T == node_end.T:
# Start and end coincide after the adjustment above: handle the word
# like a zero-length one (same insertion steps as the branch above).
addEdge = False
node_start = previousNode
node_end = TFloating()
if len(arc) == 2:
g.remove_edge(arc[0], arc[1])
g.add_annotation(arc[0], node_end, arc_data)
node_inter = TFloating()
g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence})
g.add_annotation(node_inter, arc[1])
arc.append(node_end)
arc.append(node_inter)
node_end=arc[1]
elif len(arc) > 2:
node_anc_start = arc[0]
node_anc_end = arc[1]
g.remove_edge(arc[len(arc)-1], node_anc_end)
g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence})
g.add_annotation(node_end, node_anc_end)
arc.append(node_end)
node_end=arc[1]
else:
# Presumably the regular case (distinct start and end): record this word
# as the current arc — TODO confirm which `if` this `else` pairs with.
arc = [node_start, node_end]
arc_data = {'speech':word, 'confidence':confidence}
if addEdge:
g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
previousNode=node_end

# Connect the last node reached to the graph's end node.
g.add_annotation(previousNode, TEnd())
return g