tvd-dataset · antoinelaurent · Apr 24, 2014 · Apr 29, 2014 · May 1, 2014 · May 1, 2014
diff --git a/tvd/algorithms/__init__.py b/tvd/algorithms/__init__.py
diff --git a/tvd/algorithms/alignment/._ctm.py b/tvd/algorithms/alignment/._ctm.py
diff --git a/tvd/algorithms/alignment/__init__.py b/tvd/algorithms/alignment/__init__.py
diff --git a/tvd/algorithms/alignment/ctm.py b/tvd/algorithms/alignment/ctm.py
@@ -0,0 +1,148 @@
+from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
+import re
+import networkx as nx
+
+class CTMAligner(object):
+	"""Aligns the word edges of a CTM graph with the speech edges of a manual transcript graph."""
+	def __init__(self, punctuation=True):  # punctuation: keep punctuation when comparing sentences
+		super(CTMAligner, self).__init__()
+		self.punctuation = punctuation  # when False, merge_ctm_with_manual_transcript strips punctuation from each cleaned sentence
+
+	def clean_sentence(self, sentenceWords):
+		"""Return sentenceWords without non-empty (...) / [...] spans and with punctuation glued to the preceding word."""
+		sentenceClean = re.sub(r'\([^\)]+\)','', sentenceWords)
+		sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean)
+		# Punctuation at the very start of the sentence is dropped.
+		sentenceClean = re.sub(r'^[\.!,;?":]+','', sentenceClean)
+		# Merge punctuation runs split by spaces, then attach each run to the text before it (raw templates: \g is not a valid string escape).
+		sentenceClean = re.sub(r'([\.!,;?":]+)[ ]+([\.!,;?":]+)',r'\g<1>\g<2>', sentenceClean)
+		sentenceClean = re.sub(r'[ ]*([\.!,;?":]+)',r'\g<1> ', sentenceClean)
+		# Collapse repeated spaces and trim both ends.
+		sentenceClean = re.sub(r' +',' ', sentenceClean)
+		sentenceClean = sentenceClean.strip()
+		return sentenceClean
+
+	def merge_ctm_with_manual_transcript(self, ctmGraph, manualTranscriptGraph):  # Copy CTM word edges into the manual transcript graph, one 'speech' edge at a time.
+		lastIndexNode=0  # index in nodesWords of the next CTM node to consume
+		# Returns the relabelled, augmented manual graph, or None when the CTM nodes run out while a non-empty sentence is still to be placed.
+		end = False  # True once the words gathered so far equal the current cleaned sentence
+		# Reset TFloating (presumably its label counter -- TODO confirm), then relabel the floating nodes of both graphs.
+		TFloating.reset()
+		manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes()
+		ctmGraph, map2 = ctmGraph.relabel_floating_nodes()
+		# NOTE(review): the manual graph is relabelled a second time with the first mapping and map2 is never read -- confirm this is intended. `map` (and `next` below) shadow builtins.
+		manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes(mapping=map)
+		# nodesWords is indexed below, so topological_sort must return a sequence here (presumably networkx 1.x -- TODO confirm).
+		nodesWords = nx.topological_sort(ctmGraph)
+		if nodesWords[lastIndexNode] == TStart():
+			lastIndexNode += 1
+		# last/next: CTM node pair recorded after the previous sentence; -1 means nothing is pending.
+		last = -1
+		next = -1
+
+		first_node = None
+
+		first = -1  # never read in this method
+		for t1, t2, data in manualTranscriptGraph.ordered_edges_iter(data=True):
+			if 'speech' in data:
+				sentence = data['speech']
+				speaker = data['speaker']
+				sentenceClean = self.clean_sentence(sentence)
+				if not self.punctuation:
+					sentenceClean = re.sub(r'[\.!,;?":]+','', sentenceClean)
+				# Speech edges whose cleaned text is empty are skipped entirely.
+				if sentenceClean != "":
+
+					sentenceWords = ""  # CTM words accumulated for this sentence
+
+					if lastIndexNode < len(nodesWords):
+						# First speech edge that does not start at TStart: tie its start node to the current CTM node.
+						if first_node is None and t1 != TStart():
+							first_node = t1
+							manualTranscriptGraph.add_annotation(first_node, nodesWords[lastIndexNode])
+
+						node_manual_trs_start = t1
+						node_manual_trs_end = t2
+						# If a (last, next) pair is pending, seed sentenceWords from the 'speech' edge between those two CTM nodes.
+						node_float = TFloating()  # value never used; NOTE(review): the call may still change TFloating state -- confirm before removing
+						remainingData = None
+						if last > 0 and next > 0:  # NOTE(review): compares node objects with the int 0; an explicit sentinel test would be clearer
+							for key in ctmGraph[last][next]:
+								dataWord = ctmGraph[last][next][key]
+								if 'speech' in dataWord:
+									remainingData = dataWord
+									sentenceWords = remainingData['speech']
+									sentenceWords = self.clean_sentence(sentenceWords)
+									last = -1
+									next = -1
+						# NOTE(review): last/next are reset inside the loop above; with more than one edge key the next iteration would index ctmGraph[-1][-1] -- confirm.
+						bAlreadyAdded = False
+						# Attach the pending word to this sentence; if it already is the whole sentence, close the sentence right away.
+						if(remainingData is not None):
+							if 'speech' in remainingData:
+								remainingData['speaker']=speaker
+							manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode], data=remainingData)
+							if sentenceWords == sentenceClean:
+								manualTranscriptGraph.add_annotation(nodesWords[lastIndexNode], node_manual_trs_end)
+								bAlreadyAdded = True
+						# Otherwise walk the CTM nodes, copying word edges until the accumulated text equals sentenceClean.
+						if not bAlreadyAdded:
+							if not manualTranscriptGraph.has_edge(node_manual_trs_start, nodesWords[lastIndexNode]):
+								manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode])
+
+							node_end = ""
+							previousNode = None
+							while not end and lastIndexNode < len(nodesWords):
+								node = nodesWords[lastIndexNode]
+								for node2 in sorted(ctmGraph.successors(node)):
+
+									node_start = node
+									node_end = node2
+									# Chain the previously reached node to this one with a data-less annotation when no edge exists yet.
+									if previousNode is not None:
+										if not manualTranscriptGraph.has_edge(previousNode, node_start) and previousNode != node_start :
+											manualTranscriptGraph.add_annotation(previousNode, node_start)
+									# Copy every edge node -> node2; speech edges get the current speaker written into their data dict.
+									for key in ctmGraph[node][node2]:
+										dataWord = ctmGraph[node][node2][key]
+										if 'speech' in dataWord:
+											dataWord['speaker']=speaker
+										manualTranscriptGraph.add_annotation(node_start, node_end, data=dataWord)
+
+										if 'speech' in dataWord:
+											if sentenceWords == "":
+												sentenceWords = dataWord['speech']
+											else:
+												sentenceWords += " " + dataWord['speech']
+											sentenceWords = self.clean_sentence(sentenceWords)
+									if sentenceWords == sentenceClean:
+										if re.search(r'[\.!,;?":]$', sentenceClean):
+											# Sentence ends with punctuation: peek two nodes ahead and, if that node is anchored, route the end of the speech turn through it.
+											lastIndexNode+= 2
+											if lastIndexNode < len(nodesWords):  # when the peek is out of range the index stays advanced by 2
+												node = nodesWords[lastIndexNode]
+												if node.is_anchored:
+													manualTranscriptGraph.add_annotation(node_end, node)
+													node_end = node
+													lastIndexNode -= 1
+												else:
+													lastIndexNode -= 2
+										end = True
+									previousNode = node_end
+								lastIndexNode+=1
+							# Remember the next CTM node pair; the following sentence checks it for a pending 'speech' edge.
+							if lastIndexNode+1 < len(nodesWords):
+								last = nodesWords[lastIndexNode]
+								next = nodesWords[lastIndexNode+1]
+
+							#print "%s -> %s" % (node_end, node_manual_trs_end)
+							lastIndexNode+=1
+							# Close the sentence on the manual graph's own end node.
+							manualTranscriptGraph.add_annotation(node_end, node_manual_trs_end)
+
+							end = False
+					elif sentenceClean != "":  # always true here: already inside the enclosing sentenceClean != "" test
+						print "Unable to align '%s' !" % (sentenceClean)
+						return None
+
+		return manualTranscriptGraph
diff --git a/tvd/parser/._ctm.py b/tvd/parser/._ctm.py
diff --git a/tvd/parser/__init__.py b/tvd/parser/__init__.py
diff --git a/tvd/parser/ctm.py b/tvd/parser/ctm.py
@@ -0,0 +1,101 @@
+
+from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
+import codecs
+import re
+
+class CTMParser(object):
+	"""Parses a CTM file into an AnnotationGraph whose edges carry the recognised words."""
+	def __init__(self, punctuation=True):  # punctuation: keep punctuation characters in parsed words
+		super(CTMParser, self).__init__()
+		self.punctuation = punctuation  # when False, get_graph replaces punctuation runs in each word with a space
+
+	def get_graph(self, path2ctm):  # Build an AnnotationGraph from the word lines of the CTM file at path2ctm.
+		g = AnnotationGraph()
+		# Returns the graph; each kept word becomes an edge whose data holds 'speech' and 'confidence'.
+		TFloating.reset()
+		previousNode = TStart()
+		# arc: [start, end] of the last timed word edge, followed by the floating nodes inserted inside it so far.
+		# arc_data (the data of that timed edge) only exists once a timed word has been stored at the bottom of the loop.
+		arc = []
+
+		with codecs.open(path2ctm, "rt", encoding='utf8') as f:
+			for line in f:
+				if not re.search(r'^;;', line):  # lines starting with ';;' are skipped
+					fields = line.strip().split()
+					start = round(float(fields[2]), 3)
+					duration = round(float(fields[3]), 3)
+					end = float(start)+float(duration)
+					# NOTE(review): a blank line or one with too few fields raises IndexError above -- confirm inputs never contain one.
+					end = round(end, 3)
+
+					word = fields[4]
+					# Without punctuation, punctuation runs become a space; a word made only of punctuation is then dropped by the test below.
+					if not self.punctuation:
+						word = re.sub(r'[\.!,;?":]+',' ', word)
+						word = re.sub(r' +',' ', word)
+
+					if word != "" and word != ' ':
+						confidence = fields[5]  # stored as the raw string
+						if duration == 0:  # zero-length word: hang it on floating nodes inside the last timed arc
+							node_start = previousNode
+							node_end = TFloating()  # NOTE(review): if arc is still empty neither branch below runs, so this word gets no edge -- confirm
+							if len(arc) == 2:  # first insertion: arc[0] -(arc_data)-> node_end -(word)-> node_inter -> arc[1]
+								g.remove_edge(arc[0], arc[1])
+								g.add_annotation(arc[0], node_end, arc_data)
+								node_inter = TFloating()
+								g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence})
+								g.add_annotation(node_inter, arc[1])
+								arc.append(node_end)
+								arc.append(node_inter)
+								node_end=arc[1]
+							elif len(arc) > 2:  # later insertions: last floating node -(word)-> node_end -> arc[1]
+								node_anc_start = arc[0]  # never read
+								node_anc_end = arc[1]
+								g.remove_edge(arc[len(arc)-1], node_anc_end)
+								g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence})
+								g.add_annotation(node_end, node_anc_end)
+								arc.append(node_end)
+								node_end=arc[1]
+						else:
+							addEdge = True
+							node_start = TAnchored(start)
+							node_end = TAnchored(end)
+							if previousNode.is_floating:
+								if not g.has_edge(previousNode, node_start):
+									g.add_annotation(previousNode, node_start)
+							else:
+								if node_start.T < previousNode.T:  # word starts before the previous node: start from the previous node instead
+									node_start = previousNode
+								elif node_start.T > previousNode.T:  # gap: bridge it with a data-less annotation
+									g.add_annotation(previousNode, node_start)
+							if node_start.is_anchored and node_end.is_anchored:
+								if node_start.T == node_end.T:  # start and end coincide: same insertion as the duration == 0 case (duplicated code)
+									addEdge = False
+									node_start = previousNode
+									node_end = TFloating()
+									if len(arc) == 2:
+										g.remove_edge(arc[0], arc[1])
+										g.add_annotation(arc[0], node_end, arc_data)
+										node_inter = TFloating()
+										g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence})
+										g.add_annotation(node_inter, arc[1])
+										arc.append(node_end)
+										arc.append(node_inter)
+										node_end=arc[1]
+									elif len(arc) > 2:
+										node_anc_start = arc[0]  # never read
+										node_anc_end = arc[1]
+										g.remove_edge(arc[len(arc)-1], node_anc_end)
+										g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence})
+										g.add_annotation(node_end, node_anc_end)
+										arc.append(node_end)
+										node_end=arc[1]
+								else:
+									arc = [node_start, node_end]  # new timed arc; later zero-length words are inserted into it
+									arc_data = {'speech':word, 'confidence':confidence}
+							if addEdge:
+								g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
+						previousNode=node_end
+
+		g.add_annotation(previousNode, TEnd())  # link the last node reached to TEnd
+		return g