From e2b1905272dac83dc7ad9694906598569cc7a98c Mon Sep 17 00:00:00 2001
From: antoine laurent <laurent@hp801.limsi.fr>
Date: Thu, 24 Apr 2014 17:55:00 +0200
Subject: [PATCH 1/3] word alignment module

---
 tvd/algorithms/alignment/__init__.py  |   0
 tvd/algorithms/alignment/align.pl     | 166 ++++++++++++++++++++++++++
 tvd/algorithms/alignment/antoine.py   | 157 ++++++++++++++++++++++++
 tvd/algorithms/alignment/just_vrbs.pl |  85 +++++++++++++
 4 files changed, 408 insertions(+)
 create mode 100644 tvd/algorithms/alignment/__init__.py
 create mode 100755 tvd/algorithms/alignment/align.pl
 create mode 100644 tvd/algorithms/alignment/antoine.py
 create mode 100755 tvd/algorithms/alignment/just_vrbs.pl
diff --git a/tvd/algorithms/alignment/__init__.py b/tvd/algorithms/alignment/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tvd/algorithms/alignment/align.pl b/tvd/algorithms/alignment/align.pl
new file mode 100755
index 0000000..9396258
--- /dev/null
+++ b/tvd/algorithms/alignment/align.pl
@@ -0,0 +1,166 @@
+#!/usr/bin/perl
+
+if(scalar(@ARGV) != 3){
+	die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n";
+}
+
+$txt = $ARGV[0];
+$wav = $ARGV[1];
+$name = $ARGV[2];
+
+%map = ();
+$map{"‘"}="'";
+$map{"…"}="...";
+$map{"–"}="-";
+$map{"’"}="'";
+
+
+open(OUT, ">/tmp/$name.txt");
+open(TXT, $txt);
+while($ligneC = <TXT>){
+	chomp($ligneC);
+	if($ligneC =~ /^[^#]+#[^#]+#(.*)/){
+		$ligne = $1;
+		print OUT nettoyerLigne($ligne)."\n";	
+	}
+}
+close(TXT);
+close(OUT);
+
+
+$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null";
+system($cmd);
+
+if(-s "/tmp/$name.txt.iso.error" > 0){
+	die "Error during conversion ... see /tmp/$name.txt.iso.error"
+}
+unlink "/tmp/$name.txt.iso.error";
+
+$cmd = "mkdir align";
+system($cmd) if(!-d "align");
+
+$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > align/$name.xml";
+system($cmd) if(!-e "align/$name.xml");
+
+
+$cmd = "cat align/$name.xml | xml2ctm > /tmp/text.ctm";
+system($cmd);
+
+$ficDump = $txt;
+$ficCTM = "/tmp/text.ctm";
+
+$cmd = "cat $ficCTM";
+@ctm = `$cmd`;
+
+$indCtm = 0;
+
+open(FIC, $ficDump);
+while($ligne = <FIC>){
+	chomp($ligne);
+	
+	@infos = split(/#/, $ligne);
+	$nodes = $infos[0];
+	$spk = $infos[1];
+
+	$spk =~ s/ +/_/g;
+
+	$txt = $infos[2];
+	
+	$txt = nettoyerLigne($txt);
+	$txt = trim($txt);
+
+	#$txt =~ s/([^\.])\./$1 \./g;
+	#$txt =~ s/([^\,])\,/$1 \,/g;
+	#$txt =~ s/([^\?])\?/$1 \?/g;
+	#$txt =~ s/([^\!])\!/$1 \!/g;
+
+	@words = split(/ +/, $txt);
+
+	foreach(@words){
+		$word = $_;
+		$original = $word;
+		$end = 0;
+		while(!$end){
+			$ctmL = $ctm[$indCtm];
+			@iCtm = split(/ /, $ctmL);
+			$wCtm = $iCtm[4];
+			$wCtm = virerPonctuation($wCtm);
+			if($wCtm ne ""){
+				$end=1;
+			}else{
+				$indCtm++;
+			}
+			$end = 1 if($indCtm == scalar(@ctm));
+		}
+
+		
+		$word = virerPonctuation($word);	
+
+		@iCtm = split(/ /, $ctmL);
+		$show = $iCtm[0];
+		$start = $iCtm[2];
+		$duree = $iCtm[3];
+		$score = $iCtm[5];
+		$end = sprintf("%.3f", $start+$duree);
+
+		$wCtm = $iCtm[4];
+		$wCtm = virerPonctuation($wCtm);	
+		
+
+		if($word ne ""){
+			
+			if($word eq $wCtm){
+				print "thrones_s01e01 $nodes $spk $original ($start - $end - $score)\n";
+			}else{
+				die "==> ". $word . " ne ".$wCtm."\n";
+				
+			}
+
+			$indCtm++;
+		}else{
+			print "thrones_s01e01 $nodes $spk $original\n" if($original ne "");
+		}
+	}
+
+	
+	
+}
+close(FIC);
+	
+clean();
+
+sub clean{
+	unlink "/tmp/$name.txt";
+	unlink "/tmp/text.ctm";
+}
+
+
+sub virerPonctuation{
+	my $word = $_[0];
+	$word =~ s/,//g;
+    $word =~ s/\.//g;       
+    $word =~ s/\?//g;       
+    $word =~ s/\!//g;
+	$word =~ s/\"//g;    
+	$word =~ s/\://g;
+	$word =~ s/\;//g;
+	return trim($word);
+}
+
+sub trim{
+        $chaine = $_[0];
+        $chaine =~ s/^\s+//g;
+        $chaine =~ s/\s+$//g;
+        return $chaine;
+}
+sub nettoyerLigne{
+        my $ligne=$_[0];
+        foreach(keys %map){
+                $ligne =~ s/$_/$map{$_}/g;
+        }
+        $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g;
+        $ligne =~ s/ +/ /g;
+        $ligne =~ s/\([^\)]+\)//g;
+        $ligne =~ s/\[[^\]]+\]//g;
+        return $ligne;
+}
diff --git a/tvd/algorithms/alignment/antoine.py b/tvd/algorithms/alignment/antoine.py
new file mode 100644
index 0000000..c043275
--- /dev/null
+++ b/tvd/algorithms/alignment/antoine.py
@@ -0,0 +1,157 @@
+from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
+import codecs
+import re
+import os
+import os.path
+import tempfile
+
+def makeCtm(g, wav, output_ctm, path_to_just_vrbs):
+	tab = {}
+	fic = tempfile.NamedTemporaryFile().name
+	namesave = fic.replace("/", "_")
+
+	with codecs.open(fic, 'w', encoding='utf8') as f:
+		for t1, t2, data in g.ordered_edges_iter(data=True):
+			if 'speech' in data:
+				cle = "%s-%s" % (t1.T,t2.T)
+				tab[cle]=[]
+				f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech']))
+	cmd = "%s/just_vrbs.pl %s %s %s > %s" % (path_to_just_vrbs, fic, wav, namesave, output_ctm)
+	os.system(cmd)
+
+def makeWordGraphFromCtm(ctm, keep_punctuation=False):
+	g = AnnotationGraph()
+	TFloating.reset()
+	previousNode = TFloating()
+
+	with codecs.open(ctm, "rt", encoding='utf8') as f:
+		for line in f:
+			fields = line.strip().split()
+			start = round(float(fields[2]), 3)
+			duration = round(float(fields[3]), 3)
+			end = float(start)+float(duration)
+
+			end = round(end, 3)
+			#end = "%.3f" % round(end,3)
+			word = fields[4]
+
+			if not keep_punctuation:
+				word = re.sub('[.!,;?"]', '', word)
+
+			if word != "":
+				confidence = fields[5]
+				if duration == 0:
+					node_start = previousNode
+					node_end = TFloating()
+					g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
+				else:
+					node_start = TAnchored(start)
+					node_end = TAnchored(end)
+					if previousNode.is_floating:
+						g.add_annotation(previousNode, node_start)
+					else:
+						if node_start.T < previousNode.T:
+							node_start = previousNode
+						elif node_start.T > previousNode.T:
+							g.add_annotation(previousNode, node_start)
+					g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
+				previousNode=node_end
+	return g
+
+
+def mergeGraphs(transcriptGraph, wordsGraph):
+
+	for t1, t2, data in transcriptGraph.ordered_edges_iter(data=True):
+		if 'speech' in data:
+			sentence = data['speech']
+			speaker = data['speaker']
+			print sentence
+			sentenceClean = re.sub(r'\([^\)]+\)','', sentence)
+			sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean)
+			sentenceClean = re.sub(r' +',' ', sentenceClean)
+			sentenceClean = re.sub('[.!,;?"]','', sentenceClean)
+			print sentenceClean
+
+
+
+def alignTranscript(g, wav, namesave):
+	tab = {}
+
+	# with codecs.open('/tmp/dump.txt', 'w', encoding='utf8') as f:
+	#     for t1, t2, data in g.ordered_edges_iter(data=True):
+	#         if 'speech' in data:
+	#               f.write("%s\n" % (data['speech']))
+
+	fic = "/tmp/%s_eti.txt" % namesave
+
+	with codecs.open(fic, 'w', encoding='utf8') as f:
+		for t1, t2, data in g.ordered_edges_iter(data=True):
+			if 'speech' in data:
+				cle = "%s-%s" % (t1.T,t2.T)
+				tab[cle]=[]
+				f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech']))
+
+	out = "dump_clean_%s.etiq" % namesave
+	cmd = "/people/laurent/tvd/monfork/tvd/tvd/algorithms/alignment/align.pl %s %s %s > %s" % (fic, wav, namesave, out)
+	os.system(cmd)
+
+	with codecs.open(out, "rt", encoding='utf8') as f:
+		for line in f:
+			words = line.strip().split()
+			tab[words[1]].append(line)
+
+	lastEnd = 0
+
+	os.remove(out)
+	os.remove(fic)
+	for t1, t2, data in g.ordered_edges_iter(data=True):
+		if 'speech' in data:
+			cle = "%s-%s" % (t1.T,t2.T)
+			speaker = data['speaker']
+			lines = tab[cle]
+			prevNode = t1
+			for line in lines:
+				words = line.strip().split()
+				mot = words[3]
+				m = re.match(r"([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) \(([^ ]+) \- ([^ ]+) \- ([^\)]+)\)", line)
+				if m:
+					mot = m.group(4)
+					start = m.group(5)
+					end = m.group(6)
+					conf = m.group(7)
+					#Je regarde lastEnd (c'est le temps de fin du dernier tour de parole ...)
+					#Ne dois jamais arriver mais bon...
+					if start < lastEnd:
+						start = lastEnd+0.001
+					node_start = TAnchored(float(start))
+					#prevVal = stringVal(prevNode.T)
+					if prevNode.is_anchored and prevNode.T != node_start.T:
+						if node_start.T < prevNode.T:
+							node_start.T = prevNode.T
+						print "Node between %s and %s => no data" % (prevNode, node_start)
+						g.add_annotation(prevNode, node_start, data={})
+					else:
+						if prevNode.is_floating:
+							print "Node between %s and %s => no data" % (prevNode, node_start)
+							g.add_annotation(prevNode, node_start, data={})
+
+					node_end = TAnchored(float(end))
+					if node_end.T <= node_start.T:
+						node_end.T = node_start.T+0.001
+
+					print "Node between %s and %s => data{speech:%s speaker:%s confidence:%s}" % (node_start, node_end, mot, speaker, conf)
+					g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':conf})
+					prevNode = node_end
+				else:
+					#prevVal = stringVal(prevNode.T)
+					if prevNode.is_anchored:
+						node_start = TAnchored(prevNode.T)
+						node_end = TAnchored(prevNode.T+0.001)
+						print "Node between %s and %s => data{speech:%s speaker:%s confidence:0.950}" % (node_start, node_end, mot, speaker)
+						g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':'0.950'})
+						prevNode = node_end
+			if prevNode.is_anchored:
+				print "Node between %s and %s => no data"%(prevNode,t2)
+				g.add_annotation(prevNode, t2, data={})
+				lastEnd = prevNode.T
+	g.save("%s.json" % namesave)
diff --git a/tvd/algorithms/alignment/just_vrbs.pl b/tvd/algorithms/alignment/just_vrbs.pl
new file mode 100755
index 0000000..8809311
--- /dev/null
+++ b/tvd/algorithms/alignment/just_vrbs.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl
+
+if(scalar(@ARGV) != 3){
+	die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n";
+}
+
+$txt = $ARGV[0];
+$wav = $ARGV[1];
+$name = $ARGV[2];
+
+%map = ();
+$map{"‘"}="'";
+$map{"…"}="...";
+$map{"–"}="-";
+$map{"’"}="'";
+
+
+open(OUT, ">/tmp/$name.txt");
+open(TXT, $txt);
+while($ligneC = <TXT>){
+	chomp($ligneC);
+	if($ligneC =~ /^[^#]+#[^#]+#(.*)/){
+		$ligne = $1;
+		print OUT nettoyerLigne($ligne)."\n";	
+	}
+}
+close(TXT);
+close(OUT);
+
+
+$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null";
+system($cmd);
+
+if(-s "/tmp/$name.txt.iso.error" > 0){
+	die "Error during conversion ... see /tmp/$name.txt.iso.error"
+}
+unlink "/tmp/$name.txt.iso.error";
+
+$cmd = "mkdir align";
+system($cmd) if(!-d "align");
+
+$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > /tmp/$name.xml";
+system($cmd);
+
+
+$cmd = "cat /tmp/$name.xml | xml2ctm";
+system($cmd);
+	
+clean();
+
+sub clean{
+	unlink "/tmp/$name.txt";
+	unlink "/tmp/$name.xml";
+}
+
+
+sub virerPonctuation{
+	my $word = $_[0];
+	$word =~ s/,//g;
+    $word =~ s/\.//g;       
+    $word =~ s/\?//g;       
+    $word =~ s/\!//g;
+	$word =~ s/\"//g;    
+	$word =~ s/\://g;
+	$word =~ s/\;//g;
+	return trim($word);
+}
+
+sub trim{
+        $chaine = $_[0];
+        $chaine =~ s/^\s+//g;
+        $chaine =~ s/\s+$//g;
+        return $chaine;
+}
+sub nettoyerLigne{
+        my $ligne=$_[0];
+        foreach(keys %map){
+                $ligne =~ s/$_/$map{$_}/g;
+        }
+        $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g;
+        $ligne =~ s/ +/ /g;
+        $ligne =~ s/\([^\)]+\)//g;
+        $ligne =~ s/\[[^\]]+\]//g;
+        return $ligne;
+}

From e03731a3df249ae134e8f783bdbe9af79ce3eb65 Mon Sep 17 00:00:00 2001
From: antoine laurent <laurent@hp801.limsi.fr>
Date: Thu, 1 May 2014 15:42:51 +0200
Subject: [PATCH 2/3] CTM Aligner and CTM Parser (need to add comments)

---
 tvd/algorithms/__init__.py            |   0
 tvd/algorithms/alignment/._ctm.py     | Bin 0 -> 4096 bytes
 tvd/algorithms/alignment/antoine.py   | 157 --------------------------
 tvd/algorithms/alignment/ctm.py       | 148 ++++++++++++++++++++++++
 tvd/algorithms/alignment/just_vrbs.pl |  85 --------------
 tvd/parser/._ctm.py                   | Bin 0 -> 4096 bytes
 tvd/parser/__init__.py                |   0
 tvd/parser/ctm.py                     | 101 +++++++++++++++++
 8 files changed, 249 insertions(+), 242 deletions(-)
 create mode 100644 tvd/algorithms/__init__.py
 create mode 100644 tvd/algorithms/alignment/._ctm.py
 delete mode 100644 tvd/algorithms/alignment/antoine.py
 create mode 100644 tvd/algorithms/alignment/ctm.py
 delete mode 100755 tvd/algorithms/alignment/just_vrbs.pl
 create mode 100644 tvd/parser/._ctm.py
 create mode 100644 tvd/parser/__init__.py
 create mode 100644 tvd/parser/ctm.py

diff --git a/tvd/algorithms/__init__.py b/tvd/algorithms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tvd/algorithms/alignment/._ctm.py b/tvd/algorithms/alignment/._ctm.py
new file mode 100644
index 0000000000000000000000000000000000000000..192580fed2d9aab9b11086c241612798ce343104
GIT binary patch
literal 4096
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103v0;)8J
z=wPTIpnOz%Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!ns1gE<KvO^%4CF#G
zGK&?8Qj7CTi;`0n((;S46*BWmQu9hO^YapOaw-*aQqxKll5!IBvVnbJsIDPRq52>0
K6&VJ(|NjAs7#qF-

literal 0
HcmV?d00001

diff --git a/tvd/algorithms/alignment/antoine.py b/tvd/algorithms/alignment/antoine.py
deleted file mode 100644
index c043275..0000000
--- a/tvd/algorithms/alignment/antoine.py
+++ /dev/null
@@ -1,157 +0,0 @@
-from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
-import codecs
-import re
-import os
-import os.path
-import tempfile
-
-def makeCtm(g, wav, output_ctm, path_to_just_vrbs):
-	tab = {}
-	fic = tempfile.NamedTemporaryFile().name
-	namesave = fic.replace("/", "_")
-
-	with codecs.open(fic, 'w', encoding='utf8') as f:
-		for t1, t2, data in g.ordered_edges_iter(data=True):
-			if 'speech' in data:
-				cle = "%s-%s" % (t1.T,t2.T)
-				tab[cle]=[]
-				f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech']))
-	cmd = "%s/just_vrbs.pl %s %s %s > %s" % (path_to_just_vrbs, fic, wav, namesave, output_ctm)
-	os.system(cmd)
-
-def makeWordGraphFromCtm(ctm, keep_punctuation=False):
-	g = AnnotationGraph()
-	TFloating.reset()
-	previousNode = TFloating()
-
-	with codecs.open(ctm, "rt", encoding='utf8') as f:
-		for line in f:
-			fields = line.strip().split()
-			start = round(float(fields[2]), 3)
-			duration = round(float(fields[3]), 3)
-			end = float(start)+float(duration)
-
-			end = round(end, 3)
-			#end = "%.3f" % round(end,3)
-			word = fields[4]
-
-			if not keep_punctuation:
-				word = re.sub('[.!,;?"]', '', word)
-
-			if word != "":
-				confidence = fields[5]
-				if duration == 0:
-					node_start = previousNode
-					node_end = TFloating()
-					g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
-				else:
-					node_start = TAnchored(start)
-					node_end = TAnchored(end)
-					if previousNode.is_floating:
-						g.add_annotation(previousNode, node_start)
-					else:
-						if node_start.T < previousNode.T:
-							node_start = previousNode
-						elif node_start.T > previousNode.T:
-							g.add_annotation(previousNode, node_start)
-					g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
-				previousNode=node_end
-	return g
-
-
-def mergeGraphs(transcriptGraph, wordsGraph):
-
-	for t1, t2, data in transcriptGraph.ordered_edges_iter(data=True):
-		if 'speech' in data:
-			sentence = data['speech']
-			speaker = data['speaker']
-			print sentence
-			sentenceClean = re.sub(r'\([^\)]+\)','', sentence)
-			sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean)
-			sentenceClean = re.sub(r' +',' ', sentenceClean)
-			sentenceClean = re.sub('[.!,;?"]','', sentenceClean)
-			print sentenceClean
-
-
-
-def alignTranscript(g, wav, namesave):
-	tab = {}
-
-	# with codecs.open('/tmp/dump.txt', 'w', encoding='utf8') as f:
-	#     for t1, t2, data in g.ordered_edges_iter(data=True):
-	#         if 'speech' in data:
-	#               f.write("%s\n" % (data['speech']))
-
-	fic = "/tmp/%s_eti.txt" % namesave
-
-	with codecs.open(fic, 'w', encoding='utf8') as f:
-		for t1, t2, data in g.ordered_edges_iter(data=True):
-			if 'speech' in data:
-				cle = "%s-%s" % (t1.T,t2.T)
-				tab[cle]=[]
-				f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech']))
-
-	out = "dump_clean_%s.etiq" % namesave
-	cmd = "/people/laurent/tvd/monfork/tvd/tvd/algorithms/alignment/align.pl %s %s %s > %s" % (fic, wav, namesave, out)
-	os.system(cmd)
-
-	with codecs.open(out, "rt", encoding='utf8') as f:
-		for line in f:
-			words = line.strip().split()
-			tab[words[1]].append(line)
-
-	lastEnd = 0
-
-	os.remove(out)
-	os.remove(fic)
-	for t1, t2, data in g.ordered_edges_iter(data=True):
-		if 'speech' in data:
-			cle = "%s-%s" % (t1.T,t2.T)
-			speaker = data['speaker']
-			lines = tab[cle]
-			prevNode = t1
-			for line in lines:
-				words = line.strip().split()
-				mot = words[3]
-				m = re.match(r"([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) \(([^ ]+) \- ([^ ]+) \- ([^\)]+)\)", line)
-				if m:
-					mot = m.group(4)
-					start = m.group(5)
-					end = m.group(6)
-					conf = m.group(7)
-					#Je regarde lastEnd (c'est le temps de fin du dernier tour de parole ...)
-					#Ne dois jamais arriver mais bon...
-					if start < lastEnd:
-						start = lastEnd+0.001
-					node_start = TAnchored(float(start))
-					#prevVal = stringVal(prevNode.T)
-					if prevNode.is_anchored and prevNode.T != node_start.T:
-						if node_start.T < prevNode.T:
-							node_start.T = prevNode.T
-						print "Node between %s and %s => no data" % (prevNode, node_start)
-						g.add_annotation(prevNode, node_start, data={})
-					else:
-						if prevNode.is_floating:
-							print "Node between %s and %s => no data" % (prevNode, node_start)
-							g.add_annotation(prevNode, node_start, data={})
-
-					node_end = TAnchored(float(end))
-					if node_end.T <= node_start.T:
-						node_end.T = node_start.T+0.001
-
-					print "Node between %s and %s => data{speech:%s speaker:%s confidence:%s}" % (node_start, node_end, mot, speaker, conf)
-					g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':conf})
-					prevNode = node_end
-				else:
-					#prevVal = stringVal(prevNode.T)
-					if prevNode.is_anchored:
-						node_start = TAnchored(prevNode.T)
-						node_end = TAnchored(prevNode.T+0.001)
-						print "Node between %s and %s => data{speech:%s speaker:%s confidence:0.950}" % (node_start, node_end, mot, speaker)
-						g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':'0.950'})
-						prevNode = node_end
-			if prevNode.is_anchored:
-				print "Node between %s and %s => no data"%(prevNode,t2)
-				g.add_annotation(prevNode, t2, data={})
-				lastEnd = prevNode.T
-	g.save("%s.json" % namesave)
diff --git a/tvd/algorithms/alignment/ctm.py b/tvd/algorithms/alignment/ctm.py
new file mode 100644
index 0000000..9f05214
--- /dev/null
+++ b/tvd/algorithms/alignment/ctm.py
@@ -0,0 +1,148 @@
+from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
+import re
+import networkx as nx
+
+class CTMAligner(object):
+	"""Merge the word edges of a CTM graph into a manual transcript graph."""
+	def __init__(self, punctuation=True):
+		self.punctuation = punctuation  # False: punctuation is stripped from sentences before matching
+		super(CTMAligner, self).__init__()
+
+	def clean_sentence(self, sentenceWords):
+		"""Return sentenceWords without (...)/[...] asides and with normalised punctuation spacing."""
+		sentenceClean = re.sub(r'\([^\)]+\)','', sentenceWords)
+		sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean)
+		# Punctuation at the very start of the sentence is removed.
+		sentenceClean = re.sub(r'^[\.!,;?":]+','', sentenceClean)
+		# Join two punctuation runs separated by spaces, then drop the spaces in
+		# front of every punctuation run and put one space after it.
+		sentenceClean = re.sub(r'([\.!,;?":]+)[ ]+([\.!,;?":]+)',r'\g<1>\g<2>', sentenceClean)
+		sentenceClean = re.sub(r'[ ]*([\.!,;?":]+)',r'\g<1> ', sentenceClean)
+		sentenceClean = re.sub(r' +',' ', sentenceClean)
+		sentenceClean = sentenceClean.strip()
+		return sentenceClean
+	
+	def merge_ctm_with_manual_transcript(self, ctmGraph, manualTranscriptGraph):
+		"""Copy the word edges of ctmGraph into manualTranscriptGraph.
+
+		Every 'speech' edge of the manual transcript consumes consecutive CTM
+		word edges until the concatenated words equal the cleaned sentence.
+		Returns the relabelled transcript graph, or None (after printing a
+		message) when the CTM nodes run out before a non-empty sentence.
+		"""
+		lastIndexNode=0
+		end = False
+		TFloating.reset()
+		manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes()
+		ctmGraph, map2 = ctmGraph.relabel_floating_nodes()
+		manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes(mapping=map)
+		# NOTE(review): indexing the result assumes topological_sort returns a list -- confirm.
+		nodesWords = nx.topological_sort(ctmGraph)
+		if nodesWords[lastIndexNode] == TStart():
+			lastIndexNode += 1
+
+		# CTM node pair remembered once a sentence has been matched; -1 means "unset".
+		last = -1
+		next = -1
+		first_node = None
+		first = -1  # not read anywhere in this method
+		for t1, t2, data in manualTranscriptGraph.ordered_edges_iter(data=True):
+			if 'speech' in data:
+				sentence = data['speech']
+				speaker = data['speaker']
+				sentenceClean = self.clean_sentence(sentence)
+				if not self.punctuation:
+					sentenceClean = re.sub(r'[\.!,;?":]+','', sentenceClean)
+
+				if sentenceClean != "":
+					sentenceWords = ""
+					if lastIndexNode < len(nodesWords):
+						# Done once, for the first turn whose start is not TStart: link that start to the current CTM node.
+						if first_node is None and t1 != TStart():
+							first_node = t1
+							manualTranscriptGraph.add_annotation(first_node, nodesWords[lastIndexNode])
+
+						node_manual_trs_start = t1
+						node_manual_trs_end = t2
+						node_float = TFloating()  # not used afterwards
+						remainingData = None
+						# Reuse the speech data of the CTM edge remembered after the previous sentence.
+						# NOTE(review): compares nodes with the int 0; relies on how node types order against ints -- confirm.
+						if last > 0 and next > 0:
+							for key in ctmGraph[last][next]:
+								dataWord = ctmGraph[last][next][key]
+								if 'speech' in dataWord:
+									remainingData = dataWord
+									sentenceWords = remainingData['speech']
+									sentenceWords = self.clean_sentence(sentenceWords)
+									last = -1
+									next = -1
+
+						bAlreadyAdded = False
+						if(remainingData is not None):
+							if 'speech' in remainingData:
+								remainingData['speaker']=speaker
+							manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode], data=remainingData)
+							# The remembered word alone is the whole sentence: close the turn here.
+							if sentenceWords == sentenceClean:
+								manualTranscriptGraph.add_annotation(nodesWords[lastIndexNode], node_manual_trs_end)
+								bAlreadyAdded = True
+
+						if not bAlreadyAdded:
+							if not manualTranscriptGraph.has_edge(node_manual_trs_start, nodesWords[lastIndexNode]):
+								manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode])
+
+							node_end = ""
+							previousNode = None
+							# Walk the CTM nodes in topological order until the sentence is rebuilt.
+							while not end and lastIndexNode < len(nodesWords):
+								node = nodesWords[lastIndexNode]
+								for node2 in sorted(ctmGraph.successors(node)):
+									node_start = node
+									node_end = node2
+									# Bridge the gap from the previously copied edge when needed.
+									if previousNode is not None:
+										if not manualTranscriptGraph.has_edge(previousNode, node_start) and previousNode != node_start :
+											manualTranscriptGraph.add_annotation(previousNode, node_start)
+
+									for key in ctmGraph[node][node2]:
+										dataWord = ctmGraph[node][node2][key]
+										if 'speech' in dataWord:
+											dataWord['speaker']=speaker  # writes into the edge data dict obtained from ctmGraph
+										manualTranscriptGraph.add_annotation(node_start, node_end, data=dataWord)
+
+										if 'speech' in dataWord:
+											if sentenceWords == "":
+												sentenceWords = dataWord['speech']
+											else:
+												sentenceWords += " " + dataWord['speech']
+											sentenceWords = self.clean_sentence(sentenceWords)
+									if sentenceWords == sentenceClean:
+										if re.search(r'[\.!,;?":]$', sentenceClean):
+											# Sentence ends with punctuation: look at the node two positions ahead and,
+											# if it is anchored, link node_end to it before the end of the speech turn.
+											lastIndexNode+= 2
+											if lastIndexNode < len(nodesWords):
+												node = nodesWords[lastIndexNode]
+												if node.is_anchored:
+													manualTranscriptGraph.add_annotation(node_end, node)
+													node_end = node
+													lastIndexNode -= 1
+												else:
+													lastIndexNode -= 2
+										end = True
+									previousNode = node_end
+								lastIndexNode+=1
+
+							# Remember the CTM node pair at the current index for the following sentence.
+							if lastIndexNode+1 < len(nodesWords):
+								last = nodesWords[lastIndexNode]
+								next = nodesWords[lastIndexNode+1]
+							lastIndexNode+=1
+							manualTranscriptGraph.add_annotation(node_end, node_manual_trs_end)
+							end = False
+					elif sentenceClean != "":
+						print "Unable to align '%s' !" % (sentenceClean)
+						return None
+
+		return manualTranscriptGraph
diff --git a/tvd/algorithms/alignment/just_vrbs.pl b/tvd/algorithms/alignment/just_vrbs.pl
deleted file mode 100755
index 8809311..0000000
--- a/tvd/algorithms/alignment/just_vrbs.pl
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/perl
-
-if(scalar(@ARGV) != 3){
-	die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n";
-}
-
-$txt = $ARGV[0];
-$wav = $ARGV[1];
-$name = $ARGV[2];
-
-%map = ();
-$map{"‘"}="'";
-$map{"…"}="...";
-$map{"–"}="-";
-$map{"’"}="'";
-
-
-open(OUT, ">/tmp/$name.txt");
-open(TXT, $txt);
-while($ligneC = <TXT>){
-	chomp($ligneC);
-	if($ligneC =~ /^[^#]+#[^#]+#(.*)/){
-		$ligne = $1;
-		print OUT nettoyerLigne($ligne)."\n";	
-	}
-}
-close(TXT);
-close(OUT);
-
-
-$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null";
-system($cmd);
-
-if(-s "/tmp/$name.txt.iso.error" > 0){
-	die "Error during conversion ... see /tmp/$name.txt.iso.error"
-}
-unlink "/tmp/$name.txt.iso.error";
-
-$cmd = "mkdir align";
-system($cmd) if(!-d "align");
-
-$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > /tmp/$name.xml";
-system($cmd);
-
-
-$cmd = "cat /tmp/$name.xml | xml2ctm";
-system($cmd);
-	
-clean();
-
-sub clean{
-	unlink "/tmp/$name.txt";
-	unlink "/tmp/$name.xml";
-}
-
-
-sub virerPonctuation{
-	my $word = $_[0];
-	$word =~ s/,//g;
-    $word =~ s/\.//g;       
-    $word =~ s/\?//g;       
-    $word =~ s/\!//g;
-	$word =~ s/\"//g;    
-	$word =~ s/\://g;
-	$word =~ s/\;//g;
-	return trim($word);
-}
-
-sub trim{
-        $chaine = $_[0];
-        $chaine =~ s/^\s+//g;
-        $chaine =~ s/\s+$//g;
-        return $chaine;
-}
-sub nettoyerLigne{
-        my $ligne=$_[0];
-        foreach(keys %map){
-                $ligne =~ s/$_/$map{$_}/g;
-        }
-        $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g;
-        $ligne =~ s/ +/ /g;
-        $ligne =~ s/\([^\)]+\)//g;
-        $ligne =~ s/\[[^\]]+\]//g;
-        return $ligne;
-}
diff --git a/tvd/parser/._ctm.py b/tvd/parser/._ctm.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cb34bc39a6c490523339ec53b52f2f8b3dc22c
GIT binary patch
literal 4096
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103v0;)8C
z=wPTIpnOz%Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!ns1gE<KvO^%4CF#G
zGK&?8Qj7CTi;`0n((;S46*BWmQu9hO^YapOaw-*aQqxKll5!IBvVnbJsIDPRq52>0
K6&VJ(|Nj9Tl^eGJ

literal 0
HcmV?d00001

diff --git a/tvd/parser/__init__.py b/tvd/parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tvd/parser/ctm.py b/tvd/parser/ctm.py
new file mode 100644
index 0000000..a25d0e1
--- /dev/null
+++ b/tvd/parser/ctm.py
@@ -0,0 +1,101 @@
+
+from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd
+import codecs
+import re
+
+class CTMParser(object):
+	"""Build an AnnotationGraph from the word entries of a CTM file."""
+	def __init__(self, punctuation=True):
+		self.punctuation = punctuation  # False: punctuation marks are blanked out of each word
+		super(CTMParser, self).__init__()
+
+	def get_graph(self, path2ctm):
+		"""Parse the CTM file at path2ctm into an AnnotationGraph.
+
+		Lines starting with ';;' are skipped. The whitespace-separated fields at
+		indices 2-5 are read as start, duration, word and confidence. Timed words
+		become edges between anchored nodes; words whose duration rounds to 0 are
+		attached through floating nodes. The walk starts from TStart() and the
+		last node reached is finally linked to TEnd().
+		"""
+		g = AnnotationGraph()
+		TFloating.reset()
+		previousNode = TStart()
+		# arc: nodes of the last timed edge plus the floating nodes threaded into it.
+		arc = []
+		arc_data = None
+
+		with codecs.open(path2ctm, "rt", encoding='utf8') as f:
+			for line in f:
+				if re.search(r'^;;', line):
+					continue
+				fields = line.strip().split()
+				start = round(float(fields[2]), 3)
+				duration = round(float(fields[3]), 3)
+				end = round(float(start)+float(duration), 3)
+				word = fields[4]
+				if not self.punctuation:
+					word = re.sub(r'[\.!,;?":]+',' ', word)
+					word = re.sub(r' +',' ', word)
+				if word == "" or word == ' ':
+					continue
+
+				confidence = fields[5]
+				# A duration that rounds to 0 cannot get its own anchored edge.
+				if duration == 0:
+					previousNode = self._add_untimed_word(g, arc, arc_data, previousNode, word, confidence)
+					continue
+
+				node_start = TAnchored(start)
+				node_end = TAnchored(end)
+				if previousNode.is_floating:
+					if not g.has_edge(previousNode, node_start):
+						g.add_annotation(previousNode, node_start)
+				elif node_start.T < previousNode.T:
+					# Overlap with the previous word: start where that word ended.
+					node_start = previousNode
+				elif node_start.T > previousNode.T:
+					g.add_annotation(previousNode, node_start)
+
+				timed = node_start.is_anchored and node_end.is_anchored
+				if timed and node_start.T == node_end.T:
+					# Clamping left no duration: handle the word like an untimed one.
+					previousNode = self._add_untimed_word(g, arc, arc_data, previousNode, word, confidence)
+					continue
+				if timed:
+					arc = [node_start, node_end]
+					arc_data = {'speech':word, 'confidence':confidence}
+				g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence})
+				previousNode = node_end
+
+		g.add_annotation(previousNode, TEnd())
+		return g
+
+	def _add_untimed_word(self, g, arc, arc_data, previousNode, word, confidence):
+		"""Insert a word that has no usable duration; return the node to continue from.
+
+		arc (node list of the last timed edge) is extended in place.
+		"""
+		data = {'speech':word, 'confidence':confidence}
+		node_end = TFloating()
+		if len(arc) == 2:
+			# First untimed word after a timed edge: split that edge in two.
+			g.remove_edge(arc[0], arc[1])
+			g.add_annotation(arc[0], node_end, arc_data)
+			node_inter = TFloating()
+			g.add_annotation(node_end, node_inter, data=data)
+			g.add_annotation(node_inter, arc[1])
+			arc.append(node_end)
+			arc.append(node_inter)
+			return arc[1]
+		if len(arc) > 2:
+			# Further untimed words are chained just before the timed edge's end node.
+			g.remove_edge(arc[-1], arc[1])
+			g.add_annotation(arc[-1], node_end, data=data)
+			g.add_annotation(node_end, arc[1])
+			arc.append(node_end)
+			return arc[1]
+		# No timed edge seen yet: keep the word on an edge from previousNode to a
+		# new floating node so it is not lost and the graph stays connected.
+		g.add_annotation(previousNode, node_end, data=data)
+		return node_end

From 4cec6ad0ed2e5e58478d7213fe16bcc2b6beb6c1 Mon Sep 17 00:00:00 2001
From: antoine laurent <laurent@hp801.limsi.fr>
Date: Thu, 1 May 2014 15:48:24 +0200
Subject: [PATCH 3/3] Removed align.pl

---
 tvd/algorithms/alignment/align.pl | 166 ------------------------------
 1 file changed, 166 deletions(-)
 delete mode 100755 tvd/algorithms/alignment/align.pl

diff --git a/tvd/algorithms/alignment/align.pl b/tvd/algorithms/alignment/align.pl
deleted file mode 100755
index 9396258..0000000
--- a/tvd/algorithms/alignment/align.pl
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/perl
-
-if(scalar(@ARGV) != 3){
-	die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n";
-}
-
-$txt = $ARGV[0];
-$wav = $ARGV[1];
-$name = $ARGV[2];
-
-%map = ();
-$map{"‘"}="'";
-$map{"…"}="...";
-$map{"–"}="-";
-$map{"’"}="'";
-
-
-open(OUT, ">/tmp/$name.txt");
-open(TXT, $txt);
-while($ligneC = <TXT>){
-	chomp($ligneC);
-	if($ligneC =~ /^[^#]+#[^#]+#(.*)/){
-		$ligne = $1;
-		print OUT nettoyerLigne($ligne)."\n";	
-	}
-}
-close(TXT);
-close(OUT);
-
-
-$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null";
-system($cmd);
-
-if(-s "/tmp/$name.txt.iso.error" > 0){
-	die "Error during conversion ... see /tmp/$name.txt.iso.error"
-}
-unlink "/tmp/$name.txt.iso.error";
-
-$cmd = "mkdir align";
-system($cmd) if(!-d "align");
-
-$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > align/$name.xml";
-system($cmd) if(!-e "align/$name.xml");
-
-
-$cmd = "cat align/$name.xml | xml2ctm > /tmp/text.ctm";
-system($cmd);
-
-$ficDump = $txt;
-$ficCTM = "/tmp/text.ctm";
-
-$cmd = "cat $ficCTM";
-@ctm = `$cmd`;
-
-$indCtm = 0;
-
-open(FIC, $ficDump);
-while($ligne = <FIC>){
-	chomp($ligne);
-	
-	@infos = split(/#/, $ligne);
-	$nodes = $infos[0];
-	$spk = $infos[1];
-
-	$spk =~ s/ +/_/g;
-
-	$txt = $infos[2];
-	
-	$txt = nettoyerLigne($txt);
-	$txt = trim($txt);
-
-	#$txt =~ s/([^\.])\./$1 \./g;
-	#$txt =~ s/([^\,])\,/$1 \,/g;
-	#$txt =~ s/([^\?])\?/$1 \?/g;
-	#$txt =~ s/([^\!])\!/$1 \!/g;
-
-	@words = split(/ +/, $txt);
-
-	foreach(@words){
-		$word = $_;
-		$original = $word;
-		$end = 0;
-		while(!$end){
-			$ctmL = $ctm[$indCtm];
-			@iCtm = split(/ /, $ctmL);
-			$wCtm = $iCtm[4];
-			$wCtm = virerPonctuation($wCtm);
-			if($wCtm ne ""){
-				$end=1;
-			}else{
-				$indCtm++;
-			}
-			$end = 1 if($indCtm == scalar(@ctm));
-		}
-
-		
-		$word = virerPonctuation($word);	
-
-		@iCtm = split(/ /, $ctmL);
-		$show = $iCtm[0];
-		$start = $iCtm[2];
-		$duree = $iCtm[3];
-		$score = $iCtm[5];
-		$end = sprintf("%.3f", $start+$duree);
-
-		$wCtm = $iCtm[4];
-		$wCtm = virerPonctuation($wCtm);	
-		
-
-		if($word ne ""){
-			
-			if($word eq $wCtm){
-				print "thrones_s01e01 $nodes $spk $original ($start - $end - $score)\n";
-			}else{
-				die "==> ". $word . " ne ".$wCtm."\n";
-				
-			}
-
-			$indCtm++;
-		}else{
-			print "thrones_s01e01 $nodes $spk $original\n" if($original ne "");
-		}
-	}
-
-	
-	
-}
-close(FIC);
-	
-clean();
-
-sub clean{
-	unlink "/tmp/$name.txt";
-	unlink "/tmp/text.ctm";
-}
-
-
-sub virerPonctuation{
-	my $word = $_[0];
-	$word =~ s/,//g;
-    $word =~ s/\.//g;       
-    $word =~ s/\?//g;       
-    $word =~ s/\!//g;
-	$word =~ s/\"//g;    
-	$word =~ s/\://g;
-	$word =~ s/\;//g;
-	return trim($word);
-}
-
-sub trim{
-        $chaine = $_[0];
-        $chaine =~ s/^\s+//g;
-        $chaine =~ s/\s+$//g;
-        return $chaine;
-}
-sub nettoyerLigne{
-        my $ligne=$_[0];
-        foreach(keys %map){
-                $ligne =~ s/$_/$map{$_}/g;
-        }
-        $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g;
-        $ligne =~ s/ +/ /g;
-        $ligne =~ s/\([^\)]+\)//g;
-        $ligne =~ s/\[[^\]]+\]//g;
-        return $ligne;
-}