From e2b1905272dac83dc7ad9694906598569cc7a98c Mon Sep 17 00:00:00 2001 From: antoine laurent Date: Thu, 24 Apr 2014 17:55:00 +0200 Subject: [PATCH 1/3] word alignment module --- tvd/algorithms/alignment/__init__.py | 0 tvd/algorithms/alignment/align.pl | 166 ++++++++++++++++++++++++++ tvd/algorithms/alignment/antoine.py | 157 ++++++++++++++++++++++++ tvd/algorithms/alignment/just_vrbs.pl | 85 +++++++++++++ 4 files changed, 408 insertions(+) create mode 100644 tvd/algorithms/alignment/__init__.py create mode 100755 tvd/algorithms/alignment/align.pl create mode 100644 tvd/algorithms/alignment/antoine.py create mode 100755 tvd/algorithms/alignment/just_vrbs.pl diff --git a/tvd/algorithms/alignment/__init__.py b/tvd/algorithms/alignment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tvd/algorithms/alignment/align.pl b/tvd/algorithms/alignment/align.pl new file mode 100755 index 0000000..9396258 --- /dev/null +++ b/tvd/algorithms/alignment/align.pl @@ -0,0 +1,166 @@ +#!/usr/bin/perl + +if(scalar(@ARGV) != 3){ + die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n"; +} + +$txt = $ARGV[0]; +$wav = $ARGV[1]; +$name = $ARGV[2]; + +%map = (); +$map{"‘"}="'"; +$map{"…"}="..."; +$map{"–"}="-"; +$map{"’"}="'"; + + +open(OUT, ">/tmp/$name.txt"); +open(TXT, $txt); +while($ligneC = ){ + chomp($ligneC); + if($ligneC =~ /^[^#]+#[^#]+#(.*)/){ + $ligne = $1; + print OUT nettoyerLigne($ligne)."\n"; + } +} +close(TXT); +close(OUT); + + +$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null"; +system($cmd); + +if(-s "/tmp/$name.txt.iso.error" > 0){ + die "Error during conversion ... see /tmp/$name.txt.iso.error" +} +unlink "/tmp/$name.txt.iso.error"; + +$cmd = "mkdir align"; +system($cmd) if(!-d "align"); + +$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > align/$name.xml"; +system($cmd) if(!-e "align/$name.xml"); + + +$cmd = "cat align/$name.xml | xml2ctm > /tmp/text.ctm"; +system($cmd); + +$ficDump = $txt; +$ficCTM = "/tmp/text.ctm"; + +$cmd = "cat $ficCTM"; +@ctm = `$cmd`; + +$indCtm = 0; + +open(FIC, $ficDump); +while($ligne = ){ + chomp($ligne); + + @infos = split(/#/, $ligne); + $nodes = $infos[0]; + $spk = $infos[1]; + + $spk =~ s/ +/_/g; + + $txt = $infos[2]; + + $txt = nettoyerLigne($txt); + $txt = trim($txt); + + #$txt =~ s/([^\.])\./$1 \./g; + #$txt =~ s/([^\,])\,/$1 \,/g; + #$txt =~ s/([^\?])\?/$1 \?/g; + #$txt =~ s/([^\!])\!/$1 \!/g; + + @words = split(/ +/, $txt); + + foreach(@words){ + $word = $_; + $original = $word; + $end = 0; + while(!$end){ + $ctmL = $ctm[$indCtm]; + @iCtm = split(/ /, $ctmL); + $wCtm = $iCtm[4]; + $wCtm = virerPonctuation($wCtm); + if($wCtm ne ""){ + $end=1; + }else{ + $indCtm++; + } + $end = 1 if($indCtm == scalar(@ctm)); + } + + + $word = virerPonctuation($word); + + @iCtm = split(/ /, $ctmL); + $show = $iCtm[0]; + $start = $iCtm[2]; + $duree = $iCtm[3]; + $score = $iCtm[5]; + $end = sprintf("%.3f", $start+$duree); + + $wCtm = $iCtm[4]; + $wCtm = virerPonctuation($wCtm); + + + if($word ne ""){ + + if($word eq $wCtm){ + print "thrones_s01e01 $nodes $spk $original ($start - $end - $score)\n"; + }else{ + die "==> ". $word . " ne ".$wCtm."\n"; + + } + + $indCtm++; + }else{ + print "thrones_s01e01 $nodes $spk $original\n" if($original ne ""); + } + } + + + +} +close(FIC); + +clean(); + +sub clean{ + unlink "/tmp/$name.txt"; + unlink "/tmp/text.ctm"; +} + + +sub virerPonctuation{ + my $word = $_[0]; + $word =~ s/,//g; + $word =~ s/\.//g; + $word =~ s/\?//g; + $word =~ s/\!//g; + $word =~ s/\"//g; + $word =~ s/\://g; + $word =~ s/\;//g; + return trim($word); +} + +sub trim{ + $chaine = $_[0]; + $chaine =~ s/^\s+//g; + $chaine =~ s/\s+$//g; + return $chaine; +} +sub nettoyerLigne{ + my $ligne=$_[0]; + foreach(keys %map){ + $ligne =~ s/$_/$map{$_}/g; + } + $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g; + $ligne =~ s/ +/ /g; + $ligne =~ s/\([^\)]+\)//g; + $ligne =~ s/\[[^\]]+\]//g; + return $ligne; +} diff --git a/tvd/algorithms/alignment/antoine.py b/tvd/algorithms/alignment/antoine.py new file mode 100644 index 0000000..c043275 --- /dev/null +++ b/tvd/algorithms/alignment/antoine.py @@ -0,0 +1,157 @@ +from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd +import codecs +import re +import os +import os.path +import tempfile + +def makeCtm(g, wav, output_ctm, path_to_just_vrbs): + tab = {} + fic = tempfile.NamedTemporaryFile().name + namesave = fic.replace("/", "_") + + with codecs.open(fic, 'w', encoding='utf8') as f: + for t1, t2, data in g.ordered_edges_iter(data=True): + if 'speech' in data: + cle = "%s-%s" % (t1.T,t2.T) + tab[cle]=[] + f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech'])) + cmd = "%s/just_vrbs.pl %s %s %s > %s" % (path_to_just_vrbs, fic, wav, namesave, output_ctm) + os.system(cmd) + +def makeWordGraphFromCtm(ctm, keep_punctuation=False): + g = AnnotationGraph() + TFloating.reset() + previousNode = TFloating() + + with codecs.open(ctm, "rt", encoding='utf8') as f: + for line in f: + fields = line.strip().split() + start = round(float(fields[2]), 3) + duration = round(float(fields[3]), 3) + end = float(start)+float(duration) + + end = round(end, 3) + #end = "%.3f" % round(end,3) + word = fields[4] + + if not keep_punctuation: + word = re.sub('[.!,;?"]', '', word) + + if word != "": + confidence = fields[5] + if duration == 0: + node_start = previousNode + node_end = TFloating() + g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence}) + else: + node_start = TAnchored(start) + node_end = TAnchored(end) + if previousNode.is_floating: + g.add_annotation(previousNode, node_start) + else: + if node_start.T < previousNode.T: + node_start = previousNode + elif node_start.T > previousNode.T: + g.add_annotation(previousNode, node_start) + g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence}) + previousNode=node_end + return g + + +def mergeGraphs(transcriptGraph, wordsGraph): + + for t1, t2, data in transcriptGraph.ordered_edges_iter(data=True): + if 'speech' in data: + sentence = data['speech'] + speaker = data['speaker'] + print sentence + sentenceClean = re.sub(r'\([^\)]+\)','', sentence) + sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean) + sentenceClean = re.sub(r' +',' ', sentenceClean) + sentenceClean = re.sub('[.!,;?"]','', sentenceClean) + print sentenceClean + + + +def alignTranscript(g, wav, namesave): + tab = {} + + # with codecs.open('/tmp/dump.txt', 'w', encoding='utf8') as f: + # for t1, t2, data in g.ordered_edges_iter(data=True): + # if 'speech' in data: + # f.write("%s\n" % (data['speech'])) + + fic = "/tmp/%s_eti.txt" % namesave + + with codecs.open(fic, 'w', encoding='utf8') as f: + for t1, t2, data in g.ordered_edges_iter(data=True): + if 'speech' in data: + cle = "%s-%s" % (t1.T,t2.T) + tab[cle]=[] + f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech'])) + + out = "dump_clean_%s.etiq" % namesave + cmd = "/people/laurent/tvd/monfork/tvd/tvd/algorithms/alignment/align.pl %s %s %s > %s" % (fic, wav, namesave, out) + os.system(cmd) + + with codecs.open(out, "rt", encoding='utf8') as f: + for line in f: + words = line.strip().split() + tab[words[1]].append(line) + + lastEnd = 0 + + os.remove(out) + os.remove(fic) + for t1, t2, data in g.ordered_edges_iter(data=True): + if 'speech' in data: + cle = "%s-%s" % (t1.T,t2.T) + speaker = data['speaker'] + lines = tab[cle] + prevNode = t1 + for line in lines: + words = line.strip().split() + mot = words[3] + m = re.match(r"([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) \(([^ ]+) \- ([^ ]+) \- ([^\)]+)\)", line) + if m: + mot = m.group(4) + start = m.group(5) + end = m.group(6) + conf = m.group(7) + #Je regarde lastEnd (c'est le temps de fin du dernier tour de parole ...) + #Ne dois jamais arriver mais bon... + if start < lastEnd: + start = lastEnd+0.001 + node_start = TAnchored(float(start)) + #prevVal = stringVal(prevNode.T) + if prevNode.is_anchored and prevNode.T != node_start.T: + if node_start.T < prevNode.T: + node_start.T = prevNode.T + print "Node between %s and %s => no data" % (prevNode, node_start) + g.add_annotation(prevNode, node_start, data={}) + else: + if prevNode.is_floating: + print "Node between %s and %s => no data" % (prevNode, node_start) + g.add_annotation(prevNode, node_start, data={}) + + node_end = TAnchored(float(end)) + if node_end.T <= node_start.T: + node_end.T = node_start.T+0.001 + + print "Node between %s and %s => data{speech:%s speaker:%s confidence:%s}" % (node_start, node_end, mot, speaker, conf) + g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':conf}) + prevNode = node_end + else: + #prevVal = stringVal(prevNode.T) + if prevNode.is_anchored: + node_start = TAnchored(prevNode.T) + node_end = TAnchored(prevNode.T+0.001) + print "Node between %s and %s => data{speech:%s speaker:%s confidence:0.950}" % (node_start, node_end, mot, speaker) + g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':'0.950'}) + prevNode = node_end + if prevNode.is_anchored: + print "Node between %s and %s => no data"%(prevNode,t2) + g.add_annotation(prevNode, t2, data={}) + lastEnd = prevNode.T + g.save("%s.json" % namesave) diff --git a/tvd/algorithms/alignment/just_vrbs.pl b/tvd/algorithms/alignment/just_vrbs.pl new file mode 100755 index 0000000..8809311 --- /dev/null +++ b/tvd/algorithms/alignment/just_vrbs.pl @@ -0,0 +1,85 @@ +#!/usr/bin/perl + +if(scalar(@ARGV) != 3){ + die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n"; +} + +$txt = $ARGV[0]; +$wav = $ARGV[1]; +$name = $ARGV[2]; + +%map = (); +$map{"‘"}="'"; +$map{"…"}="..."; +$map{"–"}="-"; +$map{"’"}="'"; + + +open(OUT, ">/tmp/$name.txt"); +open(TXT, $txt); +while($ligneC = ){ + chomp($ligneC); + if($ligneC =~ /^[^#]+#[^#]+#(.*)/){ + $ligne = $1; + print OUT nettoyerLigne($ligne)."\n"; + } +} +close(TXT); +close(OUT); + + +$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null"; +system($cmd); + +if(-s "/tmp/$name.txt.iso.error" > 0){ + die "Error during conversion ... see /tmp/$name.txt.iso.error" +} +unlink "/tmp/$name.txt.iso.error"; + +$cmd = "mkdir align"; +system($cmd) if(!-d "align"); + +$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > /tmp/$name.xml"; +system($cmd); + + +$cmd = "cat /tmp/$name.xml | xml2ctm"; +system($cmd); + +clean(); + +sub clean{ + unlink "/tmp/$name.txt"; + unlink "/tmp/$name.xml"; +} + + +sub virerPonctuation{ + my $word = $_[0]; + $word =~ s/,//g; + $word =~ s/\.//g; + $word =~ s/\?//g; + $word =~ s/\!//g; + $word =~ s/\"//g; + $word =~ s/\://g; + $word =~ s/\;//g; + return trim($word); +} + +sub trim{ + $chaine = $_[0]; + $chaine =~ s/^\s+//g; + $chaine =~ s/\s+$//g; + return $chaine; +} +sub nettoyerLigne{ + my $ligne=$_[0]; + foreach(keys %map){ + $ligne =~ s/$_/$map{$_}/g; + } + $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g; + $ligne =~ s/ +/ /g; + $ligne =~ s/\([^\)]+\)//g; + $ligne =~ s/\[[^\]]+\]//g; + return $ligne; +} From e03731a3df249ae134e8f783bdbe9af79ce3eb65 Mon Sep 17 00:00:00 2001 From: antoine laurent Date: Thu, 1 May 2014 15:42:51 +0200 Subject: [PATCH 2/3] CTM Aligner and CTM Parser (need to add comments) --- tvd/algorithms/__init__.py | 0 tvd/algorithms/alignment/._ctm.py | Bin 0 -> 4096 bytes tvd/algorithms/alignment/antoine.py | 157 -------------------------- tvd/algorithms/alignment/ctm.py | 148 ++++++++++++++++++++++++ tvd/algorithms/alignment/just_vrbs.pl | 85 -------------- tvd/parser/._ctm.py | Bin 0 -> 4096 bytes tvd/parser/__init__.py | 0 tvd/parser/ctm.py | 101 +++++++++++++++++ 8 files changed, 249 insertions(+), 242 deletions(-) create mode 100644 tvd/algorithms/__init__.py create mode 100644 tvd/algorithms/alignment/._ctm.py delete mode 100644 tvd/algorithms/alignment/antoine.py create mode 100644 tvd/algorithms/alignment/ctm.py delete mode 100755 tvd/algorithms/alignment/just_vrbs.pl create mode 100644 tvd/parser/._ctm.py create mode 100644 tvd/parser/__init__.py create mode 100644 tvd/parser/ctm.py diff --git a/tvd/algorithms/__init__.py b/tvd/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tvd/algorithms/alignment/._ctm.py b/tvd/algorithms/alignment/._ctm.py new file mode 100644 index 0000000000000000000000000000000000000000..192580fed2d9aab9b11086c241612798ce343104 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103v0;)8J z=wPTIpnOz%Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!ns1gE0 K6&VJ(|NjAs7#qF- literal 0 HcmV?d00001 diff --git a/tvd/algorithms/alignment/antoine.py b/tvd/algorithms/alignment/antoine.py deleted file mode 100644 index c043275..0000000 --- a/tvd/algorithms/alignment/antoine.py +++ /dev/null @@ -1,157 +0,0 @@ -from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd -import codecs -import re -import os -import os.path -import tempfile - -def makeCtm(g, wav, output_ctm, path_to_just_vrbs): - tab = {} - fic = tempfile.NamedTemporaryFile().name - namesave = fic.replace("/", "_") - - with codecs.open(fic, 'w', encoding='utf8') as f: - for t1, t2, data in g.ordered_edges_iter(data=True): - if 'speech' in data: - cle = "%s-%s" % (t1.T,t2.T) - tab[cle]=[] - f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech'])) - cmd = "%s/just_vrbs.pl %s %s %s > %s" % (path_to_just_vrbs, fic, wav, namesave, output_ctm) - os.system(cmd) - -def makeWordGraphFromCtm(ctm, keep_punctuation=False): - g = AnnotationGraph() - TFloating.reset() - previousNode = TFloating() - - with codecs.open(ctm, "rt", encoding='utf8') as f: - for line in f: - fields = line.strip().split() - start = round(float(fields[2]), 3) - duration = round(float(fields[3]), 3) - end = float(start)+float(duration) - - end = round(end, 3) - #end = "%.3f" % round(end,3) - word = fields[4] - - if not keep_punctuation: - word = re.sub('[.!,;?"]', '', word) - - if word != "": - confidence = fields[5] - if duration == 0: - node_start = previousNode - node_end = TFloating() - g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence}) - else: - node_start = TAnchored(start) - node_end = TAnchored(end) - if previousNode.is_floating: - g.add_annotation(previousNode, node_start) - else: - if node_start.T < previousNode.T: - node_start = previousNode - elif node_start.T > previousNode.T: - g.add_annotation(previousNode, node_start) - g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence}) - previousNode=node_end - return g - - -def mergeGraphs(transcriptGraph, wordsGraph): - - for t1, t2, data in transcriptGraph.ordered_edges_iter(data=True): - if 'speech' in data: - sentence = data['speech'] - speaker = data['speaker'] - print sentence - sentenceClean = re.sub(r'\([^\)]+\)','', sentence) - sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean) - sentenceClean = re.sub(r' +',' ', sentenceClean) - sentenceClean = re.sub('[.!,;?"]','', sentenceClean) - print sentenceClean - - - -def alignTranscript(g, wav, namesave): - tab = {} - - # with codecs.open('/tmp/dump.txt', 'w', encoding='utf8') as f: - # for t1, t2, data in g.ordered_edges_iter(data=True): - # if 'speech' in data: - # f.write("%s\n" % (data['speech'])) - - fic = "/tmp/%s_eti.txt" % namesave - - with codecs.open(fic, 'w', encoding='utf8') as f: - for t1, t2, data in g.ordered_edges_iter(data=True): - if 'speech' in data: - cle = "%s-%s" % (t1.T,t2.T) - tab[cle]=[] - f.write("%s-%s#%s#%s\n" % (t1.T,t2.T,data['speaker'],data['speech'])) - - out = "dump_clean_%s.etiq" % namesave - cmd = "/people/laurent/tvd/monfork/tvd/tvd/algorithms/alignment/align.pl %s %s %s > %s" % (fic, wav, namesave, out) - os.system(cmd) - - with codecs.open(out, "rt", encoding='utf8') as f: - for line in f: - words = line.strip().split() - tab[words[1]].append(line) - - lastEnd = 0 - - os.remove(out) - os.remove(fic) - for t1, t2, data in g.ordered_edges_iter(data=True): - if 'speech' in data: - cle = "%s-%s" % (t1.T,t2.T) - speaker = data['speaker'] - lines = tab[cle] - prevNode = t1 - for line in lines: - words = line.strip().split() - mot = words[3] - m = re.match(r"([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) \(([^ ]+) \- ([^ ]+) \- ([^\)]+)\)", line) - if m: - mot = m.group(4) - start = m.group(5) - end = m.group(6) - conf = m.group(7) - #Je regarde lastEnd (c'est le temps de fin du dernier tour de parole ...) - #Ne dois jamais arriver mais bon... - if start < lastEnd: - start = lastEnd+0.001 - node_start = TAnchored(float(start)) - #prevVal = stringVal(prevNode.T) - if prevNode.is_anchored and prevNode.T != node_start.T: - if node_start.T < prevNode.T: - node_start.T = prevNode.T - print "Node between %s and %s => no data" % (prevNode, node_start) - g.add_annotation(prevNode, node_start, data={}) - else: - if prevNode.is_floating: - print "Node between %s and %s => no data" % (prevNode, node_start) - g.add_annotation(prevNode, node_start, data={}) - - node_end = TAnchored(float(end)) - if node_end.T <= node_start.T: - node_end.T = node_start.T+0.001 - - print "Node between %s and %s => data{speech:%s speaker:%s confidence:%s}" % (node_start, node_end, mot, speaker, conf) - g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':conf}) - prevNode = node_end - else: - #prevVal = stringVal(prevNode.T) - if prevNode.is_anchored: - node_start = TAnchored(prevNode.T) - node_end = TAnchored(prevNode.T+0.001) - print "Node between %s and %s => data{speech:%s speaker:%s confidence:0.950}" % (node_start, node_end, mot, speaker) - g.add_annotation(node_start, node_end, data={'speech':mot, 'speaker':speaker, 'confidence':'0.950'}) - prevNode = node_end - if prevNode.is_anchored: - print "Node between %s and %s => no data"%(prevNode,t2) - g.add_annotation(prevNode, t2, data={}) - lastEnd = prevNode.T - g.save("%s.json" % namesave) diff --git a/tvd/algorithms/alignment/ctm.py b/tvd/algorithms/alignment/ctm.py new file mode 100644 index 0000000..9f05214 --- /dev/null +++ b/tvd/algorithms/alignment/ctm.py @@ -0,0 +1,148 @@ +from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd +import re +import networkx as nx + +class CTMAligner(object): + """docstring for CTMParser""" + def __init__(self, punctuation=True): + super(CTMAligner, self).__init__() + self.punctuation = punctuation + + def clean_sentence(self, sentenceWords): + sentenceClean = re.sub(r'\([^\)]+\)','', sentenceWords) + sentenceClean = re.sub(r'\[[^\]]+\]','', sentenceClean) + #sentenceClean = re.sub('[.!,;?":]','', sentenceClean) + + sentenceClean = re.sub(r'^[\.!,;?":]+','', sentenceClean) + + sentenceClean = re.sub(r'([\.!,;?":]+)[ ]+([\.!,;?":]+)','\g<1>\g<2>', sentenceClean) + sentenceClean = re.sub(r'[ ]*([\.!,;?":]+)','\g<1> ', sentenceClean) + + sentenceClean = re.sub(r' +',' ', sentenceClean) + sentenceClean = sentenceClean.strip() + return sentenceClean + + def merge_ctm_with_manual_transcript(self, ctmGraph, manualTranscriptGraph): + lastIndexNode=0 + + end = False + + TFloating.reset() + manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes() + ctmGraph, map2 = ctmGraph.relabel_floating_nodes() + + manualTranscriptGraph, map = manualTranscriptGraph.relabel_floating_nodes(mapping=map) + + nodesWords = nx.topological_sort(ctmGraph) + if nodesWords[lastIndexNode] == TStart(): + lastIndexNode += 1 + + last = -1 + next = -1 + + first_node = None + + first = -1 + for t1, t2, data in manualTranscriptGraph.ordered_edges_iter(data=True): + if 'speech' in data: + sentence = data['speech'] + speaker = data['speaker'] + sentenceClean = self.clean_sentence(sentence) + if not self.punctuation: + sentenceClean = re.sub(r'[\.!,;?":]+','', sentenceClean) + + if sentenceClean != "": + + sentenceWords = "" + + if lastIndexNode < len(nodesWords): + + if first_node is None and t1 != TStart(): + first_node = t1 + manualTranscriptGraph.add_annotation(first_node, nodesWords[lastIndexNode]) + + node_manual_trs_start = t1 + node_manual_trs_end = t2 + + node_float = TFloating() + remainingData = None + if last > 0 and next > 0: + for key in ctmGraph[last][next]: + dataWord = ctmGraph[last][next][key] + if 'speech' in dataWord: + remainingData = dataWord + sentenceWords = remainingData['speech'] + sentenceWords = self.clean_sentence(sentenceWords) + last = -1 + next = -1 + + bAlreadyAdded = False + + if(remainingData is not None): + if 'speech' in remainingData: + remainingData['speaker']=speaker + manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode], data=remainingData) + if sentenceWords == sentenceClean: + manualTranscriptGraph.add_annotation(nodesWords[lastIndexNode], node_manual_trs_end) + bAlreadyAdded = True + + if not bAlreadyAdded: + if not manualTranscriptGraph.has_edge(node_manual_trs_start, nodesWords[lastIndexNode]): + manualTranscriptGraph.add_annotation(node_manual_trs_start, nodesWords[lastIndexNode]) + + node_end = "" + previousNode = None + while not end and lastIndexNode < len(nodesWords): + node = nodesWords[lastIndexNode] + for node2 in sorted(ctmGraph.successors(node)): + + node_start = node + node_end = node2 + + if previousNode is not None: + if not manualTranscriptGraph.has_edge(previousNode, node_start) and previousNode != node_start : + manualTranscriptGraph.add_annotation(previousNode, node_start) + + for key in ctmGraph[node][node2]: + dataWord = ctmGraph[node][node2][key] + if 'speech' in dataWord: + dataWord['speaker']=speaker + manualTranscriptGraph.add_annotation(node_start, node_end, data=dataWord) + + if 'speech' in dataWord: + if sentenceWords == "": + sentenceWords = dataWord['speech'] + else: + sentenceWords += " " + dataWord['speech'] + sentenceWords = self.clean_sentence(sentenceWords) + if sentenceWords == sentenceClean: + if re.search(r'[\.!,;?":]$', sentenceClean): + #Have to add the next anchored just before the end of the speech turn ... + lastIndexNode+= 2 + if lastIndexNode < len(nodesWords): + node = nodesWords[lastIndexNode] + if node.is_anchored: + manualTranscriptGraph.add_annotation(node_end, node) + node_end = node + lastIndexNode -= 1 + else: + lastIndexNode -= 2 + end = True + previousNode = node_end + lastIndexNode+=1 + + if lastIndexNode+1 < len(nodesWords): + last = nodesWords[lastIndexNode] + next = nodesWords[lastIndexNode+1] + + #print "%s -> %s" % (node_end, node_manual_trs_end) + lastIndexNode+=1 + + manualTranscriptGraph.add_annotation(node_end, node_manual_trs_end) + + end = False + elif sentenceClean != "": + print "Unable to align '%s' !" % (sentenceClean) + return None + + return manualTranscriptGraph diff --git a/tvd/algorithms/alignment/just_vrbs.pl b/tvd/algorithms/alignment/just_vrbs.pl deleted file mode 100755 index 8809311..0000000 --- a/tvd/algorithms/alignment/just_vrbs.pl +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/perl - -if(scalar(@ARGV) != 3){ - die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n"; -} - -$txt = $ARGV[0]; -$wav = $ARGV[1]; -$name = $ARGV[2]; - -%map = (); -$map{"‘"}="'"; -$map{"…"}="..."; -$map{"–"}="-"; -$map{"’"}="'"; - - -open(OUT, ">/tmp/$name.txt"); -open(TXT, $txt); -while($ligneC = ){ - chomp($ligneC); - if($ligneC =~ /^[^#]+#[^#]+#(.*)/){ - $ligne = $1; - print OUT nettoyerLigne($ligne)."\n"; - } -} -close(TXT); -close(OUT); - - -$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null"; -system($cmd); - -if(-s "/tmp/$name.txt.iso.error" > 0){ - die "Error during conversion ... see /tmp/$name.txt.iso.error" -} -unlink "/tmp/$name.txt.iso.error"; - -$cmd = "mkdir align"; -system($cmd) if(!-d "align"); - -$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > /tmp/$name.xml"; -system($cmd); - - -$cmd = "cat /tmp/$name.xml | xml2ctm"; -system($cmd); - -clean(); - -sub clean{ - unlink "/tmp/$name.txt"; - unlink "/tmp/$name.xml"; -} - - -sub virerPonctuation{ - my $word = $_[0]; - $word =~ s/,//g; - $word =~ s/\.//g; - $word =~ s/\?//g; - $word =~ s/\!//g; - $word =~ s/\"//g; - $word =~ s/\://g; - $word =~ s/\;//g; - return trim($word); -} - -sub trim{ - $chaine = $_[0]; - $chaine =~ s/^\s+//g; - $chaine =~ s/\s+$//g; - return $chaine; -} -sub nettoyerLigne{ - my $ligne=$_[0]; - foreach(keys %map){ - $ligne =~ s/$_/$map{$_}/g; - } - $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g; - $ligne =~ s/ +/ /g; - $ligne =~ s/\([^\)]+\)//g; - $ligne =~ s/\[[^\]]+\]//g; - return $ligne; -} diff --git a/tvd/parser/._ctm.py b/tvd/parser/._ctm.py new file mode 100644 index 0000000000000000000000000000000000000000..24cb34bc39a6c490523339ec53b52f2f8b3dc22c GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103v0;)8C z=wPTIpnOz%Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!ns1gE0 K6&VJ(|Nj9Tl^eGJ literal 0 HcmV?d00001 diff --git a/tvd/parser/__init__.py b/tvd/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tvd/parser/ctm.py b/tvd/parser/ctm.py new file mode 100644 index 0000000..a25d0e1 --- /dev/null +++ b/tvd/parser/ctm.py @@ -0,0 +1,101 @@ + +from tvd import AnnotationGraph, Episode, TAnchored, TFloating, TStart, TEnd +import codecs +import re + +class CTMParser(object): + """docstring for CTMParser""" + def __init__(self, punctuation=True): + super(CTMParser, self).__init__() + self.punctuation = punctuation + + def get_graph(self, path2ctm): + g = AnnotationGraph() + + TFloating.reset() + previousNode = TStart() + + + arc = [] + + with codecs.open(path2ctm, "rt", encoding='utf8') as f: + for line in f: + if not re.search(r'^;;', line): + fields = line.strip().split() + start = round(float(fields[2]), 3) + duration = round(float(fields[3]), 3) + end = float(start)+float(duration) + + end = round(end, 3) + + word = fields[4] + + if not self.punctuation: + word = re.sub(r'[\.!,;?":]+',' ', word) + word = re.sub(r' +',' ', word) + + if word != "" and word != ' ': + confidence = fields[5] + if duration == 0: + node_start = previousNode + node_end = TFloating() + if len(arc) == 2: + g.remove_edge(arc[0], arc[1]) + g.add_annotation(arc[0], node_end, arc_data) + node_inter = TFloating() + g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence}) + g.add_annotation(node_inter, arc[1]) + arc.append(node_end) + arc.append(node_inter) + node_end=arc[1] + elif len(arc) > 2: + node_anc_start = arc[0] + node_anc_end = arc[1] + g.remove_edge(arc[len(arc)-1], node_anc_end) + g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence}) + g.add_annotation(node_end, node_anc_end) + arc.append(node_end) + node_end=arc[1] + else: + addEdge = True + node_start = TAnchored(start) + node_end = TAnchored(end) + if previousNode.is_floating: + if not g.has_edge(previousNode, node_start): + g.add_annotation(previousNode, node_start) + else: + if node_start.T < previousNode.T: + node_start = previousNode + elif node_start.T > previousNode.T: + g.add_annotation(previousNode, node_start) + if node_start.is_anchored and node_end.is_anchored: + if node_start.T == node_end.T: + addEdge = False + node_start = previousNode + node_end = TFloating() + if len(arc) == 2: + g.remove_edge(arc[0], arc[1]) + g.add_annotation(arc[0], node_end, arc_data) + node_inter = TFloating() + g.add_annotation(node_end, node_inter, data={'speech':word, 'confidence':confidence}) + g.add_annotation(node_inter, arc[1]) + arc.append(node_end) + arc.append(node_inter) + node_end=arc[1] + elif len(arc) > 2: + node_anc_start = arc[0] + node_anc_end = arc[1] + g.remove_edge(arc[len(arc)-1], node_anc_end) + g.add_annotation(arc[len(arc)-1], node_end, data={'speech':word, 'confidence':confidence}) + g.add_annotation(node_end, node_anc_end) + arc.append(node_end) + node_end=arc[1] + else: + arc = [node_start, node_end] + arc_data = {'speech':word, 'confidence':confidence} + if addEdge: + g.add_annotation(node_start, node_end, data={'speech':word, 'confidence':confidence}) + previousNode=node_end + + g.add_annotation(previousNode, TEnd()) + return g From 4cec6ad0ed2e5e58478d7213fe16bcc2b6beb6c1 Mon Sep 17 00:00:00 2001 From: antoine laurent Date: Thu, 1 May 2014 15:48:24 +0200 Subject: [PATCH 3/3] Removed align.pl --- tvd/algorithms/alignment/align.pl | 166 ------------------------------ 1 file changed, 166 deletions(-) delete mode 100755 tvd/algorithms/alignment/align.pl diff --git a/tvd/algorithms/alignment/align.pl b/tvd/algorithms/alignment/align.pl deleted file mode 100755 index 9396258..0000000 --- a/tvd/algorithms/alignment/align.pl +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/perl - -if(scalar(@ARGV) != 3){ - die "Usage : $0 fichierTxt(Node1-Node2#Speaker#text) fichierAudio name\n"; -} - -$txt = $ARGV[0]; -$wav = $ARGV[1]; -$name = $ARGV[2]; - -%map = (); -$map{"‘"}="'"; -$map{"…"}="..."; -$map{"–"}="-"; -$map{"’"}="'"; - - -open(OUT, ">/tmp/$name.txt"); -open(TXT, $txt); -while($ligneC = ){ - chomp($ligneC); - if($ligneC =~ /^[^#]+#[^#]+#(.*)/){ - $ligne = $1; - print OUT nettoyerLigne($ligne)."\n"; - } -} -close(TXT); -close(OUT); - - -$cmd = "cat /tmp/$name.txt | utf2iso 2> /tmp/$name.txt.iso.error 1> /dev/null"; -system($cmd); - -if(-s "/tmp/$name.txt.iso.error" > 0){ - die "Error during conversion ... see /tmp/$name.txt.iso.error" -} -unlink "/tmp/$name.txt.iso.error"; - -$cmd = "mkdir align"; -system($cmd) if(!-d "align"); - -$cmd = "vrbs_align -l:eng -v -f $wav /tmp/$name.txt > align/$name.xml"; -system($cmd) if(!-e "align/$name.xml"); - - -$cmd = "cat align/$name.xml | xml2ctm > /tmp/text.ctm"; -system($cmd); - -$ficDump = $txt; -$ficCTM = "/tmp/text.ctm"; - -$cmd = "cat $ficCTM"; -@ctm = `$cmd`; - -$indCtm = 0; - -open(FIC, $ficDump); -while($ligne = ){ - chomp($ligne); - - @infos = split(/#/, $ligne); - $nodes = $infos[0]; - $spk = $infos[1]; - - $spk =~ s/ +/_/g; - - $txt = $infos[2]; - - $txt = nettoyerLigne($txt); - $txt = trim($txt); - - #$txt =~ s/([^\.])\./$1 \./g; - #$txt =~ s/([^\,])\,/$1 \,/g; - #$txt =~ s/([^\?])\?/$1 \?/g; - #$txt =~ s/([^\!])\!/$1 \!/g; - - @words = split(/ +/, $txt); - - foreach(@words){ - $word = $_; - $original = $word; - $end = 0; - while(!$end){ - $ctmL = $ctm[$indCtm]; - @iCtm = split(/ /, $ctmL); - $wCtm = $iCtm[4]; - $wCtm = virerPonctuation($wCtm); - if($wCtm ne ""){ - $end=1; - }else{ - $indCtm++; - } - $end = 1 if($indCtm == scalar(@ctm)); - } - - - $word = virerPonctuation($word); - - @iCtm = split(/ /, $ctmL); - $show = $iCtm[0]; - $start = $iCtm[2]; - $duree = $iCtm[3]; - $score = $iCtm[5]; - $end = sprintf("%.3f", $start+$duree); - - $wCtm = $iCtm[4]; - $wCtm = virerPonctuation($wCtm); - - - if($word ne ""){ - - if($word eq $wCtm){ - print "thrones_s01e01 $nodes $spk $original ($start - $end - $score)\n"; - }else{ - die "==> ". $word . " ne ".$wCtm."\n"; - - } - - $indCtm++; - }else{ - print "thrones_s01e01 $nodes $spk $original\n" if($original ne ""); - } - } - - - -} -close(FIC); - -clean(); - -sub clean{ - unlink "/tmp/$name.txt"; - unlink "/tmp/text.ctm"; -} - - -sub virerPonctuation{ - my $word = $_[0]; - $word =~ s/,//g; - $word =~ s/\.//g; - $word =~ s/\?//g; - $word =~ s/\!//g; - $word =~ s/\"//g; - $word =~ s/\://g; - $word =~ s/\;//g; - return trim($word); -} - -sub trim{ - $chaine = $_[0]; - $chaine =~ s/^\s+//g; - $chaine =~ s/\s+$//g; - return $chaine; -} -sub nettoyerLigne{ - my $ligne=$_[0]; - foreach(keys %map){ - $ligne =~ s/$_/$map{$_}/g; - } - $ligne =~ s/\.\.\.([^\.])/\.\.\. $1/g; - $ligne =~ s/ +/ /g; - $ligne =~ s/\([^\)]+\)//g; - $ligne =~ s/\[[^\]]+\]//g; - return $ligne; -}