forked from aclew/launcher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcatspa-syllabify-corpus.pl
93 lines (80 loc) · 4.05 KB
/
catspa-syllabify-corpus.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/perl
# Alex Cristia [email protected] 2015-07-13 adapted VERY MINIMALLY this code from scripts
# distributed by Lawrence Phillips & Lisa Pearl, 12/23/13 (UCI-Brent-Syllabic Corpus) - credit is owed mainly to those authors
# Uses the maximum onset principle to fully syllabify a corpus
use File::Basename;
$language=$ARGV[0];
$filecorpus=$ARGV[1];
$output=$ARGV[2];
$dirname=dirname($filecorpus);
#print "\n the language is $language\n";
# /input/ OR /scripts/ ?
# Save valid onsets from ValidOnsets.txt
%onsets = {};
open(ONSETS, "<$dirname/$language-ValidOnsets.txt") or die("Couldn't open $dirname/$language-ValidOnsets.txt\n");
while(defined($fileline = <ONSETS>)){
chomp($fileline);
#print "$fileline\n";
$onsets{$fileline} = 1; #This is an odd way of stating things
#print "added";
}
#print "out of the while";
close(ONSETS);
# Save valid vowels from Vowels.txt
%vowels = {};
open(VOWELS, "<$dirname/$language-Vowels.txt") or die("Couldn't open $dirname/$language-Vowels.txt\n");
my $vowels = <VOWELS>;
close(VOWELS);
#print "$vowels\n";
# Go through CORPUS.txt,
# for nonsyllabified words: for each syllable, find its vowel, and its maximum onset, given acceptable onsets and beginning of word.
# print syllabified version to syllabified-CORPUS.txt.
open(SYLLABIFIED, ">$output") or die("Couldn't open $output for writing\n");
open(CORPUS, "<$filecorpus") or die("Couldn't open $filecorpus for reading");
while(defined($fileline = <CORPUS>)){
#print "entered first while\n";
chomp($fileline);
$currline = $fileline;
@wordarray = split(" ", $currline); # divide the line into a set of words
$syllline="";#we start with a clean slate for each line
while(@wordarray > 0){
$currword = pop(@wordarray); # cut out the last word in the word array & put it in currword
#print "now looking at $currword\n";
@chararray = split(//, $currword);
$syllword="";#we start with a clean slate for each word
$currsyllable=""; #and for the syllable
while(@chararray > 0){
$currchar = pop(@chararray); # cut out the last char in the char array for this word & put it in currchar
$currsyllable = $currchar.$currsyllable; # append currchar to current syllable - that will be necessary regardless of whether it's a vowel or a coda
# if hit a vowel..
#if($currchar =~ /[ae3EiOo0u]/){
if($currchar =~ /[$vowels]/){
# print "$currchar\n";
#if(@chararray[@chararray-1] !=~ /[ae3EiOo0u]/){
if(@chararray[@chararray-1] !=~ /[$vowels]/){
# print "@chararray[@chararray-1]\n";
#if this char is a vowel and the previous one is not, then we need to make the onset
$onset = ""; #we start with nothing as the onset
#then we want to take one letter at a time and check whether their concatenation makes a good onset
while(@chararray > 0 && exists($onsets{@chararray[@chararray-1] . $onset})){
#print "$onsets{@chararray[@chararray-1] . $onset}\n";
$currchar = pop(@chararray);
$onset = $currchar . $onset;
}
#we get here either because we've concatenated the onset+rest or because there was no onset and the preceding element is a vowel, so this is the end of the syllable
$currsyllable = $onset . $currsyllable;
# add syllable to word entry
}
$syllword = "\/" . $currsyllable . $syllword;
$currsyllable = "";
}#we end the if we are looking at a vowel
}#when we end this while there are no more characters in this word, so we can add it
$syllline=$syllword. " " . $syllline;
#print "$syllword\n";
} #while we work our way through the words in the line -- so when we exit this, we are ready to print out a syllabified line
if($syllline){
print SYLLABIFIED "$syllline\n";
}
}#while there are lines in this file
close(SYLLABIFIED);
close(CORPUS);