Skip to content

Commit

Permalink
Merge pull request #154 from grammatek/abn-more-norm-fix
Browse files Browse the repository at this point in the history
Abn more norm fix
  • Loading branch information
lumpidu authored Feb 6, 2024
2 parents cce946f + a954a8b commit e1acbed
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ public class FrontendManager {
private PronunciationVits mPronunciationVits = null;

public FrontendManager(Context context) {
mNormalizationManager = new NormalizationManager(context);
mPronunciation = new Pronunciation(context);
mPronunciationVits = new PronunciationVits(mPronunciation);
mNormalizationManager = new NormalizationManager(context, mPronunciation.GetIpaPronDict());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
* The NormalizationManager controls the normalization process from raw input text to
Expand All @@ -33,9 +34,9 @@ public class NormalizationManager {
private final Tokenizer mTokenizer;
private final TTSNormalizer mTTSNormalizer;

public NormalizationManager(Context context) {
public NormalizationManager(Context context, Map<String, PronDictEntry> pronDict) {
mContext = context;
mUnicodeNormalizer = new TTSUnicodeNormalizer(context);
mUnicodeNormalizer = new TTSUnicodeNormalizer(context, pronDict);
mTokenizer = new Tokenizer(context);
mTTSNormalizer = new TTSNormalizer();
mPosTagger = initPOSTagger();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ private NumberHelper() {}
// DIGIT_NUMBERS.put("\\-", " <sil>");
//DIGIT_NUMBERS.put("\\-", " #");
DIGIT_NUMBERS.put("\\+", " plús");
//TODO: if we have more sentences being normalized, this replaces end-of-sentence dot as well. We don't want that
DIGIT_NUMBERS.put("\\.", " punktur");
DIGIT_NUMBERS.put(":", " tvípunktur");
//TODO: converts normal sentence commas, ask what this is supposed to do
//DIGIT_NUMBERS.put(",", " komma");
DIGIT_NUMBERS.put(",", " komma");
DIGIT_NUMBERS.put("\\/", " skástrik");
// when a semicolon occurs, we need the voice to pause
DIGIT_NUMBERS.put(";", ",");
}

public static final Map<String, String> DIGITS_ORD = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ public class TTSNormalizer {
// This matches also inside strings for valid email addresses
private static final Pattern EMAIL_PTRN = Pattern.compile(".+@.+");
private static final Pattern EOS_PTRN = Pattern.compile("[.:?!;]");
private static final Pattern PUNCTUATION_TO_KEEP = Pattern.compile("[.:?!,\"]");
private static final Pattern NUM_OPT_DOT_PTRN = Pattern.compile("\\d+\\.?(\\d+)?");
private static final Pattern ANY_DIGIT_PTRN = Pattern.compile("\\d");
private static final Pattern SPORT_RES_PTRN = Pattern.compile("^\\d{1,2}/\\d{1,2}$");
Expand Down Expand Up @@ -170,7 +171,8 @@ else if (linksPattern != null) {
token = normalizeURL(token);
else if (token.length() > 1 && token.charAt(0) == token.charAt(1))
token = insertSpaces(token);
else if (NormalizationDictionaries.NOT_LETTER.matcher(token).matches())
else if ((token.length() > 1 && NormalizationDictionaries.NOT_LETTER.matcher(token).matches())
|| !PUNCTUATION_TO_KEEP.matcher(token).matches())
token = normalizeDigits(token);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
Expand All @@ -21,7 +23,7 @@
*/
public class TTSUnicodeNormalizer {

public static Set<String> mLexicon = new HashSet<>();
public static Map<String, PronDictEntry> mLexicon = new HashMap<>();

// The Icelandic alphabet, the grapheme set valid for automatic g2p
private final static Set<Character> CHAR_SET = new HashSet<>();
Expand Down Expand Up @@ -64,8 +66,8 @@ public class TTSUnicodeNormalizer {
// string, but delete the unknown character otherwise
private final String DONT_DELETE = "[.,\":?!-]";

public TTSUnicodeNormalizer(Context context) {
mLexicon = initLexicon(context);
public TTSUnicodeNormalizer(Context context, Map<String, PronDictEntry> pronDict) {
mLexicon = pronDict;
}

/**
Expand Down Expand Up @@ -170,7 +172,7 @@ public List<String> normalizeAlphabet(List<String> sentences) {
}

public static boolean inDictionary(String wrd) {
return mLexicon.contains(wrd.toLowerCase());
return mLexicon.containsKey(wrd.toLowerCase());
}

private boolean isTag(String wrd) {
Expand Down Expand Up @@ -206,22 +208,4 @@ private String getIceAlphaReplacement(Character c) {

return "";
}

/**
 * Loads the lexicon word list from the raw resource {@code R.raw.lexicon_v2201}.
 *
 * Every line of the resource becomes one entry, trimmed of surrounding whitespace.
 * Loading is best-effort: if the resource cannot be opened or read, the stack trace is
 * printed and whatever entries were collected so far (possibly none) are returned.
 *
 * @param context used to obtain the application's resources
 * @return the set of lexicon entries; never {@code null}
 */
private Set<String> initLexicon(Context context) {
    Set<String> lexicon = new HashSet<>();
    Resources res = context.getResources();
    // try-with-resources closes the reader, and with it the wrapped raw-resource stream,
    // on every exit path; previously neither was ever closed.
    // The charset is spelled out so the Icelandic entries decode the same way regardless of
    // the JVM's platform default (assumes the lexicon resource is UTF-8 - TODO confirm).
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            res.openRawResource(R.raw.lexicon_v2201),
            java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            lexicon.add(line.trim());
        }
    } catch (Exception e) {
        // Deliberately best-effort: a missing or unreadable lexicon must not crash start-up.
        e.printStackTrace();
    }
    return lexicon;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ public class UnicodeMaps {
postDictLookupMap.put('Ä', "E");
postDictLookupMap.put('Ü', "U");
postDictLookupMap.put('Ø', "Ö");
postDictLookupMap.put('Ô', "Ó");
}

// delete in MVP, transliterate later if necessary
Expand Down
7 changes: 2 additions & 5 deletions app/src/main/res/raw/igc_wiki_news_dict_20240107.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,6 @@ sigríður s ˈ ɪː ɣ r i ð ʏ r
group k r ˈ u h p
lán l ˈ auː n
árangri ˈ auː r au ŋ k r ɪ
em ˈ ɛː ɛ m
íbúar ˈ iː p u a r
dagskrá t ˈ aː ɣ s k r au
byrjaði p ˈ ɪ r j a ð ɪ
Expand Down Expand Up @@ -5011,7 +5010,6 @@ hleypt l̥ ˈ ei f t
andersen ˈ a n t ɛ r̥ s ɛ n
hálftíma h ˈ au l v tʰ i m a
flýta f l ˈ iː t a
hí h ˈ iː
kynjanna cʰ ˈ ɪ n j a n a
munaði m ˈ ʏː n a ð ɪ
landamæri l ˈ a n t a m ˌ ai r ɪ
Expand Down Expand Up @@ -22263,6 +22261,7 @@ orrustunni ˈ ɔ r ʏ s t ʏ n ɪ
rúmið r ˈ uː m ɪ ð
útfrá ˈ uː t f r au
space s p ˈ eiː s
spacex s p ˈ eiː s ˌ ɛ k s
classic kʰ l ˈ a s ɪ k
hernaðaraðgerðum h ˈ ɛ r t n a ð a r ˌ a ð c ɛ r ð ʏ m
hindrunum h ˈ ɪ n t r ʏ n ʏ m
Expand Down Expand Up @@ -25383,7 +25382,6 @@ vítinu v ˈ iː t ɪ n ʏ
úrskurðarnefndin ˈ uː r s k ʏ r ð a r n ɛ m t ɪ n
ómari ˈ ouː m a r ɪ
ömurlega ˈ œː m ʏ r l ɛ ɣ a
mbl ˈ ɛ m p l
vinnuveitenda v ˈ ɪ n ʏ v ˌ ei t ɛ n t a
hljóðs l̥ j ˈ ou ð s
ágengt ˈ auː c ei ŋ̊ t
Expand Down Expand Up @@ -69363,7 +69361,7 @@ svarendum s v ˈ aː r ɛ n t ʏ m
viðbjóðslegt v ˈ ɪ ð p j ou ð s t l ɛ x t
hljóðmengun l̥ j ˈ ouː ð m ei ŋ k ʏ n
óttans ˈ ou h t a n s
dísella t ˈ iː s ɛ t l a
dísella t ˈ iː s ɛ l a
mönnunarvanda m ˈ œ n ʏ n a r v ˌ a n t a
mælinganna m ˈ aiː l i ŋ k a n a
acoff ˈ aː k ɔ f
Expand Down Expand Up @@ -150437,7 +150435,6 @@ starfshlutföll s t ˈ a r f s l̥ ʏ t f ˌ œ t l̥
húsgrunninum h ˈ uː s k r ʏ n ɪ n ʏ m
aukaútgjöld ˈ œyː k a ˌ u t c œ l t
frístundarheimili f r ˈ iː s t ʏ n t a r h ˌ ei m ɪ l ɪ
adhd ˈ a t n̥ t
tryggingageiranum tʰ r ˈ ɪ c i ŋ k a c ˌ ei r a n ʏ m
þjark θ j ˈ a r̥ k
tryggra tʰ r ˈ ɪ k r a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import androidx.test.core.app.ApplicationProvider;

import com.grammatek.simaromur.frontend.NormalizationManager;
import com.grammatek.simaromur.frontend.PronDictEntry;
import com.grammatek.simaromur.frontend.Pronunciation;

import org.junit.Test;
import org.junit.runner.RunWith;
Expand All @@ -25,21 +27,23 @@
public class NormalizationManagerTest {

private final static Context context = ApplicationProvider.getApplicationContext();
private final static Pronunciation pron = new Pronunciation(context);
private final static Map<String, PronDictEntry> pronDict = pron.GetIpaPronDict();

@Test
public void processTest() {
String input = "www.mbl.is/frettir";
NormalizationManager manager = new NormalizationManager(context);
String input = "Space-X";
NormalizationManager manager = new NormalizationManager(context, pronDict);
String processed = manager.process(input);
System.out.println(processed);

assertEquals("m b l punktur is skástrik frettir .",
assertEquals("space - x .",
processed);
}

@Test
public void processDigitsTest() {
NormalizationManager manager = new NormalizationManager(context);
NormalizationManager manager = new NormalizationManager(context, pronDict);
for (String sent : getDigits().keySet()) {
String processed = manager.process(sent);
assertEquals(getDigits().get(sent), processed);
Expand All @@ -48,7 +52,7 @@ public void processDigitsTest() {

@Test
public void processSymbolsTest() {
NormalizationManager manager = new NormalizationManager(context);
NormalizationManager manager = new NormalizationManager(context, pronDict);
for (String sent : getSymbols().keySet()) {
String processed = manager.process(sent);
assertEquals(getSymbols().get(sent), processed);
Expand All @@ -57,7 +61,7 @@ public void processSymbolsTest() {

@Test
public void processNewIssuesTest() {
NormalizationManager manager = new NormalizationManager(context);
NormalizationManager manager = new NormalizationManager(context, pronDict);
for (String sent : getNewTestSentences().keySet()) {
String processed = manager.process(sent);
assertEquals(getNewTestSentences().get(sent), processed);
Expand All @@ -66,7 +70,7 @@ public void processNewIssuesTest() {

@Test
public void processV14IssuesTest() {
NormalizationManager manager = new NormalizationManager(context);
NormalizationManager manager = new NormalizationManager(context, pronDict);
for (String sent : getV14TestSentences().keySet()) {
String processed = manager.process(sent);
assertEquals(getV14TestSentences().get(sent), processed);
Expand All @@ -75,7 +79,7 @@ public void processV14IssuesTest() {

@Test
public void processListTest() {
NormalizationManager manager = new NormalizationManager(context);
NormalizationManager manager = new NormalizationManager(context, pronDict);
for (String sent : getTestSentences().keySet()) {
String processed = manager.process(sent);
assertEquals(getTestSentences().get(sent), processed);
Expand Down Expand Up @@ -169,6 +173,12 @@ private Map<String, String> getNewTestSentences() {
private Map<String, String> getV14TestSentences() {
// test sentences added for the deployment of v1.4
Map<String, String> sent = new HashMap<>();
sent.put("láta skoðanir sínar í ljós í athugasemdum.“, segir hún.",
"láta skoðanir sínar í ljós í athugasemdum \" . , segir hún .");
sent.put("Ôlafsson, framkvæmdastjóri Stakka víkur ehf., segir",
"ólafsson , framkvæmdastjóri stakka víkur e h f . , segir .");
sent.put("Stimpilgjald af kaupsamningi er 0,8% af heildarfasteignamati hjá einstaklingum.",
"stimpilgjald af kaupsamningi er núll komma átta prósent af heildarfasteignamati hjá einstaklingum .");
sent.put("íbúðin er 145 fm", "íbúðin er hundrað fjörutíu og fimm fermetrar .");
sent.put("margir með ADHD", "margir með a d h d .");
sent.put("Má áætla að þriðji stóri íþróttaviðburðurinn sem horft hafi verið til sé EM " +
Expand All @@ -185,15 +195,12 @@ private Map<String, String> getV14TestSentences() {
"leigubíl fyrir tvö þúsund og fimm hundruð krónur tæpar " +
"þrjátíu og þrjú þúsund íslenskar krónur .");
sent.put("leigubíl fyrir £377 á dag.", "leigubíl fyrir þrjú hundruð sjötíu og sjö pund á dag .");

sent.put("3.7", "þrír punktur sjö .");
sent.put("13.7", "einn þrír punktur sjö .");

sent.put("mbl.is/frettir/innlent/2024/02/02/litlu_hlutirnir_sem_folkid_saknar_helst/",
"m b l punktur is skástrik frettir skástrik innlent skástrik tvö þúsund tuttugu " +
"og fjögur skástrik núll tvö skástrik núll tvö skástrik litlu hlutirnir sem " +
"folkid saknar helst skástrik .");

sent.put("Stjórnendur Rúv gera ráð fyrir því að augýsingatekjur stofnunarinnar hækki " +
"á þessu ári um 17,4% frá fyrra ári og að útvarpsgjald hækki um 3,5%.",
"stjórnendur rúv gera ráð fyrir því að augýsingatekjur stofnunarinnar hækki " +
Expand Down Expand Up @@ -246,7 +253,7 @@ private Map<String, String> getTestSentences() {
//testSentences.put("Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu 7.046 kr.",
// "Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónum .");
testSentences.put("Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu 7.046 kr.",
"Stelpurnar Karmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónur .".toLowerCase());
"Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónur .".toLowerCase());
testSentences.put("Hann skoraði 21 stig og tók 12 fráköst.", "Hann skoraði tuttugu og eitt stig og tók tólf fráköst .".toLowerCase());
testSentences.put("Opna Suðurnesjamótið í pílu fer fram þann 4. desember nk. kl. 13:00 í píluaðstöðu Pílufélags Reykjanesbæjar að Hrannargötu 6. ",
"Opna Suðurnesjamótið í pílu fer fram þann fjórða desember næstkomandi klukkan þrettán núll núll ".toLowerCase() +
Expand Down Expand Up @@ -286,7 +293,7 @@ private Map<String, String> getTestSentences() {
"Hollenska fjárfestingafyrirtækið EsBro hyggst reisa fimmtán hektarar <sil> hundrað og fimmtíu þúsund fermetrar <sil> gróðurhús til framleiðslu á tómötum .".toLowerCase());
//testSentences.put("Við Lindarhvamm í Hafnarfirði er að finna 134 fm efri sérhæð og ris í snyrtilegu tvíbýlishúsi sem reist var árið 1963.",
// "við lindarhvamm í hafnarfirði er að finna hundrað þrjátíu og fjögur fermetrar efri sérhæð og ris í snyrtilegu tvíbýlishúsi sem reist var árið nítján hundruð sextíu og þrjú .".toLowerCase());
testSentences.put("Mynd / [email protected]", "Mynd skástrik e l g hjá v f punktur is .".toLowerCase());
testSentences.put("Mynd / [email protected]", "Mynd skástrik elg hjá v f punktur is .".toLowerCase());
testSentences.put("hefur leikið sjö leiki með U-21 árs liðinu.", "hefur leikið sjö leiki með U - tuttugu og eins árs liðinu .".toLowerCase());
testSentences.put("er þetta í 23. skiptið sem mótið er haldið .", "er þetta í tuttugasta og þriðja skiptið sem mótið er haldið .".toLowerCase());
testSentences.put("Skráning er hafin á http://keflavik.is/fimleikar/ og ef eitthvað er óljóst er hægt að hafa samband í síma 421-6368 eða á [email protected]",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import android.content.Context;
import android.os.Build;

import com.grammatek.simaromur.frontend.PronDictEntry;
import com.grammatek.simaromur.frontend.Pronunciation;
import com.grammatek.simaromur.frontend.TTSUnicodeNormalizer;

import org.junit.Test;
Expand All @@ -27,10 +29,12 @@
public class TTSUnicodeNormalizerTest {

private final static Context context = ApplicationProvider.getApplicationContext();
private final static Pronunciation pron = new Pronunciation(context);
private final static Map<String, PronDictEntry> pronDict = pron.GetIpaPronDict();

@Test
public void unicodeNormalizingTest() {
TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context);
TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context, pronDict);
String input = "„ Við vorum samheldnir og þéttir og það er gott að innbyrða sigur á útivelli gegn öflugu liði eins og Breiðabliki , “ sagði Willum Þór Þórsson";
String normalized = normalizer.normalizeEncoding(input);
assertEquals("\" Við vorum samheldnir og þéttir og það er gott að innbyrða sigur á útivelli gegn öflugu liði eins og Breiðabliki , \" sagði Willum Þór Þórsson", normalized);
Expand All @@ -42,7 +46,7 @@ public void unicodeNormalizingTest() {

@Test
public void alphabetNormalizingTest() {
TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context);
TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context, pronDict);
for (String sent : getTestSentences().keySet()) {
List<String> processed = normalizer.normalizeAlphabet(Arrays.asList(sent));
assertEquals(getTestSentences().get(sent), processed.get(0));
Expand Down
3 changes: 3 additions & 0 deletions app/src/test/java/com/grammatek/simaromur/TokenizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public void tokenizerCommaSepTest() {
input = "14,1% íbúa";
tokenized = tok.detectSentences(input);
assertEquals("14,1 % íbúa .", tokenized.get(0));
input = "kaupsamningi er 0,8% af heildarfasteignamati";
tokenized = tok.detectSentences(input);
assertEquals("kaupsamningi er 0,8 % af heildarfasteignamati .", tokenized.get(0));
}

@Test
Expand Down

0 comments on commit e1acbed

Please sign in to comment.