diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java b/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java index c5c01ac..d850f81 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java @@ -24,9 +24,9 @@ public class FrontendManager { private PronunciationVits mPronunciationVits = null; public FrontendManager(Context context) { - mNormalizationManager = new NormalizationManager(context); mPronunciation = new Pronunciation(context); mPronunciationVits = new PronunciationVits(mPronunciation); + mNormalizationManager = new NormalizationManager(context, mPronunciation.GetIpaPronDict()); } /** diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java b/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java index 3bc9c12..a0c853d 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java @@ -10,6 +10,7 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * The NormalizationManager controls the normalization process from raw input text to @@ -33,9 +34,9 @@ public class NormalizationManager { private final Tokenizer mTokenizer; private final TTSNormalizer mTTSNormalizer; - public NormalizationManager(Context context) { + public NormalizationManager(Context context, Map pronDict) { mContext = context; - mUnicodeNormalizer = new TTSUnicodeNormalizer(context); + mUnicodeNormalizer = new TTSUnicodeNormalizer(context, pronDict); mTokenizer = new Tokenizer(context); mTTSNormalizer = new TTSNormalizer(); mPosTagger = initPOSTagger(); diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/NumberHelper.java b/app/src/main/java/com/grammatek/simaromur/frontend/NumberHelper.java index fdcffb9..f9c56ba 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/NumberHelper.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/NumberHelper.java @@ -104,12 +104,12 @@ private NumberHelper() {} // DIGIT_NUMBERS.put("\\-", " "); //DIGIT_NUMBERS.put("\\-", " #"); DIGIT_NUMBERS.put("\\+", " plús"); - //TODO: if we have more sentences being normalized, this replaces end-of-sentence dot as well. We don't want that DIGIT_NUMBERS.put("\\.", " punktur"); DIGIT_NUMBERS.put(":", " tvípunktur"); - //TODO: converts normal sentence commas, ask what this is supposed to do - //DIGIT_NUMBERS.put(",", " komma"); + DIGIT_NUMBERS.put(",", " komma"); DIGIT_NUMBERS.put("\\/", " skástrik"); + // when a semicolon occurs, we need the voice to pause + DIGIT_NUMBERS.put(";", ","); } public static final Map DIGITS_ORD = new HashMap<>(); diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java b/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java index 3bb3da6..22f7dfc 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java @@ -57,6 +57,7 @@ public class TTSNormalizer { // This matches also inside strings for valid email addresses private static final Pattern EMAIL_PTRN = Pattern.compile(".+@.+"); private static final Pattern EOS_PTRN = Pattern.compile("[.:?!;]"); + private static final Pattern PUNCTUATION_TO_KEEP = Pattern.compile("[.:?!,\"]"); private static final Pattern NUM_OPT_DOT_PTRN = Pattern.compile("\\d+\\.?(\\d+)?"); private static final Pattern ANY_DIGIT_PTRN = Pattern.compile("\\d"); private static final Pattern SPORT_RES_PTRN = Pattern.compile("^\\d{1,2}/\\d{1,2}$"); @@ -170,7 +171,8 @@ else if (linksPattern != null) { token = normalizeURL(token); else if (token.length() > 1 && token.charAt(0) == token.charAt(1)) token = insertSpaces(token); - else if (NormalizationDictionaries.NOT_LETTER.matcher(token).matches()) + else if ((token.length() > 1 && NormalizationDictionaries.NOT_LETTER.matcher(token).matches()) + || !PUNCTUATION_TO_KEEP.matcher(token).matches()) token = normalizeDigits(token); } diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/TTSUnicodeNormalizer.java b/app/src/main/java/com/grammatek/simaromur/frontend/TTSUnicodeNormalizer.java index a968c62..27d87d0 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/TTSUnicodeNormalizer.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/TTSUnicodeNormalizer.java @@ -9,8 +9,10 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -21,7 +23,7 @@ */ public class TTSUnicodeNormalizer { - public static Set mLexicon = new HashSet<>(); + public static Map mLexicon = new HashMap<>(); // The Icelandic alphabet, the grapheme set valid for automatic g2p private final static Set CHAR_SET = new HashSet<>(); @@ -64,8 +66,8 @@ public class TTSUnicodeNormalizer { // string, but delete the unknown character otherwise private final String DONT_DELETE = "[.,\":?!-]"; - public TTSUnicodeNormalizer(Context context) { - mLexicon = initLexicon(context); + public TTSUnicodeNormalizer(Context context, Map pronDict) { + mLexicon = pronDict; } /** @@ -170,7 +172,7 @@ public List normalizeAlphabet(List sentences) { } public static boolean inDictionary(String wrd) { - return mLexicon.contains(wrd.toLowerCase()); + return mLexicon.containsKey(wrd.toLowerCase()); } private boolean isTag(String wrd) { @@ -206,22 +208,4 @@ private String getIceAlphaReplacement(Character c) { return ""; } - - private Set initLexicon(Context context) { - Set lexicon = new HashSet<>(); - Resources res = context.getResources(); - String line; - try { - InputStream is = res.openRawResource(R.raw.lexicon_v2201); - BufferedReader reader = new BufferedReader(new InputStreamReader(is)); - if (is != null) { - while ((line = reader.readLine()) != null) { - lexicon.add(line.trim()); - } - } - } catch (Exception e) { - e.printStackTrace(); - } - return lexicon; - } } diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/UnicodeMaps.java b/app/src/main/java/com/grammatek/simaromur/frontend/UnicodeMaps.java index bfc5983..724a13a 100644 --- a/app/src/main/java/com/grammatek/simaromur/frontend/UnicodeMaps.java +++ b/app/src/main/java/com/grammatek/simaromur/frontend/UnicodeMaps.java @@ -95,6 +95,7 @@ public class UnicodeMaps { postDictLookupMap.put('Ä', "E"); postDictLookupMap.put('Ü', "U"); postDictLookupMap.put('Ø', "Ö"); + postDictLookupMap.put('Ô', "Ó"); } // delete in MVP, transliterate later if necessary diff --git a/app/src/main/res/raw/igc_wiki_news_dict_20240107.csv b/app/src/main/res/raw/igc_wiki_news_dict_20240107.csv index 2089135..3d3d8dd 100644 --- a/app/src/main/res/raw/igc_wiki_news_dict_20240107.csv +++ b/app/src/main/res/raw/igc_wiki_news_dict_20240107.csv @@ -1061,7 +1061,6 @@ sigríður s ˈ ɪː ɣ r i ð ʏ r group k r ˈ u h p lán l ˈ auː n árangri ˈ auː r au ŋ k r ɪ -em ˈ ɛː ɛ m íbúar ˈ iː p u a r dagskrá t ˈ aː ɣ s k r au byrjaði p ˈ ɪ r j a ð ɪ @@ -5011,7 +5010,6 @@ hleypt l̥ ˈ ei f t andersen ˈ a n t ɛ r̥ s ɛ n hálftíma h ˈ au l v tʰ i m a flýta f l ˈ iː t a -hí h ˈ iː kynjanna cʰ ˈ ɪ n j a n a munaði m ˈ ʏː n a ð ɪ landamæri l ˈ a n t a m ˌ ai r ɪ @@ -22263,6 +22261,7 @@ orrustunni ˈ ɔ r ʏ s t ʏ n ɪ rúmið r ˈ uː m ɪ ð útfrá ˈ uː t f r au space s p ˈ eiː s +spacex s p ˈ eiː s ˌ ɛ k s classic kʰ l ˈ a s ɪ k hernaðaraðgerðum h ˈ ɛ r t n a ð a r ˌ a ð c ɛ r ð ʏ m hindrunum h ˈ ɪ n t r ʏ n ʏ m @@ -25383,7 +25382,6 @@ vítinu v ˈ iː t ɪ n ʏ úrskurðarnefndin ˈ uː r s k ʏ r ð a r n ɛ m t ɪ n ómari ˈ ouː m a r ɪ ömurlega ˈ œː m ʏ r l ɛ ɣ a -mbl ˈ ɛ m p l vinnuveitenda v ˈ ɪ n ʏ v ˌ ei t ɛ n t a hljóðs l̥ j ˈ ou ð s ágengt ˈ auː c ei ŋ̊ t @@ -69363,7 +69361,7 @@ svarendum s v ˈ aː r ɛ n t ʏ m viðbjóðslegt v ˈ ɪ ð p j ou ð s t l ɛ x t hljóðmengun l̥ j ˈ ouː ð m ei ŋ k ʏ n óttans ˈ ou h t a n s -dísella t ˈ iː s ɛ t l a +dísella t ˈ iː s ɛ l a mönnunarvanda m ˈ œ n ʏ n a r v ˌ a n t a mælinganna m ˈ aiː l i ŋ k a n a acoff ˈ aː k ɔ f @@ -150437,7 +150435,6 @@ starfshlutföll s t ˈ a r f s l̥ ʏ t f ˌ œ t l̥ húsgrunninum h ˈ uː s k r ʏ n ɪ n ʏ m aukaútgjöld ˈ œyː k a ˌ u t c œ l t frístundarheimili f r ˈ iː s t ʏ n t a r h ˌ ei m ɪ l ɪ -adhd ˈ a t n̥ t tryggingageiranum tʰ r ˈ ɪ c i ŋ k a c ˌ ei r a n ʏ m þjark θ j ˈ a r̥ k tryggra tʰ r ˈ ɪ k r a diff --git a/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java b/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java index 129f670..b52bf8c 100644 --- a/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java +++ b/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java @@ -6,6 +6,8 @@ import androidx.test.core.app.ApplicationProvider; import com.grammatek.simaromur.frontend.NormalizationManager; +import com.grammatek.simaromur.frontend.PronDictEntry; +import com.grammatek.simaromur.frontend.Pronunciation; import org.junit.Test; import org.junit.runner.RunWith; @@ -25,21 +27,23 @@ public class NormalizationManagerTest { private final static Context context = ApplicationProvider.getApplicationContext(); + private final static Pronunciation pron = new Pronunciation(context); + private final static Map pronDict = pron.GetIpaPronDict(); @Test public void processTest() { - String input = "www.mbl.is/frettir"; - NormalizationManager manager = new NormalizationManager(context); + String input = "Space-X"; + NormalizationManager manager = new NormalizationManager(context, pronDict); String processed = manager.process(input); System.out.println(processed); - assertEquals("m b l punktur is skástrik frettir .", + assertEquals("space - x .", processed); } @Test public void processDigitsTest() { - NormalizationManager manager = new NormalizationManager(context); + NormalizationManager manager = new NormalizationManager(context, pronDict); for (String sent : getDigits().keySet()) { String processed = manager.process(sent); assertEquals(getDigits().get(sent), processed); @@ -48,7 +52,7 @@ public void processDigitsTest() { @Test public void processSymbolsTest() { - NormalizationManager manager = new NormalizationManager(context); + NormalizationManager manager = new NormalizationManager(context, pronDict); for (String sent : getSymbols().keySet()) { String processed = manager.process(sent); assertEquals(getSymbols().get(sent), processed); @@ -57,7 +61,7 @@ public void processSymbolsTest() { @Test public void processNewIssuesTest() { - NormalizationManager manager = new NormalizationManager(context); + NormalizationManager manager = new NormalizationManager(context, pronDict); for (String sent : getNewTestSentences().keySet()) { String processed = manager.process(sent); assertEquals(getNewTestSentences().get(sent), processed); @@ -66,7 +70,7 @@ public void processNewIssuesTest() { @Test public void processV14IssuesTest() { - NormalizationManager manager = new NormalizationManager(context); + NormalizationManager manager = new NormalizationManager(context, pronDict); for (String sent : getV14TestSentences().keySet()) { String processed = manager.process(sent); assertEquals(getV14TestSentences().get(sent), processed); @@ -75,7 +79,7 @@ public void processV14IssuesTest() { @Test public void processListTest() { - NormalizationManager manager = new NormalizationManager(context); + NormalizationManager manager = new NormalizationManager(context, pronDict); for (String sent : getTestSentences().keySet()) { String processed = manager.process(sent); assertEquals(getTestSentences().get(sent), processed); @@ -169,6 +173,12 @@ private Map getNewTestSentences() { private Map getV14TestSentences() { // test sentences added for the deployment of v1.4 Map sent = new HashMap<>(); + sent.put("láta skoðanir sínar í ljós í athugasemdum.“, segir hún.", + "láta skoðanir sínar í ljós í athugasemdum \" . , segir hún ."); + sent.put("Ôlafsson, framkvæmdastjóri Stakka víkur ehf., segir", + "ólafsson , framkvæmdastjóri stakka víkur e h f . , segir ."); + sent.put("Stimpilgjald af kaupsamningi er 0,8% af heildarfasteignamati hjá einstaklingum.", + "stimpilgjald af kaupsamningi er núll komma átta prósent af heildarfasteignamati hjá einstaklingum ."); sent.put("íbúðin er 145 fm", "íbúðin er hundrað fjörutíu og fimm fermetrar ."); sent.put("margir með ADHD", "margir með a d h d ."); sent.put("Má áætla að þriðji stóri íþróttaviðburðurinn sem horft hafi verið til sé EM " + @@ -185,15 +195,12 @@ private Map getV14TestSentences() { "leigubíl fyrir tvö þúsund og fimm hundruð krónur tæpar " + "þrjátíu og þrjú þúsund íslenskar krónur ."); sent.put("leigubíl fyrir £377 á dag.", "leigubíl fyrir þrjú hundruð sjötíu og sjö pund á dag ."); - sent.put("3.7", "þrír punktur sjö ."); sent.put("13.7", "einn þrír punktur sjö ."); - sent.put("mbl.is/frettir/innlent/2024/02/02/litlu_hlutirnir_sem_folkid_saknar_helst/", "m b l punktur is skástrik frettir skástrik innlent skástrik tvö þúsund tuttugu " + "og fjögur skástrik núll tvö skástrik núll tvö skástrik litlu hlutirnir sem " + "folkid saknar helst skástrik ."); - sent.put("Stjórnendur Rúv gera ráð fyrir því að augýsingatekjur stofnunarinnar hækki " + "á þessu ári um 17,4% frá fyrra ári og að útvarpsgjald hækki um 3,5%.", "stjórnendur rúv gera ráð fyrir því að augýsingatekjur stofnunarinnar hækki " + @@ -246,7 +253,7 @@ private Map getTestSentences() { //testSentences.put("Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu 7.046 kr.", // "Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónum ."); testSentences.put("Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu 7.046 kr.", - "Stelpurnar Karmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónur .".toLowerCase()); + "Stelpurnar Carmen Diljá Guðbjarnardóttir og Elenora Rós Georgsdóttir söfnuðu sjö þúsund fjörutíu og sex krónur .".toLowerCase()); testSentences.put("Hann skoraði 21 stig og tók 12 fráköst.", "Hann skoraði tuttugu og eitt stig og tók tólf fráköst .".toLowerCase()); testSentences.put("Opna Suðurnesjamótið í pílu fer fram þann 4. desember nk. kl. 13:00 í píluaðstöðu Pílufélags Reykjanesbæjar að Hrannargötu 6. ", "Opna Suðurnesjamótið í pílu fer fram þann fjórða desember næstkomandi klukkan þrettán núll núll ".toLowerCase() + @@ -286,7 +293,7 @@ private Map getTestSentences() { "Hollenska fjárfestingafyrirtækið EsBro hyggst reisa fimmtán hektarar hundrað og fimmtíu þúsund fermetrar gróðurhús til framleiðslu á tómötum .".toLowerCase()); //testSentences.put("Við Lindarhvamm í Hafnarfirði er að finna 134 fm efri sérhæð og ris í snyrtilegu tvíbýlishúsi sem reist var árið 1963.", // "við lindarhvamm í hafnarfirði er að finna hundrað þrjátíu og fjögur fermetrar efri sérhæð og ris í snyrtilegu tvíbýlishúsi sem reist var árið nítján hundruð sextíu og þrjú .".toLowerCase()); - testSentences.put("Mynd / elg@vf.is", "Mynd skástrik e l g hjá v f punktur is .".toLowerCase()); + testSentences.put("Mynd / elg@vf.is", "Mynd skástrik elg hjá v f punktur is .".toLowerCase()); testSentences.put("hefur leikið sjö leiki með U-21 árs liðinu.", "hefur leikið sjö leiki með U - tuttugu og eins árs liðinu .".toLowerCase()); testSentences.put("er þetta í 23. skiptið sem mótið er haldið .", "er þetta í tuttugasta og þriðja skiptið sem mótið er haldið .".toLowerCase()); testSentences.put("Skráning er hafin á http://keflavik.is/fimleikar/ og ef eitthvað er óljóst er hægt að hafa samband í síma 421-6368 eða á fimleikar@keflavik.is", diff --git a/app/src/test/java/com/grammatek/simaromur/TTSUnicodeNormalizerTest.java b/app/src/test/java/com/grammatek/simaromur/TTSUnicodeNormalizerTest.java index 338d547..32cfd65 100644 --- a/app/src/test/java/com/grammatek/simaromur/TTSUnicodeNormalizerTest.java +++ b/app/src/test/java/com/grammatek/simaromur/TTSUnicodeNormalizerTest.java @@ -3,6 +3,8 @@ import android.content.Context; import android.os.Build; +import com.grammatek.simaromur.frontend.PronDictEntry; +import com.grammatek.simaromur.frontend.Pronunciation; import com.grammatek.simaromur.frontend.TTSUnicodeNormalizer; import org.junit.Test; @@ -27,10 +29,12 @@ public class TTSUnicodeNormalizerTest { private final static Context context = ApplicationProvider.getApplicationContext(); + private final static Pronunciation pron = new Pronunciation(context); + private final static Map pronDict = pron.GetIpaPronDict(); @Test public void unicodeNormalizingTest() { - TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context); + TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context, pronDict); String input = "„ Við vorum samheldnir og þéttir og það er gott að innbyrða sigur á útivelli gegn öflugu liði eins og Breiðabliki , “ sagði Willum Þór Þórsson"; String normalized = normalizer.normalizeEncoding(input); assertEquals("\" Við vorum samheldnir og þéttir og það er gott að innbyrða sigur á útivelli gegn öflugu liði eins og Breiðabliki , \" sagði Willum Þór Þórsson", normalized); @@ -42,7 +46,7 @@ public void unicodeNormalizingTest() { @Test public void alphabetNormalizingTest() { - TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context); + TTSUnicodeNormalizer normalizer = new TTSUnicodeNormalizer(context, pronDict); for (String sent : getTestSentences().keySet()) { List processed = normalizer.normalizeAlphabet(Arrays.asList(sent)); assertEquals(getTestSentences().get(sent), processed.get(0)); diff --git a/app/src/test/java/com/grammatek/simaromur/TokenizerTest.java b/app/src/test/java/com/grammatek/simaromur/TokenizerTest.java index fea1b18..0fbeac6 100644 --- a/app/src/test/java/com/grammatek/simaromur/TokenizerTest.java +++ b/app/src/test/java/com/grammatek/simaromur/TokenizerTest.java @@ -53,6 +53,9 @@ public void tokenizerCommaSepTest() { input = "14,1% íbúa"; tokenized = tok.detectSentences(input); assertEquals("14,1 % íbúa .", tokenized.get(0)); + input = "kaupsamningi er 0,8% af heildarfasteignamati"; + tokenized = tok.detectSentences(input); + assertEquals("kaupsamningi er 0,8 % af heildarfasteignamati .", tokenized.get(0)); } @Test