Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

normalizer optimization #152

Merged
merged 1 commit into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.grammatek.simaromur.frontend;

import java.util.regex.Pattern;

/**
* A CategoryTuple typically holds information on the expansion of a certain regex pattern
* belonging to a certain category, according to a rule (pos-tag pattern)
Expand All @@ -17,22 +19,22 @@

public class CategoryTuple {

private final String numberPattern;
private final String rule;
private final Pattern numberPattern;
private final Pattern rule;
private final String category;
private final String expansion;

public CategoryTuple(String pattern, String rule, String category, String expansion) {
this.numberPattern = pattern;
this.rule = rule;
this.numberPattern = Pattern.compile(".*" + pattern + ".*");
this.rule = Pattern.compile(".*" + rule);
this.category = category;
this.expansion = expansion;
}

public String getNumberPattern() {
public Pattern getNumberPattern() {
return this.numberPattern;
}
public String getRule() {
public Pattern getRule() {
return this.rule;
}
public String getCategory() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ private String normalizeDigits(String token) {
* Initializes a map to hold the digit positions for a token, e.g.:
* {token: {thousands: "", hundreds: "", dozens: "", ones: ""}}
*/
private Map<String, Map<String, String>> makeDict(String token, String[] columns) {
private Map<String, Map<String, String>> makeDict(String token, final String[] columns) {
Map<String, Map<String, String>> valueDict = new HashMap<>();
Map<String, String> innerMap = new HashMap<>();
for (String s : columns)
Expand All @@ -528,33 +528,42 @@ private Map<String, Map<String, String>> makeDict(String token, String[] columns
return valueDict;
}

/*
/**
* Fills a map that holds the digit positions for a token, e.g.:
* {"1983": {thousands: "", hundreds: "nítján hundruð", dozens: " áttatíu og", ones: "þrjú"}}
* {"1983": {thousands: "", hundreds: "nítján hundruð", dozens: " áttatíu og", ones: "þrjú"}}
* Returns the values concatenated in column order, e.g.: "nítján hundruð áttatíu og þrjú"
*
* @param token the token to fill the dictionary for
* @param tag the tag to fill the dictionary for
* @param tuples the list of tuples to use for filling the dictionary
* @param typeDict the dictionary to fill
* @param columns the columns to fill in the dictionary
* @return the values stored for the token, concatenated in the order given by columns
*/
private String fillDict(String token, String tag, List<CategoryTuple> tuples, Map<String, Map<String, String>> typeDict, String[] columns) {
private String fillDict(String token, String tag, final List<CategoryTuple> tuples, Map<String, Map<String, String>> typeDict, final String[] columns) {
StringBuilder result = new StringBuilder();

for (int i = 0; i < tuples.size(); i++) {
String numberPattern = tuples.get(i).getNumberPattern();
String rule = tuples.get(i).getRule();
if (token.matches(".*" + numberPattern + ".*") && tag.matches(".*" + rule)) {
if (typeDict.containsKey(token)) {
final String category = tuples.get(i).getCategory();
if (typeDict.get(token).containsKey(category)) {
Map<String, String> tmp = typeDict.get(token);
tmp.put(category, tuples.get(i).getExpansion());
// not strictly necessary, since the previous assignment already updates the map held in
// typeDict, but the explicit put makes the update clearer
typeDict.put(token, tmp);
}
}
Map<String, String> tokenMap = typeDict.get(token);
if (tokenMap == null) {
// the entry for this token must already have been created by makeDict(), which needs to
// be called right before this method
throw new IllegalArgumentException("Token " + token + " not found in typeDict");
}

for (final CategoryTuple tuple : tuples) {
final Pattern numberPattern = tuple.getNumberPattern();
final Pattern rule = tuple.getRule();
if (numberPattern.matcher(token).matches() && rule.matcher(tag).matches()) {
tokenMap.put(tuple.getCategory(), tuple.getExpansion());
}
}
for (String s : columns)
result.append(typeDict.get(token).get(s));

for (final String column : columns) {
final String value = tokenMap.get(column);
if (value != null) {
result.append(value);
}
}
return result.toString();
}

Expand Down
Loading