Skip to content

Commit

Permalink
major cleanup of CSV files -- remove BOM from plain text files.
Browse files Browse the repository at this point in the history
  • Loading branch information
mubaldino committed May 8, 2024
1 parent f493e55 commit 5daef24
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 137 deletions.
2 changes: 1 addition & 1 deletion Core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.opensextant</groupId>
<artifactId>opensextant-xponents-core</artifactId>
<version>3.7.0</version>
<version>3.7.1</version>
<packaging>jar</packaging>
<name>OpenSextant Xponents Core API</name>
<description>An information extraction toolkit focused on geography and temporal entities</description>
Expand Down
168 changes: 83 additions & 85 deletions Core/src/main/java/org/opensextant/util/GeonamesUtility.java
Original file line number Diff line number Diff line change
Expand Up @@ -144,40 +144,39 @@ private void loadFeatureMetaMap() throws IOException {
* @throws IOException if timeZones.txt is not found or has an issue.
*/
private void loadCountryTimezones() throws IOException {
java.io.InputStream io = getClass().getResourceAsStream("/geonames.org/timeZones.txt");
java.io.Reader tzReader = new InputStreamReader(io);
CsvMapReader tzMap = new CsvMapReader(tzReader, CsvPreference.TAB_PREFERENCE);
String[] columns = tzMap.getHeader(true);
Map<String, String> tzdata = null;
String gmtCol = null;
String dstCol = null;
for (String col : columns) {
if (col.startsWith("GMT ")) {
gmtCol = col;
} else if (col.startsWith("DST ")) {
dstCol = col;
try(InputStream io = getClass().getResourceAsStream("/geonames.org/timeZones.txt")) {
java.io.Reader tzReader = new InputStreamReader(io);
CsvMapReader tzMap = new CsvMapReader(tzReader, CsvPreference.TAB_PREFERENCE);
String[] columns = tzMap.getHeader(true);
Map<String, String> tzdata = null;
String gmtCol = null;
String dstCol = null;
for (String col : columns) {
if (col.startsWith("GMT ")) {
gmtCol = col;
} else if (col.startsWith("DST ")) {
dstCol = col;
}
}
}
if (dstCol == null || gmtCol == null) {
tzMap.close();
throw new IOException("Bad Timezone file format from geonames.org -- changes yearly");
}
while ((tzdata = tzMap.read(columns)) != null) {
String cc = tzdata.get("CountryCode");
if (cc.trim().startsWith("#")) {
continue;
if (dstCol == null || gmtCol == null) {
throw new IOException("Bad Timezone file format from geonames.org -- changes yearly");
}
while ((tzdata = tzMap.read(columns)) != null) {
String cc = tzdata.get("CountryCode");
if (cc.trim().startsWith("#")) {
continue;
}

Country C = getCountry(cc);
if (C == null) {
continue;
}
Country C = getCountry(cc);
if (C == null) {
continue;
}

Country.TZ tz = new Country.TZ(tzdata.get("TimeZoneId"), tzdata.get(gmtCol), tzdata.get(dstCol),
tzdata.get("rawOffset (independant of DST)"));
C.addTimezone(tz);
Country.TZ tz = new Country.TZ(tzdata.get("TimeZoneId"), tzdata.get(gmtCol), tzdata.get(dstCol),
tzdata.get("rawOffset (independant of DST)"));
C.addTimezone(tz);
}
}
tzMap.close();

// Add all TZ to countries;
for (String cc : isoCountries.keySet()) {
Expand Down Expand Up @@ -476,75 +475,74 @@ public static List<Place> loadMajorCities(InputStream strm) throws IOException {
}

private void loadCountryNameMap() throws IOException {
java.io.InputStream io = getClass().getResourceAsStream("/country-names-2021.csv");
java.io.Reader countryIO = new InputStreamReader(io);
CsvMapReader countryMap = new CsvMapReader(countryIO, CsvPreference.EXCEL_PREFERENCE);
String[] columns = countryMap.getHeader(true);
Map<String, String> country_names = null;
while ((country_names = countryMap.read(columns)) != null) {
String n = country_names.get("country_name");
String cc = country_names.get("ISO2_cc");
String iso3 = country_names.get("ISO3_cc");
String fips = country_names.get("FIPS_cc");

if (n == null || cc == null) {
continue;
}
try (InputStream io = getClass().getResourceAsStream("/country-names-2021.csv")) {
java.io.Reader countryIO = new InputStreamReader(io);
CsvMapReader countryMap = new CsvMapReader(countryIO, CsvPreference.EXCEL_PREFERENCE);
String[] columns = countryMap.getHeader(true);
Map<String, String> country_names = null;
while ((country_names = countryMap.read(columns)) != null) {
String n = country_names.get("country_name");
String cc = country_names.get("ISO2_cc");
String iso3 = country_names.get("ISO3_cc");
String fips = country_names.get("FIPS_cc");

if (n == null || cc == null) {
continue;
}

double lat = Double.parseDouble(country_names.get("latitude"));
double lon = Double.parseDouble(country_names.get("longitude"));
double lat = Double.parseDouble(country_names.get("latitude"));
double lon = Double.parseDouble(country_names.get("longitude"));

cc = cc.toUpperCase();
fips = fips.toUpperCase();
cc = cc.toUpperCase();
fips = fips.toUpperCase();

// Unique Name? E.g., "Georgia" country name is not unique.
// This flag helps inform Disambiguation choose countries and places.
boolean isUniq = Boolean.parseBoolean(country_names.get("is_unique_name"));
boolean isTerr = Boolean.parseBoolean(country_names.get("territory"));
// Unique Name? E.g., "Georgia" country name is not unique.
// This flag helps inform Disambiguation choose countries and places.
boolean isUniq = Boolean.parseBoolean(country_names.get("is_unique_name"));
boolean isTerr = Boolean.parseBoolean(country_names.get("territory"));

// FIPS could be *, but as long as we use ISO2, we're fine.
// if ("*".equals(cc)) { cc = fips.toUpperCase(); }
// FIPS could be *, but as long as we use ISO2, we're fine.
// if ("*".equals(cc)) { cc = fips.toUpperCase(); }

// Normalize: "US" => "united states of america"
defaultCountryNames.put(cc, n.toLowerCase(Locale.ENGLISH));
// Normalize: "US" => "united states of america"
defaultCountryNames.put(cc, n.toLowerCase(Locale.ENGLISH));

Country C = new Country(cc, n);
C.CC_FIPS = fips;
C.CC_ISO2 = cc;
C.CC_ISO3 = iso3;
C.setUniqueName(isUniq);
C.isTerritory = isTerr;
C.setLatitude(lat);
C.setLongitude(lon);
Country C = new Country(cc, n);
C.CC_FIPS = fips;
C.CC_ISO2 = cc;
C.CC_ISO3 = iso3;
C.setUniqueName(isUniq);
C.isTerritory = isTerr;
C.setLatitude(lat);
C.setLongitude(lon);


// TODO: Resolve the code mapping situation for simple lookups.
// FIPS -> ISO mapping is 1:1
fips2iso.put(fips, cc);
if (!C.isTerritory || (!iso2fips.containsKey(cc) && !iso2fips.containsKey(iso3) && C.isTerritory)) {
// ISO -> FIPS is 1 : many, so only map it here if it is unique.
iso2fips.put(cc, fips); // ISO2
iso2fips.put(iso3, fips);
} else {
logger.debug("Territory not mapped in iso/fips {}, {}", fips, cc);
}
// TODO: Resolve the code mapping situation for simple lookups.
// FIPS -> ISO mapping is 1:1
fips2iso.put(fips, cc);
if (!C.isTerritory || (!iso2fips.containsKey(cc) && !iso2fips.containsKey(iso3) && C.isTerritory)) {
// ISO -> FIPS is 1 : many, so only map it here if it is unique.
iso2fips.put(cc, fips); // ISO2
iso2fips.put(iso3, fips);
} else {
logger.debug("Territory not mapped in iso/fips {}, {}", fips, cc);
}

// ISO
if (!C.isTerritory) {
isoCountries.put(cc, C);
isoCountries.put(iso3, C);
}
// ISO
if (!C.isTerritory) {
isoCountries.put(cc, C);
isoCountries.put(iso3, C);
}

// FIPS -- mostly unique.
if (!fips.equals("*")) {
fipsCountries.put(fips, C);
}
// FIPS -- mostly unique.
if (!fips.equals("*")) {
fipsCountries.put(fips, C);
}

countries.add(C);
countries.add(C);
}
}

countryMap.close();

if (defaultCountryNames.isEmpty()) {
throw new IOException("No data found in country name map");
}
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.opensextant</groupId>
<artifactId>opensextant-xponents</artifactId>
<version>3.7.0</version>
<version>3.7.1</version>
<packaging>jar</packaging>
<name>OpenSextant Xponents</name>
<description>An information extraction toolkit focused on geography and temporal entities</description>
Expand Down Expand Up @@ -53,7 +53,7 @@
<slf4j.version>2.0.12</slf4j.version>
<log4j.version>2.23.0</log4j.version>
<restlet.version>2.4.3</restlet.version>
<xponents.version>3.7.0</xponents.version>
<xponents.version>3.7.1</xponents.version>
</properties>
<!-- OSS Sonatype instructions: list repositories -->
<distributionManagement>
Expand Down
6 changes: 6 additions & 0 deletions solr/etc/gazetteer/filters/non-placenames,acronym.csv
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@ API,
ARP,
ARS,
ASA,
ASC,
ASAT,
ASM,
ASN,
ASOT,
ASP,
ATC,
ATK,
Expand Down Expand Up @@ -63,9 +66,11 @@ COB,
COE,
COM,
COO,
COR,
COS,
CSC,
CTO,
CUI,
DAP,
DB,
DBA,
Expand Down Expand Up @@ -147,6 +152,7 @@ KWH,
KPI,
LAN,
LEU,
LIDAR,
LLC,
LOE,
MBA,
Expand Down
2 changes: 1 addition & 1 deletion solr/etc/gazetteer/filters/non-placenames,deu.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
exclusion,category
exclusion,category
ach,phrase.deu
acht,noun.deu
ahorn,noun.deu
Expand Down
88 changes: 44 additions & 44 deletions solr/etc/gazetteer/filters/non-placenames,rus,ukr.csv
Original file line number Diff line number Diff line change
@@ -1,75 +1,75 @@
exclusion,category
русский,language
але,stopword
без,stopword
де,stopword
так,stopword
усі,stopword
бо,stopword
за,stopword
на,stopword
нан,stopword
по,stopword
от,stopword
привет,greeting
полезная,adjective
юг,directional
privet,greeting
priviet,greeting
полезная,adjective
страна,place.general
оон,stopword
как,stopword
эта,stopword
привет,greeting
русский,language
ага,noun
белые,noun
большой,noun
вам,noun
второй,noun
главный,noun
главный,noun
движки,noun
деньги,noun
жабер,noun
жалобы,noun
защита,noun
или,noun
мда,noun
жабер,noun
вам,noun
ниша,noun
нем,noun
ниша,noun
нужна,noun
обзор,noun
основной,noun
ошибка,noun
центр,noun
хорошая,noun
ага,noun
первый,noun
правда,noun
раздел,noun
нужна,noun
деньги,noun
защита,noun
жалобы,noun
основной,noun
белые,noun
движки,noun
второй,noun
главный,noun
сети,noun
обзор,noun
первый,noun
главный,noun
система,noun
большой,noun
старт,noun
там,stopword
до,stopword
наш,stopword
но,stopword
об,stopword
оп,pronoun
хорошая,noun
центр,noun
страна,place.general
он,pronoun
они,pronoun
юг,directional
оп,pronoun
але,stopword
без,stopword
би,stopword
бо,stopword
ва,stopword
ви,stopword
гу,stopword
де,stopword
до,stopword
до,stopword
ек,stopword
за,stopword
ин,stopword
ит,stopword
как,stopword
ко,stopword
мо,stopword
на,stopword
нан,stopword
наш,stopword
не,stopword
но,stopword
но,stopword
об,stopword
об,stopword
од,stopword
он,stopword
оон,stopword
от,stopword
по,stopword
так,stopword
там,stopword
те,stopword
то,stopword
усі,stopword
эта,stopword
Loading

0 comments on commit 5daef24

Please sign in to comment.