Skip to content

Commit

Permalink
Issue #59: country and location bias
Browse files Browse the repository at this point in the history
  • Loading branch information
mubaldino committed Oct 14, 2020
1 parent ac828db commit 950d9a4
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 47 deletions.
11 changes: 11 additions & 0 deletions Core/src/main/java/org/opensextant/processing/Parameters.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
package org.opensextant.processing;

import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.joda.time.format.DateTimeFormat;
Expand Down Expand Up @@ -81,6 +83,15 @@ public class Parameters extends java.util.Properties {
public String outputFile = null;

private Set<String> formats = new HashSet<String>();

/**
* A way of relaying arbitrary geographic filters to an extraction routine. These cues indicate which
* candidate answers should be preferred as tie-breakers during disambiguation.
*
* "countries" = [c1, c2, c3, ...]
* "geohashes" = [g1, g2, g3, ...]
*/
public HashMap<String, List<String>> preferredGeography = new HashMap<>();

/** You the caller must explicitly set isdefault = false;
* forcing you to actually look at these parameters.
Expand Down
19 changes: 16 additions & 3 deletions src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,16 @@ private void reset() {
private boolean geocode = true;
private boolean tagOnly = !geocode;


/**
 * Default extraction routine: delegates to {@link #extract(TextInput, Parameters)} with no job
 * parameters. If you need to tune extraction call <code>extract( input, parameters )</code> instead.
 *
 * @param input the text input to process
 * @return the matches produced by the two-argument extract routine
 * @throws ExtractionException on err
 */
@Override
public List<TextMatch> extract(TextInput input) throws ExtractionException {
    // A null Parameters value means "no job-specific tuning".
    final Parameters noJobParams = null;
    return extract(input, noJobParams);
}

/**
* Extractor.extract() calls first XCoord to get coordinates, then PlacenameMatcher In the end you
* have all geo entities ranked and scored.
Expand All @@ -462,10 +472,13 @@ private void reset() {
* @return TextMatch instances which are all PlaceCandidates.
* @throws ExtractionException on err
*/
@Override
public List<TextMatch> extract(TextInput input) throws ExtractionException {
public List<TextMatch> extract(TextInput input, Parameters jobParams) throws ExtractionException {
long t1 = System.currentTimeMillis();
reset();

if (jobParams != null) {
this.setAllowLowerCase(jobParams.tag_lowercase);
}

List<TextMatch> matches = new ArrayList<TextMatch>();
List<TextMatch> coordinates = null;
Expand Down Expand Up @@ -522,7 +535,7 @@ public List<TextMatch> extract(TextInput input) throws ExtractionException {
// Last rule: score, choose, add confidence.
//
chooser.setTextCase(input.isLower ? GeocodeRule.LOWERCASE : 0);
chooser.evaluate(candidates);
chooser.evaluate(candidates, jobParams);
if (provinceNameSetter != null) {
provinceNameSetter.evaluate(candidates);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.opensextant.extractors.geo.rules;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

Expand All @@ -11,6 +12,7 @@
import org.opensextant.extractors.geo.PlaceCount;
import org.opensextant.extractors.geo.PlaceEvidence;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.processing.Parameters;
import org.opensextant.util.GeodeticUtility;

/**
Expand All @@ -34,6 +36,8 @@ public class LocationChooserRule extends GeocodeRule {
private Map<String, PlaceCount> boundaryContext = null;
private Map<String, PlaceCount> namespace = new HashMap<>();
private HashMap<String, CountryCount> inferredCountries = new HashMap<>();
private HashSet<String> preferredCountries = new HashSet<>();
private HashSet<String> preferredLocations = new HashSet<>();

private int textCase = 0;

Expand All @@ -53,12 +57,19 @@ public void reset() {
documentCandidates.clear();
namespace.clear();
inferredCountries.clear();
preferredCountries.clear();
preferredLocations.clear();
}

/**
 * Evaluate the candidate list without any caller-supplied preferences.
 *
 * @param names the place candidates to evaluate
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    // Typed null selects the (List, Parameters) overload, same as an explicit cast.
    final Parameters noPreferences = null;
    evaluate(names, noPreferences);
}

/**
* Walk the entire list.
*/
public void evaluate(List<PlaceCandidate> names) {
public void evaluate(List<PlaceCandidate> names, Parameters preferences) {

// INPUTS:
// histogram of country mentions
Expand All @@ -71,6 +82,16 @@ public void evaluate(List<PlaceCandidate> names) {
//
countryContext = countryObserver.countryMentionCount();
boundaryContext = boundaryObserver.placeMentionCount();
//
// PREFS:
if (preferences != null) {
if (preferences.preferredGeography.containsKey("countries")) {
preferredCountries.addAll(preferences.preferredGeography.get("countries"));
}
if (preferences.preferredGeography.containsKey("geohashes")) {
preferredLocations.addAll(preferences.preferredGeography.get("geohashes"));
}
}

/* TODO: DEBUG through location chooser using histograms
* of found and resolved place metadata.
Expand Down Expand Up @@ -190,6 +211,13 @@ private void debuggingHistograms(List<PlaceCandidate> names) {
*/
private static final int GLOBAL_POINTS = 5;

/**
 * Preferred Country or Location -- when the user supplies context that may be missing from the
 * text, we accept that and weight such a preference higher. These are rule names recorded on a
 * candidate via addRule() and later checked via hasRule(); they are constants and must not be
 * reassigned, hence final.
 */
public static final String PREF_COUNTRY = "PreferredCountry";
public static final String PREF_LOCATION = "PreferredLocation";

/**
* Yet unchosen location. Consider given evidence first, creating some weight there, then
* introducing innate properties of possible locations, thereby amplifying the differences in the
Expand All @@ -199,7 +227,30 @@ private void debuggingHistograms(List<PlaceCandidate> names) {
@Override
public void evaluate(PlaceCandidate name, Place geo) {

// With "preferred geography" we can influence in a subtle fashion ambiguous mentions, e.g.,
// If known geography is Ohio and we see mentions of Springfield without other context, we can
// nudge choice of Springfield, OH as such. Such as with a preferred location (geohash).

if (preferredCountries != null && !preferredCountries.isEmpty()) {
if (preferredCountries.contains(geo.getCountryCode())) {
// Get a half-point for being within the country
name.incrementPlaceScore(geo, 0.5);
name.addRule(PREF_COUNTRY);
}
}
if (preferredLocations != null && !preferredLocations.isEmpty()) {
for (String gh : preferredLocations) {
if (geo.getGeohash().startsWith(gh)) {
// Increment a full point for being within the geohash. Note geohash length of 4 or more chars is reasonably good resolution.
name.incrementPlaceScore(geo, 1.0);
name.addRule(PREF_LOCATION);
}
}
}

if (boundaryContext.isEmpty() && countryContext.isEmpty()) {
// So without context, there is nothing more we can do to influence the connection between
// the one named place and the candidate location
return;
}

Expand Down Expand Up @@ -275,8 +326,8 @@ public void evaluate(PlaceCandidate name, Place geo) {
public static final int MATCHCONF_NAME_REGION = 75;

/**
* Absolute Confidence: Unique name in gazetteer.
* Confidence is high, however this needs to be tempered by the number of gazetteers, coverage, and diversity
* Absolute Confidence: Unique name in gazetteer. Confidence is high, however this needs to be
* tempered by the number of gazetteers, coverage, and diversity
*/
public static final int MATCHCONF_ONE_LOC = 70;

Expand Down Expand Up @@ -309,6 +360,12 @@ public void evaluate(PlaceCandidate name, Place geo) {
*/
public static final int MATCHCONF_QUALIFIER_LOWERCASE = -15;

/**
* A subtle boost for locations that were preferred -- especially helps when there is no inherent
* context and we must rely on the caller's intuition.
*/
public static final int MATCHCONF_PREFERRED = 5;

/**
 * Tests whether a match length is at or below {@code NonsenseFilter.GENERIC_ONE_WORD}.
 *
 * @param matchLen length of the matched text
 * @return true if matchLen does not exceed the threshold
 */
private static boolean isShort(int matchLen) {
    final int threshold = NonsenseFilter.GENERIC_ONE_WORD;
    if (matchLen > threshold) {
        return false;
    }
    return true;
}
Expand Down Expand Up @@ -380,7 +437,7 @@ public void assessConfidence(PlaceCandidate pc) {
if (fc != null) {
featWeight = fc.factor;
}
points = (int)((0.75 * points) + (0.25 * points * featWeight));
points = (int) ((0.75 * points) + (0.25 * points * featWeight));

// Any of these may occur.
//======================
Expand Down Expand Up @@ -457,6 +514,13 @@ public void assessConfidence(PlaceCandidate pc) {
points += pc.getLength() - 4;
}

if (pc.hasRule(PREF_COUNTRY)) {
points += MATCHCONF_PREFERRED;
}
if (pc.hasRule(PREF_LOCATION)) {
points += MATCHCONF_PREFERRED;
}

pc.setConfidence(points);
}

Expand Down
27 changes: 26 additions & 1 deletion src/main/java/org/opensextant/xlayer/server/TaggerResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@

import static org.apache.commons.lang3.StringUtils.isNotBlank;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.opensextant.data.TextInput;
Expand Down Expand Up @@ -138,6 +142,20 @@ protected void resetParameters(Parameters job) {
job.tag_patterns = false;
job.addOutputFormat("json");
}

/**
 * Converts a JSON array into a list of strings.
 *
 * Elements are converted with toString() rather than cast, so a client that sends numbers or
 * booleans does not trigger a ClassCastException. JSON null elements are skipped so that null
 * values never reach the caller's list.
 *
 * @param a JSONArray of values; may be null
 * @return a new mutable list of the non-null elements as strings; empty if the array is null or empty
 */
protected List<String> fromArray(JSONArray a) {
    List<String> strings = new ArrayList<>();
    if (a == null) {
        return strings;
    }
    Iterator<Object> iter = a.iterator();
    while (iter.hasNext()) {
        Object item = iter.next();
        // JSON null may surface as Java null or as the JSONObject.NULL sentinel; skip both.
        if (item == null || JSONObject.NULL.equals(item)) {
            continue;
        }
        // For a String element toString() returns the same value the original cast produced.
        strings.add(item.toString());
    }
    return strings;
}

/**
*
Expand Down Expand Up @@ -181,7 +199,14 @@ protected Parameters fromRequest(JSONObject inputs) throws JSONException {
job.tag_lowercase = opts.contains("lowercase");
job.resolve_localities = opts.contains("revgeo") || opts.contains("resolve_localities");
}

//
// Geographic filters
if (inputs.has("preferred_countries")) {
job.preferredGeography.put("countries", fromArray(inputs.getJSONArray("preferred_countries")));
}
if (inputs.has("preferred_locations")) {
job.preferredGeography.put("geohashes", fromArray(inputs.getJSONArray("preferred_locations")));
}
if (job.clean_input || job.tag_lowercase) {
job.isdefault = false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public XponentsGeotagger() {
}

/**
* get Xponents Extractor object from global attributes.
* get Xponents Extractor object from global attributes.
*/
public Extractor getExtractor(String xid) {
Object X = this.getApplication().getContext().getAttributes().get(xid);
Expand All @@ -60,27 +60,22 @@ public Extractor getExtractor(String xid) {
}

/**
* Contract:
* docid optional; 'text' | 'doc-list' required.
* command: cmd=ping sends back a simple response
*
* text = UTF-8 encoded text
* docid = user's provided document ID
* doc-list = An array of text
*
* cmd=ping = report status.
*
* Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...',...] }
* The entire array must be parsable in memory as a single, traversible JSON object.
* We make no assumption about one-JSON object per line or anything about line-endings as separators.
*
*
* @param params
* the params
* @return the representation
* @throws JSONException
* the JSON exception
*/
* Contract: docid optional; 'text' | 'doc-list' required. command: cmd=ping sends back a simple
* response
*
* text = UTF-8 encoded text docid = user's provided document ID doc-list = An array of text
*
* cmd=ping = report status.
*
* Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...'}, ...] } The
* entire array must be parsable in memory as a single, traversable JSON object. We make no
* assumption about one JSON object per line or anything about line-endings as separators.
*
*
* @param params JSON parameters per REST API: docid, text, lang, features, options, and preferred_*
* @return the representation
* @throws JSONException the JSON exception
*/
@Post("application/json;charset=utf-8")
public Representation processForm(JsonRepresentation params) throws JSONException {
org.json.JSONObject json = params.getJsonObject();
Expand All @@ -100,16 +95,14 @@ public Representation processForm(JsonRepresentation params) throws JSONExceptio
}

/**
* HTTP GET -- vanilla. Do not use in production, unless you have really small data packages.
* This is useful for testing. Partial contract:
*
* miscellany: 'cmd' = 'ping' |... other commands.
* processing: 'docid' = ?, 'text' = ?
*
* @param params
* the params
* @return the representation
*/
* HTTP GET -- vanilla. Do not use in production, unless you have really small data packages. This
* is useful for testing. Partial contract:
*
* miscellany: 'cmd' = 'ping' |... other commands. processing: 'docid' = ?, 'text' = ?
*
* @param params JSON parameters. see process()
* @return the representation
*/
@Get
public Representation processGet(Representation params) {
Form inputs = getRequest().getResourceRef().getQueryAsForm();
Expand Down Expand Up @@ -140,10 +133,8 @@ public Representation process(TextInput input, Parameters jobParams) {
try {
if (prodMode) {
PlaceGeocoder xgeo = (PlaceGeocoder) getExtractor("xgeo");
xgeo.setAllowLowerCase(jobParams.tag_lowercase);
List<TextMatch> matches = xgeo.extract(input, jobParams);

List<TextMatch> matches = xgeo.extract(input);

if (jobParams.tag_patterns) {
XTemporal xt = (XTemporal) getExtractor("xtemp");
matches.addAll(xt.extract(input));
Expand All @@ -169,7 +160,7 @@ public Representation process(TextInput input, Parameters jobParams) {
/**
* Format matches as JSON
*
* @param matches items to format
* @param matches items to format
* @param jobParams parameters
* @return formatted json
* @throws JSONException on format error
Expand All @@ -184,9 +175,9 @@ private Representation format(List<TextMatch> matches, Parameters jobParams) thr
}

/**
* @param params parameters
* @param variousMatches matches to filter
*/
* @param params parameters
* @param variousMatches matches to filter
*/
public void filter(List<TextMatch> variousMatches, Parameters params) {
// Determine what looks useful. Filter out things not worth
// saving at all in data store.
Expand Down

0 comments on commit 950d9a4

Please sign in to comment.