Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several Changes leprinco #152

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
591 changes: 308 additions & 283 deletions src/main/java/no/priv/garshol/duke/ConfigLoader.java

Large diffs are not rendered by default.

1,434 changes: 688 additions & 746 deletions src/main/java/no/priv/garshol/duke/Processor.java

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions src/main/java/no/priv/garshol/duke/RecordLinkageStrategy.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package no.priv.garshol.duke;

import java.util.Collection;

import no.priv.garshol.duke.matchers.MatchListener;

/**
 * Extension point for customizing how a single record is linked against
 * its candidate records.
 *
 * <p>More advanced record linkage approaches exist but are not
 * implemented yet; see the links below for background.
 *
 * <ul>
 *   <li>http://code.google.com/p/duke/issues/detail?id=55</li>
 *   <li>http://research.microsoft.com/pubs/153478/msr-report-1to1.pdf</li>
 * </ul>
 */
public interface RecordLinkageStrategy {

/**
 * Compares one record against its candidates and reports the outcome.
 *
 * @param processor  The processor used to score record pairs and to
 *                   notify the registered {@link MatchListener}s
 * @param config     The {@link Configuration}
 * @param record     The record to match
 * @param candidates The possible candidates for {@code record}
 */
void compare(Processor processor, Configuration config, Record record, Collection<Record> candidates);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package no.priv.garshol.duke.recordlinkage;

import java.util.Collection;

import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.Processor;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.RecordLinkageStrategy;

/**
 * Greedy record linkage strategy: for each record only the single
 * highest-scoring candidate is considered, and it is reported as a match,
 * a possible match, or no match depending on the configured thresholds.
 */
public class RecordLinkageBestStrategy implements RecordLinkageStrategy {

  /**
   * Scores every candidate against {@code record}, keeps the one with the
   * highest score and reports exactly one outcome to the processor.
   *
   * @param processor  used to score pairs and to register the outcome
   * @param config     supplies the match and maybe-match thresholds
   * @param record     the record being linked
   * @param candidates the records to compare {@code record} against
   */
  @Override
  public void compare(Processor processor, Configuration config, Record record,
                      Collection<Record> candidates) {

    Record winner = null;
    double topScore = 0.0;

    // scan the candidates, remembering the highest-scoring one
    for (Record other : candidates) {
      if (!processor.isSameAs(record, other)) {
        double score = processor.compare(record, other);
        if (score > topScore) {
          topScore = score;
          winner = other;
        }
      }
    }

    // certain match: the top score clears the main threshold
    if (topScore > config.getThreshold()) {
      processor.registerMatch(record, winner, topScore);
      return;
    }

    // possible match: a maybe-threshold of 0.0 means the feature is off
    if (config.getMaybeThreshold() != 0.0 && topScore > config.getMaybeThreshold()) {
      processor.registerMatchPerhaps(record, winner, topScore);
      return;
    }

    processor.registerNoMatchFor(record);
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package no.priv.garshol.duke.recordlinkage;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.Processor;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.RecordLinkageStrategy;

/**
 * Mixed record linkage strategy. If at least one candidate scores above
 * the match threshold, only the best such candidate is reported as a
 * match. Otherwise every candidate scoring above the maybe-threshold is
 * reported as a possible match. If neither applies, the record is
 * reported as having no match.
 */
public class RecordLinkageMixStrategy implements RecordLinkageStrategy {

  /**
   * Scores every candidate against {@code record} and reports either the
   * single best certain match, all possible matches, or no match.
   *
   * @param processor  used to score pairs and to register the outcome
   * @param config     supplies the match and maybe-match thresholds
   * @param record     the record being linked
   * @param candidates the records to compare {@code record} against
   */
  @Override
  public void compare(Processor processor, Configuration config, Record record, Collection<Record> candidates) {

    Record winner = null;
    double topScore = 0.0;

    // parallel lists: uncertainScores.get(i) is the score of uncertain.get(i)
    List<Record> uncertain = new ArrayList<Record>();
    List<Double> uncertainScores = new ArrayList<Double>();

    for (Record other : candidates) {
      if (!processor.isSameAs(record, other)) {
        double score = processor.compare(record, other);

        if (score > config.getThreshold()) {
          // certain match: keep only the highest-scoring one
          if (score > topScore) {
            topScore = score;
            winner = other;
          }
        } else if (config.getMaybeThreshold() != 0.0 && score > config.getMaybeThreshold()) {
          // possible match: collect all of them
          uncertain.add(other);
          uncertainScores.add(score);
        }
      }
    }

    // notify MatchListeners; a certain match takes precedence over maybes
    if (winner != null) {
      processor.registerMatch(record, winner, topScore);
      return;
    }

    if (uncertain.isEmpty()) {
      processor.registerNoMatchFor(record);
      return;
    }

    for (int ix = 0; ix < uncertain.size(); ix++) {
      processor.registerMatchPerhaps(record, uncertain.get(ix), uncertainScores.get(ix));
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package no.priv.garshol.duke.recordlinkage;

import java.util.Collection;

import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.RecordLinkageStrategy;
import no.priv.garshol.duke.Processor;
import no.priv.garshol.duke.Record;

/**
 * Simple strategy used for deduplication, where we simply want every
 * candidate that scores above one of the thresholds to be reported.
 */
public class RecordLinkageSimpleStrategy implements RecordLinkageStrategy {

  /**
   * Scores every candidate against {@code record} and reports each one
   * above the match threshold as a match and each one above the
   * maybe-threshold as a possible match. If nothing was reported, the
   * record is registered as having no match.
   *
   * @param processor  used to score pairs and to register the outcomes
   * @param config     supplies the match and maybe-match thresholds
   * @param record     the record being compared
   * @param candidates the records to compare {@code record} against
   */
  @Override
  public void compare(Processor processor, Configuration config, Record record, Collection<Record> candidates) {

    boolean anyReported = false;

    for (Record other : candidates) {
      if (!processor.isSameAs(record, other)) {
        double score = processor.compare(record, other);

        if (score > config.getThreshold()) {
          processor.registerMatch(record, other, score);
          anyReported = true;
        } else if (config.getMaybeThreshold() != 0.0 && score > config.getMaybeThreshold()) {
          // a possible match also suppresses the no-match notification
          processor.registerMatchPerhaps(record, other, score);
          anyReported = true;
        }
      }
    }

    if (anyReported) {
      return;
    }
    processor.registerNoMatchFor(record);
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* [email protected]
*/
package no.priv.garshol.duke.transforms;

import java.util.ArrayList;
import java.util.List;

import no.priv.garshol.duke.DataSource;
import no.priv.garshol.duke.Logger;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.RecordIterator;

/**
 * A wrapper around a DataSource that transforms each of its records via a
 * list of operations, applied in the order they were added.
 */
public class TransformDataSource implements DataSource {

  /** The DataSource to transform */
  protected DataSource transformedDataSource;

  /** Operations to apply on Records, in insertion order */
  protected List<TransformOperation> operations = new ArrayList<TransformOperation>();

  /**
   * Default constructor
   * @param source The DataSource to be transformed
   */
  public TransformDataSource(DataSource source) {
    this.transformedDataSource = source;
  }

  /**
   * Returns an iterator over the wrapped source's records in which every
   * non-null record has been passed through all registered operations.
   * Lifecycle calls on the returned iterator are forwarded to the wrapped
   * iterator.
   *
   * @return the transforming iterator
   */
  @Override
  public RecordIterator getRecords() {
    final RecordIterator srciter = transformedDataSource.getRecords();
    return new RecordIterator() {
      @Override
      public Record next() {
        Record r = srciter.next();
        if (r != null) {
          // each operation receives the output of the previous one
          for (TransformOperation op : operations) {
            r = op.transform(r);
          }
        }
        return r;
      }

      @Override
      public boolean hasNext() {
        return srciter.hasNext();
      }

      // Forward lifecycle notifications: without these the wrapped
      // iterator would never be told to release what it holds.
      @Override
      public void close() {
        srciter.close();
      }

      @Override
      public void batchProcessed() {
        srciter.batchProcessed();
      }
    };
  }

  /**
   * Just cascade the logger to the wrapped DataSource
   * @param logger The logger to pass on
   */
  @Override
  public void setLogger(Logger logger) {
    transformedDataSource.setLogger(logger);
  }

  /**
   * Add an operation; it is applied after all previously added ones
   * @param oper The TransformOperation
   */
  public void addOperation(TransformOperation oper) {
    operations.add(oper);
  }

  /**
   * @return the wrapped DataSource
   */
  public DataSource getTransformedDataSource() {
    return transformedDataSource;
  }

  /**
   * @return the live list of operations (not a copy)
   */
  public List<TransformOperation> getOperations() {
    return operations;
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* [email protected]
*/
package no.priv.garshol.duke.transforms;

import no.priv.garshol.duke.Record;

/**
 * An operation applied to a Record to transform it.
 */
public interface TransformOperation {

/**
 * Transform the record.
 * @param r The record to transform
 * @return The transformed record
 */
Record transform(Record r);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* [email protected]
*/
package no.priv.garshol.duke.transforms;

import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.utils.StringUtils;

/**
 * A TransformOperation that adds one extra property to a record, whose
 * value is the non-empty values of a list of other properties joined
 * together with a separator.
 */
public class TransformOperationJoin implements TransformOperation {

  /** Name of the property that receives the joined value */
  protected String resultingProperty;

  /** Names of the properties whose values are joined, in order */
  protected String[] properties;

  /** Separator placed between joined values; a single space by default */
  protected String joiner = " ";

  /**
   * Builds the joined value from the configured properties, skipping
   * null and empty values, and wraps the record together with the
   * resulting property.
   *
   * @see no.priv.garshol.duke.transforms.TransformOperation#transform(no.priv.garshol.duke.Record)
   */
  @Override
  public Record transform(Record record) {
    StringBuilder joined = new StringBuilder();

    for (String name : properties) {
      String value = record.getValue(name);
      if (value == null || value.isEmpty()) {
        continue;
      }
      // only non-empty values are appended, so a non-empty buffer means
      // at least one value precedes this one and a separator is needed
      if (joined.length() > 0) {
        joined.append(joiner);
      }
      joined.append(value);
    }

    return new TransformedRecord(record, resultingProperty, joined.toString());
  }

  //--------------------------------- configuration --

  /**
   * @param resultingProperty the name of the property to add
   */
  public void setResultingProperty(String resultingProperty) {
    this.resultingProperty = resultingProperty;
  }

  /**
   * @param props the names of the properties to join, as a single string
   *              that is split with {@code StringUtils.split}
   */
  public void setProperties(String props) {
    this.properties = StringUtils.split(props);
  }

  /**
   * @param joiner the separator to place between joined values
   */
  public void setJoiner(String joiner) {
    this.joiner = joiner;
  }

}
Loading