From 0d59e02392599398b828ded94a0eeff1ffad2644 Mon Sep 17 00:00:00 2001 From: jerryye Date: Fri, 29 Aug 2014 15:04:25 -0700 Subject: [PATCH 1/5] Serialized library --- pom.xml | 6 +++ .../org/ahocorasick/interval/Interval.java | 18 ++++++- src/main/java/org/ahocorasick/trie/Emit.java | 22 +++++++- src/main/java/org/ahocorasick/trie/State.java | 50 ++++++++++++++++++- src/main/java/org/ahocorasick/trie/Trie.java | 35 ++++++++++++- .../java/org/ahocorasick/trie/TrieConfig.java | 36 ++++++++++++- .../java/org/ahocorasick/trie/TrieTest.java | 34 +++++++++++++ 7 files changed, 196 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 2f3a59b..aa43192 100644 --- a/pom.xml +++ b/pom.xml @@ -60,6 +60,12 @@ test + + org.apache.commons + commons-lang3 + 3.3.2 + + diff --git a/src/main/java/org/ahocorasick/interval/Interval.java b/src/main/java/org/ahocorasick/interval/Interval.java index c43dd7c..0c859fc 100644 --- a/src/main/java/org/ahocorasick/interval/Interval.java +++ b/src/main/java/org/ahocorasick/interval/Interval.java @@ -1,6 +1,10 @@ package org.ahocorasick.interval; -public class Interval implements Intervalable { +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.Field; + +public class Interval implements Intervalable, Serializable { private int start; private int end; @@ -61,4 +65,16 @@ public String toString() { return this.start + ":" + this.end; } + protected void writeObject(java.io.ObjectOutputStream stream) + throws IOException { + stream.writeInt(start); + stream.writeInt(end); + } + + protected void readObject(java.io.ObjectInputStream stream) + throws IOException, ClassNotFoundException, IllegalAccessException, NoSuchFieldException { + this.start = stream.readInt(); + this.end = stream.readInt(); + } + } diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java index 60c1f9e..096a30c 100644 --- a/src/main/java/org/ahocorasick/trie/Emit.java +++ b/src/main/java/org/ahocorasick/trie/Emit.java @@ -3,7 +3,11 @@ import org.ahocorasick.interval.Interval; import org.ahocorasick.interval.Intervalable; -public class Emit extends Interval implements Intervalable { +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.Field; + +public class Emit extends Interval implements Intervalable, Serializable { private final String keyword; @@ -21,4 +25,20 @@ public String toString() { return super.toString() + "=" + this.keyword; } + @Override + protected void writeObject(java.io.ObjectOutputStream stream) + throws IOException { + super.writeObject(stream); + stream.writeUTF(keyword); + } + + @Override + protected void readObject(java.io.ObjectInputStream stream) + throws IOException, ClassNotFoundException, IllegalAccessException, NoSuchFieldException { + Field f = this.getClass().getDeclaredField("keyword"); + super.readObject(stream); + f.setAccessible(true); + f.set(this, stream.readUTF()); + } + } diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index 9108838..51ae64f 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -1,6 +1,9 @@ package org.ahocorasick.trie; +import java.io.IOException; +import java.io.Serializable; import java.util.*; +import java.lang.reflect.Field; /** *

@@ -23,7 +26,7 @@ * * @author Robert Bor */ -public class State { +public class State implements Serializable, Comparable { /** effective the size of the keyword */ private final int depth; @@ -114,4 +117,49 @@ public Collection getTransitions() { return this.success.keySet(); } + private void writeObject(java.io.ObjectOutputStream stream) + throws IOException { + stream.writeInt(depth); + stream.writeObject(success); + stream.writeObject(failure); + stream.writeObject(emits); + } + + private void readObject(java.io.ObjectInputStream stream) + throws IOException, ClassNotFoundException, NoSuchFieldException, IllegalAccessException { + + // Use reflection to modify final field + Field f = this.getClass().getDeclaredField("depth"); + f.setAccessible(true); + f.set(this, stream.readInt()); + + f = this.getClass().getDeclaredField("rootState"); + f.setAccessible(true); + f.set(this, (depth == 0)?this:null); + success = (TreeMap) stream.readObject(); + failure = (State) stream.readObject(); + emits = (List) stream.readObject(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof State)) + return false; + return compareTo((State) obj) == 0; + } + + @Override + public int compareTo(State o) { + if (this.depth != o.depth) + return 1; + if ((this.depth == 0 && o.depth == 0) && (this.rootState != this || o.rootState != o)) + return 1; + if (!this.success.equals(o.success)) + return 1; + if (this.failure != null && o.failure != null && !this.failure.equals(o.failure)) + return 1; + if (this.emits != null && o.emits != null && !this.emits.equals(o.emits)) + return 1; + return 0; + } } diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index bf2948a..bfe4072 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -3,6 +3,8 @@ import org.ahocorasick.interval.IntervalTree; import org.ahocorasick.interval.Intervalable; +import java.io.IOException; +import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -14,7 +16,7 @@ * Based on the Aho-Corasick white paper, Bell technologies: ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf * @author Robert Bor */ -public class Trie { +public class Trie implements Serializable, Comparable { private TrieConfig trieConfig; @@ -185,4 +187,35 @@ private void storeEmits(int position, State currentState, List collectedEm } } + private void writeObject(java.io.ObjectOutputStream stream) + throws IOException { + stream.writeObject(trieConfig); + stream.writeObject(rootState); + stream.writeBoolean(failureStatesConstructed); + } + + private void readObject(java.io.ObjectInputStream stream) + throws IOException, ClassNotFoundException { + trieConfig = (TrieConfig) stream.readObject(); + rootState = (State) stream.readObject(); + failureStatesConstructed = stream.readBoolean(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Trie)) + return false; + return compareTo((Trie) obj) == 0; + } + + @Override + public int compareTo(Trie o) { + if (!this.trieConfig.equals(o.trieConfig)) + return 1; + if (!this.rootState.equals(o.rootState)) + return 1; + if (!this.failureStatesConstructed == o.failureStatesConstructed) + return 1; + return 0; + } } diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java index 6fa05c7..2623588 100644 --- a/src/main/java/org/ahocorasick/trie/TrieConfig.java +++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -1,6 +1,9 @@ package org.ahocorasick.trie; -public class TrieConfig { +import java.io.IOException; +import java.io.Serializable; + +public class TrieConfig implements Serializable, Comparable { private boolean allowOverlaps = true; @@ -31,4 +34,35 @@ public boolean isCaseInsensitive() { public void setCaseInsensitive(boolean caseInsensitive) { this.caseInsensitive = caseInsensitive; } + + private void writeObject(java.io.ObjectOutputStream stream) + throws IOException { + stream.writeBoolean(allowOverlaps); + stream.writeBoolean(onlyWholeWords); + stream.writeBoolean(caseInsensitive); + } + + private void readObject(java.io.ObjectInputStream stream) + throws IOException, ClassNotFoundException { + this.allowOverlaps = stream.readBoolean(); + this.onlyWholeWords = stream.readBoolean(); + this.caseInsensitive = stream.readBoolean(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof TrieConfig)) + return false; + return compareTo((TrieConfig) obj) == 0; + } + + @Override + public int compareTo(TrieConfig o) { + if (this.allowOverlaps != o.allowOverlaps || this.caseInsensitive != o.caseInsensitive || + this.onlyWholeWords != o.onlyWholeWords) { + return 1; + } else { + return 0; + } + } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index faa4689..294ef45 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -1,11 +1,13 @@ package org.ahocorasick.trie; +import org.apache.commons.lang3.SerializationUtils; import org.junit.Test; import java.util.Collection; import java.util.Iterator; import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertSame; public class TrieTest { @@ -200,6 +202,38 @@ public void zeroLengthTestBug7InGithubReportedByXCurry() { trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); } + @Test + public void trieSerialization() { + Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive(); + trie.addKeyword("san francisco"); + trie.addKeyword("ca"); + trie.addKeyword("oakland"); + Trie deserializedTrie = (Trie) SerializationUtils.clone(trie); + assertEquals(trie, deserializedTrie); + deserializedTrie.parseText("san francisco ca"); + deserializedTrie.parseText("San Francisco ca"); + } + + @Test + public void trieConfigSerialization() { + TrieConfig conf = new TrieConfig(); + conf.setAllowOverlaps(true); + conf.setOnlyWholeWords(true); + + TrieConfig deserializedConf = SerializationUtils.clone(conf); + assertEquals(conf, deserializedConf); + } + + @Test + public void stateSerialization() { + State state = new State(); + state.addEmit("san francisco"); + state.addEmit("ca"); + state.addEmit("oakland"); + State deserializedState = (State) SerializationUtils.clone(state); + assertEquals(state, deserializedState); + } + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { assertEquals(expectedStart, next.getStart()); assertEquals(expectedEnd, next.getEnd()); From 6890a98789e02ceb7205181df25b32b14df73288 Mon Sep 17 00:00:00 2001 From: jerryye Date: Wed, 10 Sep 2014 14:33:02 -0700 Subject: [PATCH 2/5] Fixed serialization issue with TreeMaps containing recursive references. --- src/main/java/org/ahocorasick/trie/State.java | 26 ++++++++++++++++--- src/main/java/org/ahocorasick/trie/Trie.java | 3 +-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index e2a1127..5fada7c 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -2,8 +2,11 @@ import java.io.IOException; import java.io.Serializable; +import java.lang.Character; +import java.lang.Integer; import java.util.*; import java.lang.reflect.Field; +import java.util.TreeMap; /** *

@@ -120,7 +123,16 @@ public Collection getTransitions() { private void writeObject(java.io.ObjectOutputStream stream) throws IOException { stream.writeInt(depth); - stream.writeObject(success); + stream.writeInt(success.size()); + for (Map.Entry e : success.entrySet()) { + stream.writeObject(e.getKey()); + if (e.getValue() == this) { + stream.writeObject(null); + } else { + stream.writeObject(e.getValue()); + } + } + stream.writeObject(failure); stream.writeObject(emits); } @@ -136,9 +148,17 @@ private void readObject(java.io.ObjectInputStream stream) f = this.getClass().getDeclaredField("rootState"); f.setAccessible(true); f.set(this, (depth == 0)?this:null); - success = (TreeMap) stream.readObject(); + int successSize = (Integer) stream.readInt(); + success = new TreeMap(); + for (int i = 0; i < successSize; i++) { + Character character = (Character) stream.readObject(); + State treeState = (State) stream.readObject(); + success.put(character, (treeState == null)?this:treeState); + } + + failure = (State) stream.readObject(); - emits = (List) stream.readObject(); + emits = (TreeSet) stream.readObject(); } @Override diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index eea8e10..f6663b5 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -190,14 +190,13 @@ private void writeObject(java.io.ObjectOutputStream stream) throws IOException { stream.writeObject(trieConfig); stream.writeObject(rootState); - stream.writeBoolean(failureStatesConstructed); } private void readObject(java.io.ObjectInputStream stream) throws IOException, ClassNotFoundException { trieConfig = (TrieConfig) stream.readObject(); rootState = (State) stream.readObject(); - failureStatesConstructed = stream.readBoolean(); + constructFailureStates(); } @Override From 407142761af7ba545bb5bae123ef2e6ad096c83e Mon Sep 17 00:00:00 2001 From: jerryye Date: Mon, 27 Apr 2015 17:46:54 -0700 Subject: [PATCH 3/5] fixed serialization issues with cyclic references --- src/main/java/org/ahocorasick/trie/State.java | 58 ++++++++++++++++--- src/main/java/org/ahocorasick/trie/Trie.java | 7 ++- .../java/org/ahocorasick/trie/StateTest.java | 19 +++++- .../java/org/ahocorasick/trie/TrieTest.java | 18 +++++- 4 files changed, 90 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index 5fada7c..ab2427b 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -49,6 +49,17 @@ public class State implements Serializable, Comparable { /** whenever this state is reached, it will emit the matches keywords for future reference */ private Set emits = null; + /** used for serialization */ + public static IdentityHashMap objectToReference = new IdentityHashMap(); + public static IdentityHashMap referenceToObject = new IdentityHashMap(); + public static int referenceCount = 1; + + public static void reset () { + objectToReference.clear(); + referenceToObject.clear(); + referenceCount = 1; + } + public State() { this(0); } @@ -126,10 +137,15 @@ private void writeObject(java.io.ObjectOutputStream stream) stream.writeInt(success.size()); for (Map.Entry e : success.entrySet()) { stream.writeObject(e.getKey()); - if (e.getValue() == this) { - stream.writeObject(null); - } else { + + Integer reference = objectToReference.get(e.getValue()); + if (reference == null) { + objectToReference.put(e.getValue(), ++referenceCount); + stream.writeInt(0); + stream.writeInt(referenceCount); stream.writeObject(e.getValue()); + } else { + stream.writeInt(reference); } } @@ -147,13 +163,21 @@ private void readObject(java.io.ObjectInputStream stream) f = this.getClass().getDeclaredField("rootState"); f.setAccessible(true); - f.set(this, (depth == 0)?this:null); + f.set(this, (depth == 0) ? this : null); int successSize = (Integer) stream.readInt(); success = new TreeMap(); for (int i = 0; i < successSize; i++) { Character character = (Character) stream.readObject(); - State treeState = (State) stream.readObject(); - success.put(character, (treeState == null)?this:treeState); + Integer reference = stream.readInt(); + State treeState = null; + if (reference == 0) { + Integer referenceID = stream.readInt(); + treeState = (State) stream.readObject(); + referenceToObject.put(referenceID, treeState); + } else { + treeState = (org.ahocorasick.trie.State) referenceToObject.get(reference); + } + success.put(character, treeState); } @@ -161,6 +185,24 @@ private void readObject(java.io.ObjectInputStream stream) emits = (TreeSet) stream.readObject(); } + private static IdentityHashMap equalityReferenceMap = new IdentityHashMap(); + + private boolean gotoEquality(Map mine, Map other) { + if (mine.size() != other.size()) return false; + Iterator otherEntrySet = other.entrySet().iterator(); + for (Map.Entry e : mine.entrySet()) { + Map.Entry otherE = (Map.Entry ) otherEntrySet.next(); + + if (!e.getKey().equals(otherE.getKey())) return false; + Integer reference = equalityReferenceMap.get(e.getValue()); + if (reference == null) { + equalityReferenceMap.put(e.getValue(), equalityReferenceMap.size() + 1); + if (!e.getValue().equals(otherE.getValue())) return false; + } + } + return true; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof State)) @@ -174,7 +216,7 @@ public int compareTo(State o) { return 1; if ((this.depth == 0 && o.depth == 0) && (this.rootState != this || o.rootState != o)) return 1; - if (!this.success.equals(o.success)) + if (!gotoEquality(this.success, o.success)) return 1; if (this.failure != null && o.failure != null && !this.failure.equals(o.failure)) return 1; @@ -182,4 +224,4 @@ public int compareTo(State o) { return 1; return 0; } -} +} \ No newline at end of file diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index f6663b5..2e5b7f9 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -151,10 +151,13 @@ private void checkForConstructedFailureStates() { private void constructFailureStates() { Queue queue = new LinkedBlockingDeque(); + int i = 0; // First, set the fail state of all depth 1 states to the root state for (State depthOneState : this.rootState.getStates()) { - depthOneState.setFailure(this.rootState); - queue.add(depthOneState); + if (depthOneState != null) { + depthOneState.setFailure(this.rootState); + queue.add(depthOneState); + } } this.failureStatesConstructed = true; diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java index 2a64370..20d622a 100644 --- a/src/test/java/org/ahocorasick/trie/StateTest.java +++ b/src/test/java/org/ahocorasick/trie/StateTest.java @@ -1,8 +1,10 @@ package org.ahocorasick.trie; -import org.ahocorasick.trie.State; +import org.apache.commons.lang3.SerializationUtils; import org.junit.Test; +import java.io.Serializable; + import static junit.framework.Assert.assertEquals; public class StateTest { @@ -22,4 +24,19 @@ public void constructSequenceOfCharacters() { assertEquals(3, currentState.getDepth()); } + @Test + public void testSerialization() { + State rootState = new State(); + rootState + .addState('a') + .addState('b') + .addState('c'); + State currentState = rootState.nextState('a'); + currentState = currentState.nextState('b'); + currentState = currentState.nextState('c'); + + Serializable copy = SerializationUtils.clone(rootState); + assertEquals(copy, rootState); + } + } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index a3d5a4b..6abf8d2 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -3,11 +3,11 @@ import org.apache.commons.lang3.SerializationUtils; import org.junit.Test; +import java.io.Serializable; import java.util.Collection; import java.util.Iterator; import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.assertSame; public class TrieTest { @@ -153,6 +153,22 @@ public void tokenizeFullSentence() { assertEquals(" in reserve", tokensIt.next().getFragment()); } + @Test + public void serializeTrie() { + Trie trie = new Trie(); + trie.addKeyword("he"); + trie.addKeyword("hehehehe"); + trie.addKeyword("Alpha"); + trie.addKeyword("Beta"); + trie.addKeyword("Gamma"); + + Collection emits = trie.parseText("hehehehehe"); + Iterator iterator = emits.iterator(); + + Serializable copy = SerializationUtils.clone(trie); + assert(trie.equals(copy)); + } + @Test public void bug5InGithubReportedByXCurry() { Trie trie = new Trie().caseInsensitive().onlyWholeWords(); From 557474b8b0bcc39b9373f31a6220f054d532ba00 Mon Sep 17 00:00:00 2001 From: jerryye Date: Tue, 28 Apr 2015 19:12:30 -0700 Subject: [PATCH 4/5] enable toggling of onlyWholeWords from call to parseText --- src/main/java/org/ahocorasick/trie/Trie.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 2e5b7f9..0d2eb90 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -89,6 +89,10 @@ private Token createMatch(Emit emit, String text) { @SuppressWarnings("unchecked") public Collection parseText(String text) { + return parseText(text, false); + } + + public Collection parseText(String text, Boolean onlyWholeWords) { checkForConstructedFailureStates(); int position = 0; @@ -103,7 +107,7 @@ public Collection parseText(String text) { position++; } - if (trieConfig.isOnlyWholeWords()) { + if (trieConfig.isOnlyWholeWords() || onlyWholeWords) { removePartialMatches(text, collectedEmits); } From 986b905449b58737bd76586085f14c89c3c56e85 Mon Sep 17 00:00:00 2001 From: jerryye Date: Sat, 20 Jun 2015 02:17:02 -0700 Subject: [PATCH 5/5] fix npe issues --- src/main/java/org/ahocorasick/trie/Trie.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 0d2eb90..eee1515 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -5,10 +5,7 @@ import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Queue; +import java.util.*; import java.util.concurrent.LinkedBlockingDeque; /** @@ -138,10 +135,13 @@ private void removePartialMatches(String searchText, List collectedEmits) } private State getState(State currentState, Character character) { - State newCurrentState = currentState.nextState(character); - while (newCurrentState == null) { - currentState = currentState.failure(); + State newCurrentState = null; + if (currentState != null) { newCurrentState = currentState.nextState(character); + while (newCurrentState == null && currentState.failure() != null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(character); + } } return newCurrentState; } @@ -185,7 +185,7 @@ private void constructFailureStates() { } private void storeEmits(int position, State currentState, List collectedEmits) { - Collection emits = currentState.emit(); + Collection emits = (currentState == null) ? Collections. emptyList() : currentState.emit(); if (emits != null && !emits.isEmpty()) { for (String emit : emits) { collectedEmits.add(new Emit(position-emit.length()+1, position, emit));