From f1a78b2eb0038cb2d1d7b2c6734ddf6022cce5bb Mon Sep 17 00:00:00 2001 From: Shivam Bansal Date: Sat, 23 Jul 2016 01:57:26 +0530 Subject: [PATCH 1/2] added stopwords cleaner with tests --- .gitignore | 5 +- duke-core/.idea/compiler.xml | 22 + .../.idea/copyright/profiles_settings.xml | 3 + duke-core/.idea/misc.xml | 42 ++ duke-core/.idea/modules.xml | 9 + duke-core/.idea/workspace.xml | 562 ++++++++++++++++++ .../duke/cleaners/StopwordsCleaner.java | 72 +++ .../priv/garshol/duke/english-stopwords.txt | 495 +++++++++++++++ .../duke/cleaners/StopwordsCleanerTest.java | 21 + 9 files changed, 1230 insertions(+), 1 deletion(-) create mode 100644 duke-core/.idea/compiler.xml create mode 100644 duke-core/.idea/copyright/profiles_settings.xml create mode 100644 duke-core/.idea/misc.xml create mode 100644 duke-core/.idea/modules.xml create mode 100644 duke-core/.idea/workspace.xml create mode 100644 duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java create mode 100644 duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt create mode 100644 duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java diff --git a/.gitignore b/.gitignore index 3b66d17e..6fbb068e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,7 @@ target/ *.iml *.ipr *.iws -/.idea/ \ No newline at end of file +/.idea/ + +.DS_Store + diff --git a/duke-core/.idea/compiler.xml b/duke-core/.idea/compiler.xml new file mode 100644 index 00000000..96cc43ef --- /dev/null +++ b/duke-core/.idea/compiler.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/copyright/profiles_settings.xml b/duke-core/.idea/copyright/profiles_settings.xml new file mode 100644 index 00000000..e7bedf33 --- /dev/null +++ b/duke-core/.idea/copyright/profiles_settings.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/duke-core/.idea/misc.xml b/duke-core/.idea/misc.xml new file mode 100644 index 00000000..8d8dc8d4 --- /dev/null +++ b/duke-core/.idea/misc.xml @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + 1.8 + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/modules.xml b/duke-core/.idea/modules.xml new file mode 100644 index 00000000..211fc4dc --- /dev/null +++ b/duke-core/.idea/modules.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/workspace.xml b/duke-core/.idea/workspace.xml new file mode 100644 index 00000000..c65a71bb --- /dev/null +++ b/duke-core/.idea/workspace.xml @@ -0,0 +1,562 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1469218236929 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java b/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java new file mode 100644 index 00000000..ac3aaf42 --- /dev/null +++ b/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java @@ -0,0 +1,72 @@ + +package no.priv.garshol.duke.cleaners; + +import no.priv.garshol.duke.Cleaner; + +import java.io.*; +import java.util.List; +import java.util.ArrayList; + +/** + * A cleaner which removes english stopwords from a string. + */ + +public class StopwordsCleaner implements Cleaner { + private LowerCaseNormalizeCleaner sub; + private String[] stopwords; + private ArrayList wordsList = new ArrayList(); + + + public StopwordsCleaner() { + this.sub = new LowerCaseNormalizeCleaner(); + + try { + this.stopwords = loadStopwords(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + public String clean(String value) { + + value = sub.clean(value); + if (value == null || value.equals("")) + return value; + + + String[] words = value.split(" "); + for (String word : words) { + wordsList.add(word); + } + + for (int j = 0; j < stopwords.length; j++) { + if (wordsList.contains(stopwords[j])) { + wordsList.remove(stopwords[j]); + } + } + + return String.join(" ",wordsList); + + } + + private String[] loadStopwords() throws IOException { + String mapfile = "no/priv/garshol/duke/english-stopwords.txt"; + + + BufferedReader in = new BufferedReader(new FileReader(mapfile)); + String str; + + List list = new ArrayList(); + while((str = in.readLine()) != null){ + list.add(str); + } + + String[] stopwords = list.toArray(new String[0]); + + in.close(); + return stopwords; + } + +} + diff --git a/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt b/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt new file mode 100644 index 00000000..24f5f4d6 --- /dev/null +++ b/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt @@ -0,0 +1,495 @@ +a +able +about +above +abst +accordance +according +accordingly +across +act +actually +added +adj +affected +affecting +affects +after +afterwards +again +against +ah +all +almost +alone +along +already +also +although +always +am +among +amongst +an +and +announce +another +any +anybody +anyhow +anymore +anyone +anything +anyway +anyways +anywhere +apparently +approximately +are +aren +arent +arise +around +as +aside +ask +asking +at +auth +available +away +awfully +b +back +be +became +because +become +becomes +becoming +been +before +beforehand +begin +beginning +beginnings +begins +behind +being +believe +below +beside +besides +between +beyond +biol +both +brief +briefly +but +by +c +ca +came +can +cannot +can't +cause +causes +certain +certainly +co +com +come +comes +contain +containing +contains +could +couldnt +d +date +did +didn't +different +do +does +doesn't +doing +done +don't +down +downwards +due +during +e +each +ed +edu +effect +eg +eight +eighty +either +else +elsewhere +end +ending +enough +especially +et +et-al +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +except +f +far +few +ff +fifth +first +five +fix +followed +following +follows +for +former +formerly +forth +found +four +from +further +furthermore +g +gave +get +gets +getting +give +given +gives +giving +go +goes +gone +got +gotten +h +had +happens +hardly +has +hasn't +have +haven't +having +he +hed +hence +her +here +hereafter +hereby +herein +heres +hereupon +hers +herself +hes +hi +hid +him +himself +his +hither +home +how +howbeit +however +hundred +i +id +ie +if +i'll +im +immediate +immediately +importance +important +in +inc +indeed +index +information +instead +into +invention +inward +is +isn't +it +itd +it'll +its +itself +i've +j +just +k +keep keeps +kept +kg +km +know +known +knows +l +largely +last +lately +later +latter +latterly +least +less +lest +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +m +made +mainly +make +makes +many +may +maybe +me +mean +means +meantime +meanwhile +merely +mg +might +million +miss +ml +more +moreover +most +mostly +mr +mrs +much +mug +must +my +myself +n +na +name +namely +nay +nd +near +nearly +necessarily +necessary +need +needs +neither +never +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +nor +normally +nos +not +noted +nothing +now +nowhere +o +obtain +obtained +obviously +of +off +often +oh +ok +okay +old +omitted +on +once +one +ones +only +onto +or +ord +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +owing +own +p +page +pages +part +particular +particularly +past +per +perhaps +placed +please +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +probably +promptly +proud +provides +put +q +que +quickly +quite +qv +r +ran +rather +rd +re +readily +really +recent +recently +ref +refs +regarding +regardless +regards +related +relatively +research +respectively +resulted +resulting +results +right +run +s +said +same +saw +say +saying +says +sec +section +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sent +seven +several +shall +she +shed +she'll +shes +should +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +since +six +slightly +so +some +somebody +somehow +someone +somethan +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specifically +specified +specify +specifying +still +stop +strongly +sub +substantially +successfully +such +sufficiently +suggest +sup +sure \ No newline at end of file diff --git a/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java new file mode 100644 index 00000000..2b1d3206 --- /dev/null +++ b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java @@ -0,0 +1,21 @@ + +package no.priv.garshol.duke.cleaners; + +import org.junit.Before; +import org.junit.Test; + +import static junit.framework.Assert.assertEquals; + +public class StopwordsCleanerTest extends LowerCaseNormalizeCleanerTest { + + public void setUp() { + cleaner = new StopwordsCleaner(); + } + + public void testMapping() { + assertEquals("Hello my name is duke", cleaner.clean("hello name duke")); + } + + + +} \ No newline at end of file From 50ca14ecb05d3b323c71e89cff7968a64f6b99de Mon Sep 17 00:00:00 2001 From: Shivam Bansal Date: Mon, 25 Jul 2016 18:23:20 +0530 Subject: [PATCH 2/2] added optimizations --- .../duke/cleaners/StopwordsCleaner.java | 21 ++++++------------- .../priv/garshol/duke/english-stopwords.txt | 3 ++- .../duke/cleaners/StopwordsCleanerTest.java | 11 ++++++++++ 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java b/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java index ac3aaf42..68b3bf9b 100644 --- a/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java +++ b/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StopwordsCleaner.java @@ -13,7 +13,7 @@ public class StopwordsCleaner implements Cleaner { private LowerCaseNormalizeCleaner sub; - private String[] stopwords; + HashSet stopwords = new HashSet(); private ArrayList wordsList = new ArrayList(); @@ -22,7 +22,7 @@ public StopwordsCleaner() { try { this.stopwords = loadStopwords(); - } catch (IOException e) { + } catch (DukeException e) { throw new RuntimeException(e); } } @@ -35,35 +35,26 @@ public String clean(String value) { return value; - String[] words = value.split(" "); for (String word : words) { + if (!stopwords.contains(word)) wordsList.add(word); } - for (int j = 0; j < stopwords.length; j++) { - if (wordsList.contains(stopwords[j])) { - wordsList.remove(stopwords[j]); - } - } - return String.join(" ",wordsList); } - private String[] loadStopwords() throws IOException { + private HashSet loadStopwords() throws IOException { String mapfile = "no/priv/garshol/duke/english-stopwords.txt"; - BufferedReader in = new BufferedReader(new FileReader(mapfile)); String str; - List list = new ArrayList(); + HashSet stopwords = new HashSet(); while((str = in.readLine()) != null){ - list.add(str); + stopwords.add(str); } - String[] stopwords = list.toArray(new String[0]); - in.close(); return stopwords; } diff --git a/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt b/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt index 24f5f4d6..31e2514c 100644 --- a/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt +++ b/duke-core/src/main/resources/no/priv/garshol/duke/english-stopwords.txt @@ -246,7 +246,8 @@ i've j just k -keep keeps +keep +keeps kept kg km diff --git a/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java index 2b1d3206..f1ce7f8b 100644 --- a/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java +++ b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java @@ -5,6 +5,7 @@ import org.junit.Test; import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; public class StopwordsCleanerTest extends LowerCaseNormalizeCleanerTest { @@ -12,10 +13,20 @@ public void setUp() { cleaner = new StopwordsCleaner(); } + @Test public void testMapping() { assertEquals("Hello my name is duke", cleaner.clean("hello name duke")); } + @Test + public void testEmpty() { + assertTrue(cleaner.clean("") == ""); + } + + @Test + public void testNull() { + assertTrue(cleaner.clean(null) == null); + } } \ No newline at end of file