Skip to content

Commit

Permalink
synset pos parameter (#15)
Browse files Browse the repository at this point in the history
* modified:   spacy_wordnet/wordnet_domains.py
	- imported Synset class and added return type hint to __find_synsets(...) method
	- pep8 character limit adjustment

* modified:   spacy_wordnet/wordnet_domains.py
	- added optional pos param to __find_synsets(...) method

* modified:   spacy_wordnet/wordnet_domains.py
	- argument handling for pos param

* modified:   spacy_wordnet/wordnet_domains.py
	- swapping all(map(...)) for set(...).difference(...) which gives a slight boost in speed and readability

* modified:   spacy_wordnet/wordnet_domains.py
	- added try/except to attempt to convert pos arg to list

* modified:   spacy_wordnet/wordnet_domains.py
	- filtering acceptable_pos using pos values and assigning to token_pos which will be used to determine which tokens to get synsets for

* modified:   spacy_wordnet/wordnet_domains.py
	- moved call from self.__synsets declaration into .synsets(...) method allowing user to supply pos args

* modified:   spacy_wordnet/wordnet_domains.py
	- return type hint and docstring for synsets(...) method

* modified:   tests/test_wordnet_annotator.py
	- added three assertions for pos param in test_english_annotations() method

* modified:   spacy_wordnet/wordnet_domains.py
	- fixed error type hint in synsets(...) method

* modified:   spacy_wordnet/wordnet_domains.py
	- fixed type error in __find_lemmas() method by swapping self.__synsets attribute with self.synsets(...) method
	- pep8 character limit fix in __find_lemmas() method

* modified:   spacy_wordnet/wordnet_domains.py
	- defined token_synsets as a separate list and filtered returned synsets in wn.synsets and extending token_synsets in __find_synsets(...) method

* modified:   tests/test_wordnet_annotator.py
	changed expected_adj_synsets to set() instead of {} (a dict) in test_english_annotations() method

* Update spacy_wordnet/wordnet_domains.py

param type hint spacing/formatting in synsets(...) method

Co-authored-by: Francisco Aranda <[email protected]>

* Update spacy_wordnet/wordnet_domains.py

param type hint spacing/formatting in __find_synsets(...) method

Co-authored-by: Francisco Aranda <[email protected]>

* use `token.pos` if the `pos` argument is `None` to mimic previous behavior.

Co-authored-by: Francisco Aranda <[email protected]>

* Update wordnet_domains.py

modified docstring to reflect what happens if the `pos` argument is `None`

* modified:   tests/test_wordnet_annotator.py
	- added assert to test that list of pos args will return expected results

* modified:   tests/test_wordnet_annotator.py
	- added test for when the `pos` argument is `None`

* Update spacy_wordnet/wordnet_domains.py

Checking if `token.pos` is an acceptable value before appending its lemma to the `word_variants` list. This avoids unexpected results such as when `token.pos` is an `ADVERB`.

Co-authored-by: Francisco Aranda <[email protected]>

* Update wordnet_domains.py

Updated docstring so user knows results are limited to NOUN, VERB, and ADJ even if `pos` is None.

Co-authored-by: Ian Thompson <[email protected]>
Co-authored-by: Francisco Aranda <[email protected]>
  • Loading branch information
3 people authored Sep 19, 2022
1 parent 4bc9fe0 commit b9efd80
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 14 deletions.
62 changes: 48 additions & 14 deletions spacy_wordnet/wordnet_domains.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Union
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
from spacy.tokens.token import Token

from spacy_wordnet.__utils__ import *
Expand Down Expand Up @@ -43,16 +45,28 @@ class Wordnet(object):
def __init__(self, token: Token, lang: str = "es"):
self.__token = token
self.__lang = fetch_wordnet_lang(lang)
self.__synsets = self.__find_synsets(token, self.__lang)
self.__synsets = self.__find_synsets
self.__lemmas = self.__find_lemmas()
self.__wordnet_domains = self.__find_wordnet_domains()

def synsets(self, pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
    """
    Load all synsets with a given part of speech tag.

    If no pos is specified and `token.pos` is a VERB, NOUN,
    or ADJ, synsets with the same parts of speech as
    `token.pos` will be loaded. If `token.pos` is not a
    VERB, NOUN, or ADJ and no pos is specified, an empty
    list will be returned.

    :param pos: filter returned synsets by part(s) of speech.
        Acceptable values are "verb", "noun", and "adj".
    :return: list of synsets
    """
    # self.__synsets is bound to the __find_synsets staticmethod (not a
    # precomputed list), so synsets are looked up lazily here using the
    # token and language captured at construction time.
    return self.__synsets(self.__token, self.__lang, pos=pos)

def lang(self):
    """Return the WordNet language code resolved from the constructor's
    ``lang`` argument via ``fetch_wordnet_lang``."""
    return self.__lang

def synsets(self):
return self.__synsets

def lemmas(self):
    """Return the lemmas for this token's synsets, computed once in
    ``__init__`` by ``__find_lemmas``."""
    return self.__lemmas

Expand All @@ -68,16 +82,40 @@ def wordnet_synsets_for_domain(self, domains: List[str]):
]

@staticmethod
def __find_synsets(token: Token, lang: str):
def __find_synsets(token: Token,
lang: str,
pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
if pos is None:
pos = []
elif isinstance(pos, str):
pos = [pos]
elif not isinstance(pos, list):
try:
pos = list(pos)
except TypeError:
raise TypeError("pos argument must be None, type str, or type list.")

acceptable_pos = {"verb": VERB, "noun": NOUN, "adj": ADJ} # We can define this as a private class constant
# check if any element in `pos` is not in `acceptable_pos`
if set(pos).difference(acceptable_pos):
raise ValueError("pos argument must be a combination of 'verb', "
"'noun', or 'adj'.")

token_pos: List[int] = [acceptable_pos[k] for k in pos]
if not token_pos:
token_pos = [token.pos]
word_variants = [token.text]
if token.pos in [VERB, NOUN, ADJ]:
if token.pos in (token_pos if pos else acceptable_pos.values()):
# extend synset coverage using lemmas
word_variants.append(token.lemma_)

for word in word_variants:
token_synsets = wn.synsets(
word, pos=spacy2wordnet_pos(token.pos), lang=lang
)
token_synsets: List[Synset] = []
for p in token_pos:
token_synsets.extend(wn.synsets(
word, pos=spacy2wordnet_pos(p), lang=lang
))

if token_synsets:
return token_synsets

Expand All @@ -95,8 +133,4 @@ def __find_wordnet_domains(self):
]

def __find_lemmas(self):
return [
lemma
for synset in self.synsets()
for lemma in synset.lemmas(lang=self.__lang)
]
return [lemma for synset in self.synsets() for lemma in synset.lemmas(lang=self.__lang)]
46 changes: 46 additions & 0 deletions tests/test_wordnet_annotator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest

from nltk.corpus import wordnet as wn
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
Expand Down Expand Up @@ -28,6 +29,51 @@ def test_english_annotations(self):
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

actual_none_synsets = set(token._.wordnet.synsets(pos=None))
expected_none_synsets = {wn.synset("contract.n.01"),
wn.synset("contract.n.02"),
wn.synset("contract.n.03")}
assert actual_none_synsets == expected_none_synsets

actual_verb_synsets = set(token._.wordnet.synsets(pos="verb"))
expected_verb_synsets = {wn.synset('abridge.v.01'),
wn.synset('compress.v.02'),
wn.synset('condense.v.07'),
wn.synset('contract.v.01'),
wn.synset('contract.v.04'),
wn.synset('contract.v.06'),
wn.synset('narrow.v.01'),
wn.synset('shrink.v.04'),
wn.synset('sign.v.04')}
assert actual_verb_synsets == expected_verb_synsets

actual_noun_synsets = set(token._.wordnet.synsets(pos="noun"))
expected_noun_synsets = {wn.synset('contract.n.01'),
wn.synset('contract.n.02'),
wn.synset('contract.n.03')}
assert actual_noun_synsets == expected_noun_synsets

actual_adj_synsets = set(token._.wordnet.synsets(pos="adj"))
expected_adj_synsets = set()
assert actual_adj_synsets == expected_adj_synsets

actual_verb_noun_synsets = set(token._.wordnet.synsets(
pos=["verb", "noun"])
)
expected_verb_noun_synsets = {wn.synset('abridge.v.01'),
wn.synset('compress.v.02'),
wn.synset('condense.v.07'),
wn.synset('contract.v.01'),
wn.synset('contract.v.04'),
wn.synset('contract.v.06'),
wn.synset('narrow.v.01'),
wn.synset('shrink.v.04'),
wn.synset('sign.v.04'),
wn.synset('contract.n.01'),
wn.synset('contract.n.02'),
wn.synset('contract.n.03')}
assert actual_verb_noun_synsets == expected_verb_noun_synsets

def test_generate_variants_from_domain_list(self):

economy_domains = ["finance", "banking"]
Expand Down

0 comments on commit b9efd80

Please sign in to comment.