Skip to content

Commit

Permalink
synset pos parameter (#15)
Browse files Browse the repository at this point in the history
* modified:   spacy_wordnet/wordnet_domains.py
	- imported Synset class and added return type hint to __find_synsets(...) method
	- pep8 character limit adjustment

* modified:   spacy_wordnet/wordnet_domains.py
	- added optional pos param to __find_synsets(...) method

* modified:   spacy_wordnet/wordnet_domains.py
	- argument handling for pos param

* modified:   spacy_wordnet/wordnet_domains.py
	- swapping all(map(...)) for set(...).difference(...) which gives a slight boost in speed and readability

* modified:   spacy_wordnet/wordnet_domains.py
	- added try/except to attempt to convert pos arg to list

* modified:   spacy_wordnet/wordnet_domains.py
	- filtering acceptable_pos using pos values and assigning to token_pos which will be used to determine which tokens to get synsets for

* modified:   spacy_wordnet/wordnet_domains.py
	- moved call from self.__synsets declaration into .synsets(...) method allowing user to supply pos args

* modified:   spacy_wordnet/wordnet_domains.py
	- return type hint and docstring for synsets(...) method

* modified:   tests/test_wordnet_annotator.py
	- added three assertions for pos param in test_english_annotations() method

* modified:   spacy_wordnet/wordnet_domains.py
	- fixed error type hint in synsets(...) method

* modified:   spacy_wordnet/wordnet_domains.py
	- fixed type error in __find_lemmas() method by swapping self.__synsets attribute with self.synsets(...) method
	- pep8 character limit fix in __find_lemmas() method

* modified:   spacy_wordnet/wordnet_domains.py
	- defined token_synsets as a separate list and filtered returned synsets in wn.synsets and extending token_synsets in __find_synsets(...) method

* modified:   tests/test_wordnet_annotator.py
	changed expected_adj_synsets to set() instead of {} (a dict) in test_english_annotations() method

* Update spacy_wordnet/wordnet_domains.py

param type hint spacing/formatting in synsets(...) method

Co-authored-by: Francisco Aranda <[email protected]>

* Update spacy_wordnet/wordnet_domains.py

param type hint spacing/formatting in __find_synsets(...) method

Co-authored-by: Francisco Aranda <[email protected]>

* use `token.pos` if the `pos` argument is `None` to mimic previous behavior.

Co-authored-by: Francisco Aranda <[email protected]>

* Update wordnet_domains.py

modified docstring to reflect what happens if the `pos` argument is `None`

* modified:   tests/test_wordnet_annotator.py
	- added assert to test that list of pos args will return expected results

* modified:   tests/test_wordnet_annotator.py
	- added test for when the `pos` argument is `None`

* Update spacy_wordnet/wordnet_domains.py

Checking if `token.pos` is an acceptable value before appending its lemma to the `word_variants` list. This avoids unexpected results such as when `token.pos` is an `ADVERB`.

Co-authored-by: Francisco Aranda <[email protected]>

* Update wordnet_domains.py

Updated docstring so user knows results are limited to NOUN, VERB, and ADJ even if `pos` is None.

Co-authored-by: Ian Thompson <[email protected]>
Co-authored-by: Francisco Aranda <[email protected]>
  • Loading branch information
3 people authored Sep 19, 2022
1 parent 4bc9fe0 commit b9efd80
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 14 deletions.
62 changes: 48 additions & 14 deletions spacy_wordnet/wordnet_domains.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Union
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
from spacy.tokens.token import Token

from spacy_wordnet.__utils__ import *
Expand Down Expand Up @@ -43,16 +45,28 @@ class Wordnet(object):
def __init__(self, token: Token, lang: str = "es"):
self.__token = token
self.__lang = fetch_wordnet_lang(lang)
self.__synsets = self.__find_synsets(token, self.__lang)
self.__synsets = self.__find_synsets
self.__lemmas = self.__find_lemmas()
self.__wordnet_domains = self.__find_wordnet_domains()

def synsets(self, pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
    """
    Load all synsets with a given part of speech tag.

    If no pos is specified and `token.pos` is a VERB, NOUN,
    or ADJ, synsets with the same parts of speech as
    `token.pos` will be loaded. If `token.pos` is not a
    VERB, NOUN, or ADJ and no pos is specified, an empty
    list will be returned.

    :param pos: filter returned synsets by part(s) of speech.
        Acceptable values are "verb", "noun", and "adj".
    :return: list of synsets
    """
    # self.__synsets is bound to the __find_synsets staticmethod (not a
    # precomputed list), so synsets are looked up lazily here using the
    # token and language captured at construction time.
    return self.__synsets(self.__token, self.__lang, pos=pos)

def lang(self):
    """Return the WordNet language code resolved from the constructor's
    ``lang`` argument via ``fetch_wordnet_lang``."""
    return self.__lang

def synsets(self):
return self.__synsets

def lemmas(self):
    """Return the lemmas for this token's synsets, computed once in
    ``__init__`` by ``__find_lemmas``."""
    return self.__lemmas

Expand All @@ -68,16 +82,40 @@ def wordnet_synsets_for_domain(self, domains: List[str]):
]

@staticmethod
def __find_synsets(token: Token, lang: str):
def __find_synsets(token: Token,
lang: str,
pos: Optional[Union[str, List[str]]] = None) -> List[Synset]:
if pos is None:
pos = []
elif isinstance(pos, str):
pos = [pos]
elif not isinstance(pos, list):
try:
pos = list(pos)
except TypeError:
raise TypeError("pos argument must be None, type str, or type list.")

acceptable_pos = {"verb": VERB, "noun": NOUN, "adj": ADJ} # We can define this as a private class constant
# check if any element in `pos` is not in `acceptable_pos`
if set(pos).difference(acceptable_pos):
raise ValueError("pos argument must be a combination of 'verb', "
"'noun', or 'adj'.")

token_pos: List[int] = [acceptable_pos[k] for k in pos]
if not token_pos:
token_pos = [token.pos]
word_variants = [token.text]
if token.pos in [VERB, NOUN, ADJ]:
if token.pos in (token_pos if pos else acceptable_pos.values()):
# extend synset coverage using lemmas
word_variants.append(token.lemma_)

for word in word_variants:
token_synsets = wn.synsets(
word, pos=spacy2wordnet_pos(token.pos), lang=lang
)
token_synsets: List[Synset] = []
for p in token_pos:
token_synsets.extend(wn.synsets(
word, pos=spacy2wordnet_pos(p), lang=lang
))

if token_synsets:
return token_synsets

Expand All @@ -95,8 +133,4 @@ def __find_wordnet_domains(self):
]

def __find_lemmas(self):
return [
lemma
for synset in self.synsets()
for lemma in synset.lemmas(lang=self.__lang)
]
return [lemma for synset in self.synsets() for lemma in synset.lemmas(lang=self.__lang)]
46 changes: 46 additions & 0 deletions tests/test_wordnet_annotator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest

from nltk.corpus import wordnet as wn
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
Expand Down Expand Up @@ -28,6 +29,51 @@ def test_english_annotations(self):
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

actual_none_synsets = set(token._.wordnet.synsets(pos=None))
expected_none_synsets = {wn.synset("contract.n.01"),
wn.synset("contract.n.02"),
wn.synset("contract.n.03")}
assert actual_none_synsets == expected_none_synsets

actual_verb_synsets = set(token._.wordnet.synsets(pos="verb"))
expected_verb_synsets = {wn.synset('abridge.v.01'),
wn.synset('compress.v.02'),
wn.synset('condense.v.07'),
wn.synset('contract.v.01'),
wn.synset('contract.v.04'),
wn.synset('contract.v.06'),
wn.synset('narrow.v.01'),
wn.synset('shrink.v.04'),
wn.synset('sign.v.04')}
assert actual_verb_synsets == expected_verb_synsets

actual_noun_synsets = set(token._.wordnet.synsets(pos="noun"))
expected_noun_synsets = {wn.synset('contract.n.01'),
wn.synset('contract.n.02'),
wn.synset('contract.n.03')}
assert actual_noun_synsets == expected_noun_synsets

actual_adj_synsets = set(token._.wordnet.synsets(pos="adj"))
expected_adj_synsets = set()
assert actual_adj_synsets == expected_adj_synsets

actual_verb_noun_synsets = set(token._.wordnet.synsets(
pos=["verb", "noun"])
)
expected_verb_noun_synsets = {wn.synset('abridge.v.01'),
wn.synset('compress.v.02'),
wn.synset('condense.v.07'),
wn.synset('contract.v.01'),
wn.synset('contract.v.04'),
wn.synset('contract.v.06'),
wn.synset('narrow.v.01'),
wn.synset('shrink.v.04'),
wn.synset('sign.v.04'),
wn.synset('contract.n.01'),
wn.synset('contract.n.02'),
wn.synset('contract.n.03')}
assert actual_verb_noun_synsets == expected_verb_noun_synsets

def test_generate_variants_from_domain_list(self):

economy_domains = ["finance", "banking"]
Expand Down

0 comments on commit b9efd80

Please sign in to comment.