-
Notifications
You must be signed in to change notification settings - Fork 60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add abbreviation replacement data augmentation op and test #732
base: master
Are you sure you want to change the base?
Changes from 3 commits
5902d7b
aa98d53
a3df17d
f46e40d
f779549
247ef59
e7f8d4c
cc471ab
d295df0
0e00ac5
0fa98b2
e657d49
8ae0112
512b1a0
a62c864
82da6ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# Copyright 2022 The Forte Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import random | ||
import json | ||
from typing import Tuple, Union, Dict, Any | ||
|
||
import requests | ||
from forte.data.ontology import Annotation | ||
from forte.processors.data_augment.algorithms.single_annotation_op import ( | ||
SingleAnnotationAugmentOp, | ||
) | ||
from forte.common.configuration import Config | ||
|
||
__all__ = [ | ||
"AbbreviationReplacementOp", | ||
] | ||
|
||
|
||
class AbbreviationReplacementOp(SingleAnnotationAugmentOp):
    r"""
    A data augmentation op that swaps a word or phrase for its abbreviation.

    The op loads a JSON abbreviation dictionary once, at construction time,
    from the location given by ``dict_path``. The location is first requested
    as a URL; if that request fails, the same string is opened as a local
    file instead. During augmentation an input annotation is replaced only
    when both of the following hold:

    - a random draw in [0, 1) does not exceed ``prob``;
    - the *entire* text of the annotation is a key of the dictionary
      (exact match; no partial or sub-string matching is performed).

    Args:
        configs:
            - prob (float): The probability of replacement,
              should fall in [0, 1].
            - dict_path (str): the `url` or the path to the pre-defined
              abbreviation json file. The key is a word / phrase we want
              to replace. The value is an abbreviated word of the
              corresponding key.

            See :meth:`default_configs` for the default values.
    """

    # Seconds to wait for the dictionary download. Without a timeout,
    # ``requests.get`` may block indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT: float = 30.0

    def __init__(self, configs: Union[Config, Dict[str, Any]]):
        r"""
        Build the op and load the abbreviation dictionary.

        Args:
            configs: The op configuration, see the class docstring.

        Raises:
            OSError: If ``dict_path`` cannot be fetched as a URL and cannot
                be opened as a local file either.
            json.JSONDecodeError: If the loaded content is not valid JSON.
        """
        super().__init__(configs)
        # `default_configs` already supplies `dict_path`, so the resolved
        # configuration is read directly instead of repeating the default
        # URL in a fallback branch here.
        self.dict_path = self.configs.dict_path

        try:
            response = requests.get(
                self.dict_path, timeout=self._REQUEST_TIMEOUT
            )
            # Reject HTTP error responses so that an error page is never
            # parsed as if it were the abbreviation dictionary.
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Not a reachable URL (e.g. a plain file path): read it from
            # the local file system instead.
            with open(self.dict_path, encoding="utf8") as json_file:
                self.data = json.load(json_file)
        else:
            self.data = json.loads(response.text)

    def single_annotation_augment(
        self, input_anno: Annotation
    ) -> Tuple[bool, str]:
        r"""
        Replace the text of one annotation with its abbreviation.

        The replacement is skipped when the random draw exceeds
        ``configs.prob`` or when the annotation text is not a key of the
        loaded abbreviation dictionary. The lookup uses the full annotation
        text, so an annotation that merely contains a known phrase is left
        unchanged.

        Args:
            input_anno (Annotation): The input annotation.

        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement happens, and the second element is the
            replaced string (the original text when no replacement happens).
        """
        original_text = input_anno.text

        # Randomly skip the replacement according to the configured
        # probability.
        if random.random() > self.configs.prob:
            return False, original_text

        # Texts without a known abbreviation are passed through unchanged
        # rather than raising a KeyError.
        if original_text not in self.data:
            return False, original_text

        result: str = self.data[original_text]
        return True, result

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        r"""
        Returns:
            A dictionary with the default config for this processor.
            Following are the keys for this dictionary:
                - prob (float): The probability of replacement,
                  should fall in [0, 1]. Default value is 0.5.
                - dict_path (str): the `url` or the path to the pre-defined
                  abbreviation json file. The key is a word / phrase we want
                  to replace. The value is an abbreviated word of the
                  corresponding key. Default value is the URL of the
                  ``phrase_abbrev_dict.json`` file in the
                  GEM-benchmark/NL-Augmenter GitHub repository.
        """
        return {
            "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/"
            "NL-Augmenter/main/transformations/"
            "abbreviation_transformation/phrase_abbrev_dict.json",
            "prob": 0.5,
        }
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Copyright 2022 The Forte Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
""" | ||
Unit tests for abbreviation replacement op. | ||
""" | ||
|
||
import unittest | ||
from forte.data.data_pack import DataPack | ||
from ft.onto.base_ontology import Token | ||
from forte.processors.data_augment.algorithms.abbreviation_replacement_op import ( | ||
AbbreviationReplacementOp, | ||
) | ||
|
||
|
||
class TestAbbreviationReplacementOp(unittest.TestCase):
    """Checks that AbbreviationReplacementOp rewrites a known phrase."""

    def setUp(self):
        # Only `prob` is given; no `dict_path` is passed to the op here.
        op_configs = {
            "prob": 1.0,
        }
        self.abre = AbbreviationReplacementOp(configs=op_configs)

    def test_replace(self):
        """The phrase is replaced by one of its accepted abbreviations."""
        phrase = "see you later"
        pack = DataPack()
        pack.set_text(phrase)

        # NOTE(review): the whole phrase is wrapped in a single Token. The
        # PR discussion questions this choice and points out that
        # ft.onto.base_ontology also provides Document for whole-text
        # spans; consider whether another annotation type fits better.
        phrase_entry = Token(pack, 0, len(phrase))
        pack.add_entry(phrase_entry)

        augmented_pack = self.abre.perform_augmentation(pack)

        augmented_entries = list(
            augmented_pack.get("ft.onto.base_ontology.Token")
        )
        first_entry = augmented_entries[0]

        accepted_abbreviations = ["syl8r", "cul83r", "cul8r"]
        self.assertIn(first_entry.text, accepted_abbreviations)
|
||
|
||
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2022*