# Source code for forte.processors.data_augment.algorithms.dictionary

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
from forte.utils import create_import_error_msg

__all__ = ["Dictionary", "WordnetDictionary"]


class Dictionary:
    r"""
    Base interface of a dictionary used for word replacement.

    Given an input word and, optionally, its POS tag, a dictionary returns
    the word's synonyms, antonyms, hypernyms or hyponyms. Every lookup in
    this base class returns an empty list; subclasses override the lookups
    with a real data source.
    """

    # pylint: disable=unused-argument

    def get_synonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the synonyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Synonyms of the word; always a new empty list in this base
            class.
        """
        synonyms: List[str] = []
        return synonyms

    def get_antonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the antonyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Antonyms of the word; always a new empty list in this base
            class.
        """
        antonyms: List[str] = []
        return antonyms

    def get_hypernyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the hypernyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Hypernyms of the word; always a new empty list in this base
            class.
        """
        hypernyms: List[str] = []
        return hypernyms

    def get_hyponyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the hyponyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Hyponyms of the word; always a new empty list in this base
            class.
        """
        hyponyms: List[str] = []
        return hyponyms
class WordnetDictionary(Dictionary):
    r"""
    This class wraps the NLTK WordNet corpus to look up the
    synonyms/antonyms/hypernyms/hyponyms of an input word.
    A Part-of-Speech tag (optional) can be provided so that only WordNet
    entries with the same POS are retrieved.
    """

    # The values accepted by the ``lemma_type`` argument of `get_lemmas`.
    _LEMMA_TYPES = ("SYNONYM", "ANTONYM", "HYPERNYM", "HYPONYM")

    def __init__(self):
        # nltk is an optional dependency, so it is imported lazily and a
        # missing installation is reported with the project's install hint.
        try:
            import nltk  # pylint: disable=import-outside-toplevel
            from nltk.corpus import (  # pylint:disable=import-outside-toplevel
                wordnet,
            )
        except ImportError as err:
            raise ImportError(
                create_import_error_msg(
                    "nltk", "data_aug", "dictionary based data augmentation"
                )
            ) from err

        try:
            # Probe the corpus with a sample lookup. A LookupError is
            # treated as the corpus data not being available locally, in
            # which case the required packages are downloaded.
            wordnet.synsets("computer")
        except LookupError:
            nltk.download("wordnet")
            nltk.download("omw-1.4")

        self.model = wordnet

    def _get_wordnet_pos(self, treebank_tag: str) -> str:
        """
        Map a POS tag to the corresponding WORDNET POS constant
        (one of ``ADJ``, ``VERB``, ``NOUN``, ``ADV`` of the wrapped model).

        Args:
            treebank_tag: The POS tag; only its first character is
                inspected (``J``, ``V``, ``N`` or ``R``).

        Returns:
            The WORDNET POS constant. Tags with any other (or no) leading
            character fall back to noun, the default POS in lemmatization.
        """
        pos_by_prefix = {
            "J": self.model.ADJ,
            "V": self.model.VERB,
            "N": self.model.NOUN,
            "R": self.model.ADV,
        }
        return pos_by_prefix.get(treebank_tag[:1], self.model.NOUN)

    def get_lemmas(
        self,
        word: str,
        pos_tag: str = "",
        lang: str = "eng",
        lemma_type: str = "SYNONYM",
    ) -> List[str]:
        r"""
        This function gets synonyms/antonyms/hypernyms/hyponyms
        from a WORDNET dictionary.

        Args:
            word: The input token.
            pos_tag: The NLTK POS tag. When empty, lemmas of every POS
                are retrieved.
            lang: The input language.
            lemma_type: The type of words to retrieve, must be
                one of the following:

                - ``'SYNONYM'``
                - ``'ANTONYM'``
                - ``'HYPERNYM'``
                - ``'HYPONYM'``

        Returns:
            The names of the retrieved lemmas, with the ``"_"`` that joins
            multi-word phrases in wordnet replaced by a space. Duplicates
            are not removed.

        Raises:
            KeyError: If ``lemma_type`` is not one of the supported types.
        """
        # Validate before querying, so that an unsupported type is reported
        # even when the word has no synsets at all.
        if lemma_type not in self._LEMMA_TYPES:
            raise KeyError(
                f"The type {lemma_type} does not belong to "
                '["SYNONYM", "ANTONYM", '
                '"HYPERNYM", "HYPONYM"]'
            )

        # The POS property is used for retrieving lemmas with the same POS.
        pos_wordnet = self._get_wordnet_pos(pos_tag) if pos_tag else None

        res: List[str] = []
        for synset in self.model.synsets(word, pos=pos_wordnet, lang=lang):
            for lemma in synset.lemmas(lang=lang):
                if lemma_type == "SYNONYM":
                    res.append(lemma.name())
                elif lemma_type == "ANTONYM":
                    res.extend(
                        antonym.name() for antonym in lemma.antonyms()
                    )
                elif lemma_type == "HYPERNYM":
                    # NOTE(review): hypernyms/hyponyms are queried on the
                    # lemma rather than on the synset - confirm this is
                    # intended and yields results for the target corpus.
                    res.extend(
                        hypernym.name() for hypernym in lemma.hypernyms()
                    )
                else:  # "HYPONYM", guaranteed by the validation above.
                    res.extend(
                        hyponym.name() for hyponym in lemma.hyponyms()
                    )

        # The phrases are concatenated with "_" in wordnet.
        return [name.replace("_", " ") for name in res]

    def get_synonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the synonyms of a word
        from a WORDNET dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Synonyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="SYNONYM")

    def get_antonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the antonyms of a word
        from a WORDNET dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Antonyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="ANTONYM")

    def get_hypernyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the hypernyms of a word
        from a WORDNET dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Hypernyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="HYPERNYM")

    def get_hyponyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the hyponyms of a word
        from a WORDNET dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.

        Returns:
            Hyponyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="HYPONYM")