# Source code for forte.processors.data_augment.algorithms.dictionary

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
from forte.utils import create_import_error_msg

__all__ = ["Dictionary", "WordnetDictionary"]


class Dictionary:
    r"""
    This class defines a dictionary for word replacement.
    Given an input word and its pos_tag (optional), the dictionary
    outputs its synonyms, antonyms, hypernyms and hyponyms.

    This base implementation knows no words: every lookup returns an
    empty list. Subclasses override the ``get_*`` methods to provide
    real lookups.
    """

    # The arguments are unused here on purpose; they only fix the
    # signature that overriding subclasses implement.
    # pylint: disable=unused-argument
    def get_synonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the synonyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Synonyms of the word. Always a new empty list in this
            base class.
        """
        return []

    def get_antonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the antonyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Antonyms of the word. Always a new empty list in this
            base class.
        """
        return []

    def get_hypernyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the hypernyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Hypernyms of the word. Always a new empty list in this
            base class.
        """
        return []

    def get_hyponyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        Look up the hyponyms of a word.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Hyponyms of the word. Always a new empty list in this
            base class.
        """
        return []


class WordnetDictionary(Dictionary):
    r"""
    This class wraps the nltk WORDNET to look up the
    synonyms/antonyms/hypernyms/hyponyms of an input word.
    Part-of-Speech (optional) can be provided to the wordnet
    for retrieving words with the same POS.
    """

    # Relation names accepted by :meth:`get_lemmas` as ``lemma_type``.
    _LEMMA_TYPES = ("SYNONYM", "ANTONYM", "HYPERNYM", "HYPONYM")

    def __init__(self):
        # nltk is an optional dependency, so it is imported lazily and a
        # missing install is reported with the project's install hint.
        try:
            import nltk  # pylint: disable=import-outside-toplevel
            from nltk.corpus import (  # pylint:disable=import-outside-toplevel
                wordnet,
            )
        except ImportError as err:
            raise ImportError(
                create_import_error_msg(
                    "nltk", "data_aug", "dictionary based data augmentation"
                )
            ) from err

        try:
            # Probe the corpus with a throwaway query; a LookupError is
            # treated as "the wordnet data is not downloaded yet".
            wordnet.synsets("computer")
        except LookupError:
            nltk.download("wordnet")
            nltk.download("omw-1.4")
        self.model = wordnet

    def _get_wordnet_pos(self, treebank_tag: str) -> str:
        """
        Map a treebank-style POS tag to the WORDNET POS constant used
        for lemmatization (a, n, r, v).

        Args:
            treebank_tag: The POS tag; only its leading letter is used.
        Returns:
            The matching POS constant of the wrapped wordnet model.
            Tags with an unrecognized prefix fall back to NOUN.
        """
        if treebank_tag.startswith("J"):
            return self.model.ADJ
        if treebank_tag.startswith("V"):
            return self.model.VERB
        if treebank_tag.startswith("R"):
            return self.model.ADV
        # Tags starting with "N" and every unrecognized tag map to NOUN,
        # as the default pos in lemmatization is Noun.
        return self.model.NOUN

    def get_lemmas(
        self,
        word: str,
        pos_tag: str = "",
        lang: str = "eng",
        lemma_type: str = "SYNONYM",
    ) -> List[str]:
        r"""
        This function gets synonyms/antonyms/hypernyms/hyponyms
        from a WORDNET dictionary.

        Args:
            word: The input token.
            pos_tag: The NLTK POS tag. When empty, the lookup is not
                restricted to a Part-of-Speech.
            lang: The input language.
            lemma_type: The type of words to retrieve, must be
                one of the following:

                - ``'SYNONYM'``
                - ``'ANTONYM'``
                - ``'HYPERNYM'``
                - ``'HYPONYM'``
        Returns:
            The names of the retrieved lemmas, in lookup order and
            without de-duplication, with ``"_"`` replaced by a space.
        Raises:
            KeyError: If ``lemma_type`` is not one of the supported types.
        """
        # Validate before querying, so an unsupported type is always
        # reported, even when the word has no synsets at all.
        if lemma_type not in self._LEMMA_TYPES:
            raise KeyError(
                f"The type {lemma_type} does not belong to "
                '["SYNONYM", "ANTONYM", '
                '"HYPERNYM", "HYPONYM"]'
            )

        # The POS property is used for retrieving lemmas with the same POS.
        pos_wordnet = self._get_wordnet_pos(pos_tag) if pos_tag else None

        res: List[str] = []
        for synset in self.model.synsets(word, pos=pos_wordnet, lang=lang):
            for lemma in synset.lemmas(lang=lang):
                # NOTE(review): the relations below are looked up on the
                # lemma, not on the synset - confirm against the NLTK docs
                # that hypernym/hyponym links are populated at this level.
                if lemma_type == "SYNONYM":
                    res.append(lemma.name())
                elif lemma_type == "ANTONYM":
                    res.extend(rel.name() for rel in lemma.antonyms())
                elif lemma_type == "HYPERNYM":
                    res.extend(rel.name() for rel in lemma.hypernyms())
                else:  # "HYPONYM", guaranteed by the validation above.
                    res.extend(rel.name() for rel in lemma.hyponyms())
        # The phrases are concatenated with "_" in wordnet.
        return [name.replace("_", " ") for name in res]

    def get_synonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the synonyms of a word from a WORDNET
        dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Synonyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="SYNONYM")

    def get_antonyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the antonyms of a word from a WORDNET
        dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Antonyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="ANTONYM")

    def get_hypernyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the hypernyms of a word from a WORDNET
        dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Hypernyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="HYPERNYM")

    def get_hyponyms(
        self, word: str, pos_tag: str = "", lang: str = "eng"
    ) -> List[str]:
        r"""
        This function returns the hyponyms of a word from a WORDNET
        dictionary.

        Args:
            word: The input string.
            pos_tag: The Part-of-Speech tag for substitution.
            lang: The language of the input string.
        Returns:
            Hyponyms of the word.
        """
        return self.get_lemmas(word, pos_tag, lang, lemma_type="HYPONYM")