# Source code for forte.processors.data_augment.algorithms.typo_replacement_op

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import random
import json
from typing import Tuple, Union, Dict, Any


from forte.data.ontology import Annotation
from forte.processors.data_augment.algorithms.single_annotation_op import (
    SingleAnnotationAugmentOp,
)
from forte.common.configuration import Config
from forte.utils import create_import_error_msg

__all__ = [
    "UniformTypoGenerator",
    "TypoReplacementOp",
]


class UniformTypoGenerator:
    r"""
    A uniform generator that generates a typo from a typo dictionary.

    The dictionary is loaded once at construction time and kept in
    ``self.data``; :meth:`generate` then picks one of the listed typos
    uniformly at random.

    Args:
        dict_path: the url or the path to the pre-defined typo json file.
            The key is a word we want to replace. The value is a list
            containing various typos of the corresponding key.

            .. code-block:: python

                {
                    "apparent": ["aparent", "apparant"],
                    "bankruptcy": ["bankrupcy", "banruptcy"],
                    "barbecue": ["barbeque"]
                }

    Raises:
        ImportError: if the optional ``requests`` package is not installed.
        OSError: if the download fails and ``dict_path`` cannot be opened
            as a local file either.
    """

    def __init__(self, dict_path: str):
        try:
            import requests  # pylint: disable=import-outside-toplevel
        except ImportError as e:
            raise ImportError(
                create_import_error_msg(
                    "requests", "data_aug", "data augment support"
                )
            ) from e

        try:
            r = requests.get(dict_path, timeout=30)
            # Without this check the body of a 4xx/5xx response would be
            # parsed and silently used as the typo dictionary.
            r.raise_for_status()
            self.data = r.json()
        except requests.exceptions.RequestException:
            # Any request failure (including ``dict_path`` not being a
            # valid URL at all) falls back to reading a local JSON file.
            with open(dict_path, encoding="utf8") as json_file:
                self.data = json.load(json_file)

    def generate(self, word: str) -> str:
        r"""
        Return a randomly chosen typo for ``word``.

        Args:
            word: the word to be replaced.

        Returns:
            One of the typos listed for ``word``, or ``word`` itself when
            the dictionary has no entry (or an empty entry) for it.
        """
        candidates = self.data.get(word)
        # An empty typo list would make ``random.choice`` raise IndexError;
        # treat it the same as a word that is not in the dictionary.
        if not candidates:
            return word
        result: str = random.choice(candidates)
        return result


class TypoReplacementOp(SingleAnnotationAugmentOp):
    r"""
    This class is a replacement op using a pre-defined
    spelling mistake dictionary to simulate spelling mistake.

    Args:
        configs: The op configuration; see :meth:`default_configs` for
            the supported keys.

    Raises:
        ValueError: if ``typo_generator`` is not a supported option.
    """

    def __init__(self, configs: Union[Config, Dict[str, Any]]):
        super().__init__(configs)
        # Read from ``self.configs`` (set up by the parent constructor and
        # already relied on for ``prob`` below) rather than the raw
        # ``configs`` argument, so that keys the caller omitted do not
        # raise ``KeyError``.
        # NOTE(review): assumes the parent merges ``default_configs`` into
        # ``self.configs`` - confirm against the base class.
        self.dict_path = self.configs.dict_path
        if self.configs.typo_generator == "uniform":
            self.typo_generator = UniformTypoGenerator(self.dict_path)
        else:
            raise ValueError(
                "The valid options for typo_generator are [uniform]"
            )

    def single_annotation_augment(
        self, input_anno: Annotation
    ) -> Tuple[bool, str]:
        r"""
        This function replaces a word from a typo dictionary.

        Args:
            input_anno: The input annotation.
        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement happens, and the second element is the
            replaced string. Note that the flag is ``True`` whenever the
            random draw selects the annotation, even if the generator
            returns the text unchanged.
        """
        # ``random.random()`` lies in [0, 1); using ``>=`` guarantees that
        # ``prob == 0`` never replaces while ``prob == 1`` always does.
        if random.random() >= self.configs.prob:
            return False, input_anno.text
        word: str = self.typo_generator.generate(input_anno.text)
        return True, word

    @classmethod
    def default_configs(cls):
        r"""
        Returns:
            A dictionary with the default config for this processor.
        Following are the keys for this dictionary:
            - prob (float): The probability of replacement,
              should fall in [0, 1]. Default value is 0.1
            - dict_path (str): the `url` or the path to the pre-defined
              typo json file. The key is a word we want to replace.
              The value is a list containing various typos
              of the corresponding key.
            - typo_generator (str): A generator that takes in a word and
              outputs the replacement typo. The only supported value is
              ``"uniform"``, which is also the default.
        """
        return {
            "prob": 0.1,
            "dict_path": "https://raw.githubusercontent.com/wanglec/"
            + "temporaryJson/main/misspelling.json",
            "typo_generator": "uniform",
        }