Source code for forte.processors.data_augment.algorithms.back_translation_op

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Class for back translation op. The input is translated
to another language, then translated back to the original language.
"""
import random
from typing import Any, Dict, Tuple

from forte.data.ontology import Annotation
from forte.processors.data_augment.algorithms.single_annotation_op import (
    SingleAnnotationAugmentOp,
)
from forte.common.configuration import Config
from forte.utils.utils import create_class_with_kwargs

__all__ = [
    "BackTranslationOp",
]


class BackTranslationOp(SingleAnnotationAugmentOp):
    r"""
    This class is a replacement op using back translation to generate data
    with the same semantic meanings. The input is translated to another
    language, then translated back to the original language, with pretrained
    machine-translation models.

    It will sample from a Bernoulli distribution to decide whether to replace
    the input, with `prob` as the probability of replacement.
    """

    def __init__(self, configs: Config):
        r"""
        Validate the configuration and build the two translation models.

        Args:
            configs: The op configuration. See :meth:`default_configs` for
                the keys that are read.

        Raises:
            ValueError: If any of the configuration values is invalid.
        """
        super().__init__(configs)
        self._validate_configs(configs)

        # Forward model: source language -> target language.
        self.model_to = create_class_with_kwargs(
            configs["model_to"],
            class_args={
                "src_lang": configs["src_language"],
                "tgt_lang": configs["tgt_language"],
                "device": configs["device"],
            },
        )
        # Backward model: the language pair is swapped to translate back.
        self.model_back = create_class_with_kwargs(
            configs["model_back"],
            class_args={
                "src_lang": configs["tgt_language"],
                "tgt_lang": configs["src_language"],
                "device": configs["device"],
            },
        )

    def _validate_configs(self, configs):
        r"""
        Check that every configuration value used by this op is usable.

        Args:
            configs: The op configuration to validate.

        Raises:
            ValueError: If `prob` is missing or outside of [0, 1], if any of
                `src_language`, `tgt_language`, `model_to` or `model_back`
                is empty, or if `device` is neither "cpu" nor "cuda".
        """
        prob = configs["prob"]
        # Compare against None explicitly: a plain truthiness test would
        # reject 0, which is a valid probability ("never replace").
        if prob is None or not 0 <= prob <= 1:
            raise ValueError("The prob should be a float between 0 and 1!")

        # Each of these entries must be a non-empty value; the checks run
        # in this order so the first offending key decides the message.
        required_entries = (
            ("src_language", "Please provide a valid source language!"),
            ("tgt_language", "Please provide a valid target language!"),
            ("model_to", "Please provide a valid to-model!"),
            ("model_back", "Please provide a valid back-model!"),
        )
        for key, message in required_entries:
            if not configs[key]:
                raise ValueError(message)

        if configs["device"] not in ("cpu", "cuda"):
            raise ValueError("The device must be 'cpu' or 'cuda'!")

    def single_annotation_augment(
        self, input_anno: Annotation
    ) -> Tuple[bool, str]:
        r"""
        This function replaces a piece of text with back translation.

        Args:
            input_anno: An annotation, could be a word, sentence or document.

        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement happens, and the second element is the
            replaced string. When no replacement happens, the second element
            is the original text of `input_anno`.
        """
        # random.random() is drawn from [0.0, 1.0). Using `>=` makes the
        # boundaries exact: prob == 0 never replaces, prob == 1 always does.
        if random.random() >= self.configs["prob"]:
            return False, input_anno.text

        intermediate_text: str = self.model_to.translate(input_anno.text)
        return True, self.model_back.translate(intermediate_text)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """
        Returns:
            A dictionary with the default config for this processor.
            Following are the keys for this dictionary:

            - `augment_entry` (str): This indicates the entity that needs
              to be augmented. By default, this value is set to
              `ft.onto.base_ontology.Sentence`.
            - `prob` (float): The probability of replacement, should fall
              in [0, 1]. The Default value is 0.5
            - `src_language` (str): The source language of back
              translation. The Default value is "en".
            - `tgt_language` (str): The target language of back
              translation. The Default value is "fr".
            - `model_to` (str): The full qualified name of the model from
              source language to target language.
            - `model_back` (str): The full qualified name of the model
              from target language to source language.
            - `device` (str): "cpu" for the CPU or "cuda" for GPU. The
              Default value is cpu.
        """
        model_class_name = (
            "forte.processors.data_augment.algorithms."
            "machine_translator.MarianMachineTranslator"
        )
        return {
            "augment_entry": "ft.onto.base_ontology.Sentence",
            "prob": 0.5,
            "model_to": model_class_name,
            "model_back": model_class_name,
            "src_language": "en",
            "tgt_language": "fr",
            "device": "cpu",
        }