Source code for forte.processors.data_augment.algorithms.eda_ops

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Data augmentation processors from the paper "EDA: Easy Data Augmentation
Techniques for Boosting Performance on Text Classification Tasks", including
Random Swap, Random Insertion and Random Deletion. All three processors are
implemented based on the ReplacementDataAugmentProcessor.
"""

from math import ceil
import random
from typing import List, Dict, Iterable, Union, Any
from forte.common.configuration import Config
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.processors.data_augment.algorithms.base_data_augmentation_op import (
    BaseDataAugmentationOp,
)
from forte.utils.utils import get_class, create_class_with_kwargs

__all__ = [
    "RandomSwapDataAugmentOp",
    "RandomInsertionDataAugmentOp",
    "RandomDeletionDataAugmentOp",
]

# English stopwords, matching NLTK's 179-entry English stopword corpus.
english_stopwords = [
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "you're",
    "you've",
    "you'll",
    "you'd",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "she's",
    "her",
    "hers",
    "herself",
    "it",
    "it's",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "that'll",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "don't",
    "should",
    "should've",
    "now",
    "d",
    "ll",
    "m",
    "o",
    "re",
    "ve",
    "y",
    "ain",
    "aren",
    "aren't",
    "couldn",
    "couldn't",
    "didn",
    "didn't",
    "doesn",
    "doesn't",
    "hadn",
    "hadn't",
    "hasn",
    "hasn't",
    "haven",
    "haven't",
    "isn",
    "isn't",
    "ma",
    "mightn",
    "mightn't",
    "mustn",
    "mustn't",
    "needn",
    "needn't",
    "shan",
    "shan't",
    "shouldn",
    "shouldn't",
    "wasn",
    "wasn't",
    "weren",
    "weren't",
    "won",
    "won't",
    "wouldn",
    "wouldn't",
]


class RandomSwapDataAugmentOp(BaseDataAugmentationOp):
    r"""
    Data augmentation operation for the Random Swap operation.
    Randomly choose two words in the sentence and swap their positions.
    Do this n times, where n = alpha * input length.
    """

    def augment(self, data_pack: DataPack) -> bool:
        augment_entry = get_class(self.configs["augment_entry"])
        if not issubclass(augment_entry, Annotation):
            raise ValueError(
                f"This augmenter only accepts data of "
                f"'forte.data.ontology.top.Annotation' type, "
                f"but {self.configs['augment_entry']} is not."
            )
        annotations: List[Annotation] = list(
            data_pack.get(self.configs["augment_entry"])
        )
        if len(annotations) > 0:
            # Track the cumulative effect of repeated swaps in a map from
            # original index to the index whose text it should finally take,
            # so that the annotation list itself is never mutated mid-loop.
            replace_map: Dict = {}
            for _ in range(ceil(self.configs["alpha"] * len(annotations))):
                swap_idx = random.sample(range(len(annotations)), 2)
                new_idx_0 = (
                    swap_idx[1]
                    if swap_idx[1] not in replace_map
                    else replace_map[swap_idx[1]]
                )
                new_idx_1 = (
                    swap_idx[0]
                    if swap_idx[0] not in replace_map
                    else replace_map[swap_idx[0]]
                )
                replace_map[swap_idx[0]] = new_idx_0
                replace_map[swap_idx[1]] = new_idx_1
            for idx, replace_target in replace_map.items():
                try:
                    self.replace_annotations(
                        annotations[idx], annotations[replace_target].text
                    )
                except ValueError:
                    return False
        return True

    @classmethod
    def default_configs(cls):
        """
        Additional keys for Random Swap:

        - augment_entry (str): Defines the entry the processor will
          augment. It should be a fully qualified name of the entry
          class. For example, "ft.onto.base_ontology.Sentence".
        - alpha (float): 0 <= alpha <= 1. Indicates the percent of the
          words in a sentence that are changed. The processor will
          perform the Random Swap operation (input length * alpha)
          times. Default value is 0.1.

        Returns:
            A dictionary with the default config for this processor.
        """
        return {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "alpha": 0.1,
        }
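

# Standalone sketch (not part of the original module) of the swap-tracking
# idea used in RandomSwapDataAugmentOp.augment above: repeated swaps are
# recorded in a map from original index to the index whose text it should
# finally take, so the source sequence is never mutated mid-loop. The helper
# below operates on a plain list of strings for illustration and assumes at
# least two words; with the default alpha of 0.1, a 25-token input performs
# ceil(0.1 * 25) == 3 such swaps.
def _example_swap_words(words: List[str], n: int) -> List[str]:
    replace_map: Dict[int, int] = {}
    for _ in range(n):
        i, j = random.sample(range(len(words)), 2)
        # Follow any earlier swaps so that swapping (i, j) composes
        # correctly with the swaps already recorded.
        new_i = replace_map.get(j, j)
        new_j = replace_map.get(i, i)
        replace_map[i] = new_i
        replace_map[j] = new_j
    return [words[replace_map.get(k, k)] for k in range(len(words))]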


class RandomInsertionDataAugmentOp(BaseDataAugmentationOp):
    r"""
    Data augmentation operation for the Random Insertion operation.
    Find a random synonym of a random word in the sentence that is
    not a stop word. Insert that synonym into a random position in
    the sentence. Do this n times, where n = alpha * input length.
    """

    def __init__(self, configs: Union[Config, Dict[str, Any]]) -> None:
        super().__init__(configs)
        self.stopwords = set(self.configs["stopwords"])

    def augment(self, data_pack: DataPack) -> bool:
        # Build the replacement op (used to generate synonyms) dynamically
        # from its dotted class path and configuration.
        replacement_op = create_class_with_kwargs(
            self.configs["insertion_op_configs"]["type"],
            class_args={
                "configs": self.configs["insertion_op_configs"]["kwargs"]
            },
        )
        annotations: List[Annotation] = []
        pos = [0]
        annos: Iterable[Annotation] = data_pack.get(
            self.configs["augment_entry"]
        )
        # Collect non-stopword annotations as insertion sources, and record
        # the candidate insertion positions: the start of the pack plus the
        # end offset of each such annotation.
        for anno in annos:
            if anno.text not in self.stopwords:
                annotations.append(anno)
                pos.append(anno.end)
        if len(annotations) > 0:
            for _ in range(ceil(self.configs["alpha"] * len(annotations))):
                src_anno = random.choice(annotations)
                try:
                    _, replaced_text = replacement_op.single_annotation_augment(
                        src_anno
                    )
                except ValueError:
                    return False
                insert_pos = random.choice(pos)
                # Pad with a space so the inserted word does not fuse with
                # its neighbor.
                if insert_pos > 0:
                    replaced_text = " " + replaced_text
                else:
                    replaced_text = replaced_text + " "
                try:
                    self.insert_annotated_span(
                        replaced_text,
                        data_pack,
                        insert_pos,
                        self.configs["augment_entry"],
                    )
                except ValueError:
                    return False
        return True

    @classmethod
    def default_configs(cls):
        """
        Additional keys for Random Insertion:

        - augment_entry (str): Defines the entry the processor will
          augment. It should be a fully qualified name of the entry
          class. For example, "ft.onto.base_ontology.Sentence".
        - alpha (float): 0 <= alpha <= 1. Indicates the percent of the
          words in a sentence that are changed. The processor will
          perform the Random Insertion operation (input length * alpha)
          times. Default value is 0.1.
        - stopwords: A list of stopwords for the language.
        - `insertion_op_configs`: A dictionary configuring the operation
          that takes random annotations from the source data pack,
          augments them based on the specified rules, and inserts them
          at random positions.

          - type: The type of data augmentation operation to be used
            (pass the path of the class which defines the required
            operation).
          - kwargs: This dictionary contains the data that is to be fed
            to the required operation (make sure to be well versed with
            the required configurations of the operation that is defined
            in the type config).

          .. code-block:: python

              {
                  "type": "forte.processors.data_augment.algorithms."
                  "dictionary_replacement_op.DictionaryReplacementOp",
                  "kwargs": {
                      "dictionary_class": (
                          "forte.processors.data_augment."
                          "algorithms.dictionary.WordnetDictionary"
                      ),
                      "prob": 1.0,
                      "lang": "eng",
                  },
              }

        Returns:
            A dictionary with the default config for this processor.
            By default, we use Dictionary Replacement with WordNet to
            get synonyms to insert.
        """
        return {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "insertion_op_configs": {
                "type": "forte.processors.data_augment.algorithms."
                "dictionary_replacement_op.DictionaryReplacementOp",
                "kwargs": {
                    "dictionary_class": (
                        "forte.processors.data_augment."
                        "algorithms.dictionary.WordnetDictionary"
                    ),
                    "prob": 1.0,
                    "lang": "eng",
                },
            },
            "alpha": 0.1,
            "stopwords": english_stopwords,
        }
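

# For readers unfamiliar with create_class_with_kwargs: a rough standalone
# sketch of the dotted-path instantiation pattern used by
# RandomInsertionDataAugmentOp.augment above. This illustrates the idea
# only and is not Forte's actual implementation; the helper name is
# hypothetical.
def _example_instantiate_from_path(class_path: str, **class_args: Any):
    import importlib

    # Split "pkg.module.ClassName" into the module path and the class name,
    # import the module, look up the class, and construct it.
    module_name, _, class_name = class_path.rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**class_args)


# Example: _example_instantiate_from_path("collections.Counter")
# returns an empty collections.Counter instance.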


class RandomDeletionDataAugmentOp(BaseDataAugmentationOp):
    r"""
    Data augmentation operation for the Random Deletion operation.
    Randomly remove each word in the sentence with probability alpha.
    """

    def augment(self, data_pack: DataPack) -> bool:
        anno: Annotation
        for anno in data_pack.get(self.configs["augment_entry"]):
            if random.random() < self.configs["alpha"]:
                try:
                    self.delete_annotation(anno)
                except ValueError:
                    return False
        return True

    @classmethod
    def default_configs(cls):
        """
        Additional keys for Random Deletion:

        - augment_entry (str): Defines the entry the processor will
          augment. It should be a fully qualified name of the entry
          class. For example, "ft.onto.base_ontology.Sentence".
        - alpha (float): 0 <= alpha <= 1. The probability to delete each
          word. Default value is 0.1.

        Returns:
            A dictionary with the default config for this processor.
        """
        return {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "alpha": 0.1,
        }
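

# Standalone sketch (illustrative only, not part of the original module) of
# the deletion rule implemented by RandomDeletionDataAugmentOp.augment above:
# each word is independently dropped with probability alpha, so roughly
# alpha * len(words) tokens are removed in expectation.
def _example_delete_words(words: List[str], alpha: float) -> List[str]:
    return [word for word in words if random.random() >= alpha]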