Source code for forte.data.base_extractor

# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implements BaseExtractor, which is the abstract class other
extractors will inherit from.
"""
import logging
from abc import ABC, abstractmethod
from typing import Tuple, List, Dict, Any
from typing import Union, Hashable, Iterable, Optional

from forte.common import InvalidForteStateException
from forte.common.configuration import Config
from forte.data.converter.feature import Feature
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.data.vocabulary import Vocabulary

logger = logging.getLogger(__name__)

__all__ = ["BaseExtractor"]


class BaseExtractor(ABC):
    r"""The functionality of an extractor is as follows. Most of the time,
    a user will not need to call this class explicitly; it will be called
    by the framework.

    1. Build the vocabulary.
    2. Extract features from a datapack.
    3. Perform pre-evaluation actions on a datapack.
    4. Add predictions to a datapack.

    Explanation:

    - Vocabulary: The vocabulary is maintained as an attribute of the
      extractor. It stores the mapping from each element to an index, which
      is an integer, and to a representation, which can be an index integer
      or a one-hot vector depending on the configuration of the vocabulary.
      Check :class:`~forte.data.vocabulary.Vocabulary` for more details.

    - Feature: A feature wraps the data we want from one instance in a
      datapack. For example, the instance can be one sentence in a datapack,
      and the data wrapped by the feature could be the token text of this
      sentence. The data is already converted to a list of indices using the
      vocabulary. Besides the data, other information such as the raw data
      before indexing and some meta data is also stored in the feature.
      Check :class:`~forte.data.converter.Feature` for more details.

    - Remove feature / Add prediction: Removing a feature means removing the
      existing data in the datapack. If we remove the feature in the pack,
      then extracting the feature will return an empty list. Adding a
      prediction means adding the prediction from the model back to the
      datapack. If a datapack contains some old data (for example, the gold
      data in the test set), we can first remove that data and then add our
      model prediction to the pack.

    An illustrative sketch of a concrete extractor, and of the vocabulary
    behavior it relies on, can be found at the end of this module.

    Attributes:
        config: An instance of `Dict` or
            :class:`~forte.common.configuration.Config` that provides
            configurable options. See
            :meth:`~forte.data.base_extractor.BaseExtractor.default_configs`
            for available options and default values.
    """

    _VOCAB_ERROR_MSG = (
        "When vocab_method is raw, vocabulary "
        "will not be built. Functions operating "
        "on vocabulary should not be called."
    )

    def __init__(self):
        self._vocab: Optional[Vocabulary] = None
        self.config: Optional[Config] = None
        self._vocab_method: Optional[str] = None

    def initialize(self, config: Union[Dict, Config]):
        self.config = Config(config, self.default_configs())

        if self.config.vocab_method != "custom":
            self._vocab = Vocabulary(
                method=self.config.vocab_method,
                use_pad=self.config.need_pad,
                use_unk=self.config.vocab_use_unk,
                pad_value=self.config.pad_value,
                unk_value=self.config.unk_value,
            )
        else:
            self._vocab = None
        self._vocab_method = self.config.vocab_method
    @classmethod
    def default_configs(cls):
        r"""Returns a dictionary of default hyper-parameters.

        Here:

        - vocab_method (str): The type of vocabulary used by this extractor.
          `custom`, `indexing`, and `one-hot` are supported; the default is
          `indexing`. Check the behavior of the vocabulary under different
          settings in :class:`~forte.data.vocabulary.Vocabulary`.

        - context_type (str): The fully qualified name of the context used
          to group the extracted features; for example, it could be
          `ft.onto.base_ontology.Sentence`. If this is `None`, features from
          the whole data pack will be grouped together. Default is None.
          This value may be mandatory for some processors, which will be
          documented and specified by the specific processor implementation.

        - vocab_use_unk (bool): Whether the `<UNK>` element should be added
          to the vocabulary. Default is True.

        - need_pad (bool): Whether the `<PAD>` element should be added to
          the vocabulary, and whether the feature needs to be batched and
          padded. Default is True.

        - pad_value (int): A customized value/representation to be used for
          padding. This value is only needed when `need_pad` is True.
          Default is None, where the padding value is determined by the
          system.

        - unk_value (int): A customized value/representation to be used for
          unknown values (`<UNK>`). This value is only needed when
          `vocab_use_unk` is True. Default is None, where the value of
          `<UNK>` is determined by the system.
        """
        return {
            "vocab_method": "indexing",
            "context_type": None,
            "vocab_use_unk": True,
            "need_pad": True,
            "pad_value": None,
            "unk_value": None,
        }
    @property
    def vocab_method(self) -> str:
        if self._vocab_method is None:
            raise InvalidForteStateException(
                "The vocab_method for the extractor is not set."
            )
        return self._vocab_method

    @property
    def vocab(self) -> Optional[Vocabulary]:
        """
        Getter of the vocabulary.

        Returns:
            The vocabulary. None if the vocabulary is not set.
        """
        return self._vocab

    @vocab.setter
    def vocab(self, vocab: Vocabulary):
        """
        Setter of the vocabulary, used when the user builds the vocabulary
        externally.

        Args:
            vocab: The vocabulary to be assigned.
        """
        self._vocab = vocab

    def get_pad_value(self) -> Union[None, int, List[int]]:
        if self.vocab is not None:
            return self.vocab.get_pad_value()
        else:
            return None

    def vocab_items(self) -> Iterable[Tuple[Hashable, int]]:
        if self.vocab is None:
            raise AttributeError(self._VOCAB_ERROR_MSG)
        return self.vocab.vocab_items()

    def add(self, element: Hashable):
        if self.vocab is None:
            raise AttributeError(self._VOCAB_ERROR_MSG)
        return self.vocab.add_element(element)

    def has_element(self, element: Hashable) -> bool:
        if self.vocab is None:
            raise AttributeError(self._VOCAB_ERROR_MSG)
        return self.vocab.has_element(element)

    def element2repr(self, element: Hashable) -> Union[int, List[int]]:
        if self.vocab is None:
            raise AttributeError(self._VOCAB_ERROR_MSG)
        return self.vocab.element2repr(element)

    def id2element(self, idx: int) -> Any:
        if self.vocab is None:
            raise AttributeError(self._VOCAB_ERROR_MSG)
        return self.vocab.id2element(idx)
    def predefined_vocab(self, predefined: Iterable):
        r"""Populate the vocabulary with predefined values. You can also
        override this method to customize how the vocabulary is handled.

        Override instructions:

        1. Take elements out of `predefined`.
        2. Modify the elements as needed by the extractor.
        3. Use the :meth:`add` function to add the elements into the
           vocabulary.

        Args:
            predefined (Iterable): A collection that contains the elements
                to be added into the vocabulary.
        """
        for element in predefined:
            self.add(element)
    def update_vocab(
        self, pack: DataPack, context: Optional[Annotation] = None
    ):
        r"""Populate the vocabulary needed by the extractor. This can be
        implemented by a specific extractor. The populated vocabulary can be
        used to map features/items to numeric representations. If you use a
        pre-specified vocabulary, you may not need to use this function.

        Override instructions:

        1. Get all entries of the type of interest, such as all the `Token`s
           in the data pack.
        2. Use :meth:`~forte.data.vocabulary.Vocabulary.add` to add those
           elements into `self._vocab`.

        Args:
            pack: The input data pack.
            context: The context is an Annotation entry where features will
                be extracted within its range. If None, the whole data pack
                will be used as the context. Default is None.
        """
        pass
    @abstractmethod
    def extract(
        self, pack: DataPack, context: Optional[Annotation] = None
    ) -> Feature:
        """This method should be implemented to extract features from a
        datapack. A sketch of a possible implementation is shown at the end
        of this module.

        Args:
            pack: The input data pack that contains the features.
            context: The context is an Annotation entry where features will
                be extracted within its range. If None, the whole data pack
                will be used as the context. Default is None.

        Returns:
            The features of this instance, stored as a
            :class:`~forte.data.converter.Feature` instance.
        """
        raise NotImplementedError
    def pre_evaluation_action(
        self, pack: DataPack, context: Optional[Annotation]
    ):
        r"""This function is performed on the pack before the evaluation
        stage, allowing one to perform some actions beforehand. For example,
        you can remove entries or remove some attributes of an entry. By
        default, this function does nothing.

        Args:
            pack: The datapack that contains the current instance.
            context: The context is an Annotation entry where data will be
                extracted within its range. If None, the whole data pack
                will be used as the context. Default is None.
        """
        pass
    def add_to_pack(
        self,
        pack: DataPack,
        predictions: Any,
        context: Optional[Annotation] = None,
    ):
        r"""Add the prediction of a model (normally in the form of a tensor)
        back to the pack. This function should have knowledge of the
        structure of `predictions` to correctly populate the data pack
        values. This function can roughly be considered the reverse
        operation of
        :meth:`~forte.data.base_extractor.BaseExtractor.extract`.

        Override instructions:

        1. Get all entries from one instance in the pack.
        2. Convert the predictions into elements that need to be assigned to
           the entries. You can use
           :meth:`~forte.data.vocabulary.Vocabulary.id2element` to convert
           integers in the prediction into elements via the vocabulary
           maintained by the extractor.
        3. Add the elements to the corresponding entries as needed.

        Args:
            pack: The datapack to add predictions back to.
            predictions: The output of the model, the format of which is
                determined by the predict function defined in the Predictor.
            context: The context is an Annotation entry to which the
                predictions will be added. This has the same meaning as
                `context` in
                :meth:`~forte.data.base_extractor.BaseExtractor.extract`.
                If None, the whole data pack will be used as the context.
                Default is None.
        """
        pass
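

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the Forte source): a minimal concrete
# extractor built on BaseExtractor. The subclass name `TokenTextExtractor` is
# hypothetical. The sketch assumes `DataPack.get` and `Token.text` behave as
# in the standard Forte ontology, and that `Feature` accepts the
# `data`/`metadata`/`vocab` arguments used by Forte's built-in extractors.
# ---------------------------------------------------------------------------
from ft.onto.base_ontology import Token


class TokenTextExtractor(BaseExtractor):
    """Hypothetical extractor that maps token text to vocabulary ids."""

    def update_vocab(
        self, pack: DataPack, context: Optional[Annotation] = None
    ):
        # Override instructions, steps 1-2: collect the entries of interest
        # and add their text to the vocabulary via `self.add`.
        for token in pack.get(Token, context):
            self.add(token.text)

    def extract(
        self, pack: DataPack, context: Optional[Annotation] = None
    ) -> Feature:
        # Convert each token within the context into its vocabulary
        # representation (an integer index under the default `indexing`
        # vocab_method).
        data = [
            self.element2repr(token.text)
            for token in pack.get(Token, context)
        ]
        # Assumed Feature metadata keys, mirroring Forte's built-in
        # extractors; check forte.data.converter.Feature for the exact
        # contract.
        return Feature(
            data=data,
            metadata={
                "need_pad": self.config.need_pad,
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int,
            },
            vocab=self.vocab,
        )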
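

# A companion sketch of the vocabulary behavior that `initialize` configures.
# The extractor's `add`, `element2repr`, `id2element` and `get_pad_value`
# delegate to the `Vocabulary` instance built in `initialize`; the standalone
# example below constructs that vocabulary with the same arguments that
# `initialize` passes and exercises the delegated lookups directly.
if __name__ == "__main__":
    vocab = Vocabulary(
        method="indexing",  # same as the "vocab_method" default config
        use_pad=True,       # "need_pad": reserve a <PAD> entry
        use_unk=True,       # "vocab_use_unk": reserve an <UNK> entry
        pad_value=None,     # let the system pick the padding value
        unk_value=None,     # let the system pick the <UNK> value
    )

    # Populate the vocabulary, as `predefined_vocab` or `update_vocab` would.
    for element in ["hello", "world"]:
        vocab.add_element(element)

    print(vocab.has_element("hello"))   # True
    print(vocab.element2repr("hello"))  # with `indexing`, the integer index
    print(vocab.id2element(vocab.element2repr("hello")))  # back to "hello"
    print(vocab.get_pad_value())        # representation used for <PAD>
    print(list(vocab.vocab_items()))    # all (element, index) pairs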