Source code for forte.data.converter.converter

#  Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import List, Tuple, Any, Optional, Union, Dict, Sequence

import numpy as np

from forte.utils import create_import_error_msg
from forte.common.configuration import Config
from forte.common import ValidationError
from forte.data.converter.feature import Feature
from forte.data.types import MatrixLike

logger = logging.getLogger(__name__)

__all__ = ["Converter"]


class Converter:
    """
    This class converts a batch of
    :class:`~forte.data.converter.Feature` into a `MatrixLike` type, which
    can be a NumPy array, a PyTorch `Tensor`, or a nested list. It can also
    pad the given batch of :class:`~forte.data.converter.Feature` if
    requested. Please refer to the `request` parameter in
    :class:`~forte.train_preprocessor.TrainPreprocessor` for details.

    Args:
        config: An instance of `Dict` or
            :class:`~forte.common.configuration.Config` that provides all
            configurable options. See :meth:`default_configs` for available
            options and default values.
    """

    def __init__(self, config: Optional[Union[Dict, Config]] = None):
        self._config = Config(
            config,
            default_hparams=self.default_configs(),
            allow_new_hparam=True,
        )
    @staticmethod
    def default_configs():
        """
        Returns a dictionary of default hyper-parameters.

        .. code-block:: python

            {
                "to_numpy": True,
                "to_torch": True
            }

        Here:

        - `"to_numpy"`: bool
          Whether to convert to `numpy.ndarray`. Default is True.

        - `"to_torch"`: bool
          Whether to convert to `torch.Tensor`. Default is True.

        .. note::
            If `need_pad` in :class:`forte.data.converter.Feature` is False
            and `to_numpy` or `to_torch` is True, an exception will be
            raised if the target data cannot be converted to
            `numpy.ndarray` or `torch.Tensor`.

        .. note::
            If `need_pad` in :class:`forte.data.converter.Feature` is True
            and `to_torch` is True, `to_torch` overwrites the effect of
            `to_numpy`.
        """
        return {"to_numpy": True, "to_torch": True}
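    # Illustrative sketch (not part of the API surface): the constructor
    # merges user-supplied options over these defaults, so, for example,
    #
    #     converter = Converter({"to_torch": False})
    #
    # leaves to_numpy at its default (True) and disables the torch path.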
    @property
    def to_numpy(self) -> bool:
        return self._config.to_numpy

    @property
    def to_torch(self) -> bool:
        return self._config.to_torch

    @property
    def state(self) -> Dict:
        return {"to_numpy": self.to_numpy, "to_torch": self.to_torch}

    def load_state(self, state: Dict):
        self._config.to_numpy = state["to_numpy"]
        self._config.to_torch = state["to_torch"]
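    # Illustrative round-trip sketch: `state` and `load_state` capture and
    # restore the converter options, e.g. when checkpointing a pipeline:
    #
    #     saved = converter.state        # {"to_numpy": ..., "to_torch": ...}
    #     restored = Converter()
    #     restored.load_state(saved)     # `restored` now mirrors `converter`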
    def convert(
        self, features: List[Feature]
    ) -> Tuple[MatrixLike, Sequence[MatrixLike]]:
        """
        Convert a list of Features to a matrix-like form, where

        1. The outermost dimension is always the batch dimension (i.e.,
           `len(output) == len(features)`).

        2. The type can be:

           2.1 A `List` of primitive `int` or another `List`

           2.2 A `numpy.ndarray`

           2.3 A `torch.Tensor`

        If `need_pad` in :class:`forte.data.converter.Feature` is True, all
        features will be padded with the `pad_value` stored inside each
        :class:`forte.data.converter.Feature`.

        If `to_numpy` is True, it will try to convert the data into
        `numpy.ndarray`. If `to_torch` is True, it will try to convert the
        data into `torch.Tensor`.

        Args:
            features: A list of :class:`forte.data.converter.Feature`

        Returns:
            A `Tuple` containing two elements.

            1. The first element is a `MatrixLike` type representing the
               batch of data.

            2. The second element is a sequence of `MatrixLike` masks, one
               per feature dimension.

        Example 1:

        .. code-block:: python

            data = [[1,2,3], [4,5], [6,7,8,9]]
            meta_data = {
                "pad_value": 0,
                "need_pad": True,
                "dim": 1,
                "dtype": np.int64
            }
            features = [Feature(i, meta_data=meta_data) for i in data]
            converter = Converter({"to_numpy": True, "to_torch": False})

            output_data, masks = converter.convert(features)

            # output_data is:
            # np.array([[1,2,3,0], [4,5,0,0], [6,7,8,9]], dtype=np.int64)

            # masks is:
            # [
            #     np.array([[1,1,1,0], [1,1,0,0], [1,1,1,1]],
            #              dtype=bool)
            # ]

        Example 2:

        .. code-block:: python

            data = [[[1,2,3], [4,5]], [[3]]]
            meta_data = {
                "pad_value": 0,
                "need_pad": True,
                "dim": 2,
                "dtype": np.int64
            }
            features = [Feature(i, meta_data=meta_data) for i in data]
            converter = Converter({"to_numpy": True, "to_torch": False})

            output_data, masks = converter.convert(features)

            # output_data is:
            # np.array([[[1,2,3], [4,5,0]], [[3,0,0], [0,0,0]]],
            #          dtype=np.int64)

            # masks is:
            # [
            #     np.array([[1,1], [1,0]], dtype=bool),
            #     np.array([[[1,1,1], [1,1,0]],
            #               [[1,0,0], [0,0,0]]], dtype=bool)
            # ]

        Example 3:

        .. code-block:: python

            data = [[1,2,3,0], [4,5,0,0], [6,7,8,9]]
            meta_data = {
                "pad_value": 0,
                "need_pad": False,
                "dim": 1,
                "dtype": torch.long
            }
            features = [Feature(i, meta_data=meta_data) for i in data]
            converter = Converter()

            output_data, _ = converter.convert(features)

            # output_data is:
            # torch.tensor([[1,2,3,0], [4,5,0,0], [6,7,8,9]],
            #              dtype=torch.long)

        Example 4:

        .. code-block:: python

            data = [[1,2,3], [4,5], [6,7,8,9]]
            meta_data = {
                "pad_value": 0,
                "need_pad": True,
                "dim": 1,
                "dtype": torch.long
            }
            features = [Feature(i, meta_data=meta_data) for i in data]
            converter = Converter({"to_torch": True})

            output_data, masks = converter.convert(features)

            # output_data is:
            # torch.tensor([[1,2,3,0], [4,5,0,0], [6,7,8,9]],
            #              dtype=torch.long)

            # masks is:
            # [
            #     torch.tensor([[1,1,1,0], [1,1,0,0], [1,1,1,1]],
            #                  dtype=torch.bool)
            # ]
        """
        if self.to_torch:
            try:
                import torch  # pylint: disable=import-outside-toplevel
            except ImportError as e:
                raise ImportError(
                    create_import_error_msg("torch", "extractor", "data module")
                ) from e

        dtype: Optional[np.dtype] = None
        need_pad: bool = features[0].need_pad

        if need_pad and self.to_torch and not self.to_numpy:
            logger.warning(
                "need_pad and to_torch are both True; the "
                "to_numpy=False setting will be ignored."
            )

        # Do padding if needed.
        if need_pad:
            dtype = self._padding(features)

        # Collect a batch of data & masks from Features.
        data_list: List[List[Any]] = []
        # masks_per_example_list:
        # (feature_num, feature_dim, feature_mask1, [feature_mask2, ...])
        masks_per_example_list: List[List[Any]] = []
        for feature in features:
            padded_feature, mask_list = feature.data
            data_list.append(padded_feature)
            masks_per_example_list.append(mask_list)

        # Switch the two outermost dimensions.
        # masks_list:
        # (feature_dim, feature_num, feature_mask1, [feature_mask2, ...])
        masks_list: List[List[Any]] = []
        for i in range(features[0].dim):
            curr_dim_masks = []
            for mask in masks_per_example_list:
                curr_dim_masks.append(mask[i])
            masks_list.append(curr_dim_masks)

        # Convert to the target type.
        if not self.to_numpy and not self.to_torch:
            return data_list, masks_list

        # Note: to_torch == True overwrites the to_numpy option.
        if self.to_torch:
            data_tensor: torch.Tensor = self._to_tensor_type(data_list, dtype)

            masks_tensor_list: List[torch.Tensor] = []
            for batch_masks_dim_i in masks_list:
                masks_tensor_list.append(
                    self._to_tensor_type(batch_masks_dim_i, torch.bool)
                )
            return data_tensor, masks_tensor_list

        if self.to_numpy:
            data_np: np.ndarray = self._to_numpy_type(data_list, dtype)

            masks_np_list: List[np.ndarray] = []
            for batch_masks_dim_i in masks_list:
                masks_np_list.append(
                    self._to_numpy_type(batch_masks_dim_i, bool)
                )
            return data_np, masks_np_list

        # Control should not reach here.
        raise RuntimeError("Invalid converter internal state")
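    # Illustrative sketch of the mask transposition above, for a batch of
    # two dim-2 features (mask names made up for illustration):
    #
    #     masks_per_example_list          # (feature_num, feature_dim, ...)
    #         = [[m1_a, m2_a], [m1_b, m2_b]]
    #     masks_list                      # (feature_dim, feature_num, ...)
    #         = [[m1_a, m1_b], [m2_a, m2_b]]
    #
    # i.e. masks_list[i] stacks the dimension-i masks of every example.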
    @staticmethod
    def _padding(features: List[Feature]) -> Optional[np.dtype]:
        # BFS over the feature trees to pad each dimension level by level.
        # Example for dim=2: level 0 pads the number of sub-features per
        # example; level 1 then pads each sub-feature to the global max
        # length at that level.
        queue: List[Feature] = []
        curr_max_len: int = -1
        dtype: Optional[np.dtype] = None

        for feature in features:
            if dtype is None:
                dtype = feature.dtype
            elif dtype != feature.dtype:
                raise ValidationError(
                    "The dtype should be the same within a batch of Features"
                )
            if not feature.need_pad:
                raise ValidationError(
                    "Inconsistent need_pad flag for a batch of Features"
                )
            queue.append(feature)
            curr_max_len = max(curr_max_len, len(feature))

        while len(queue) > 0:
            size: int = len(queue)
            next_max_len = -1

            while size > 0:
                feature = queue.pop(0)
                feature.pad(curr_max_len)

                if not feature.leaf_feature:
                    for sub_feature in feature.sub_features:
                        next_max_len = max(next_max_len, len(sub_feature))
                        queue.append(sub_feature)

                size -= 1

            curr_max_len = next_max_len

        return dtype

    @staticmethod
    def _to_numpy_type(data: List[Any], dtype) -> np.ndarray:
        return np.array(data, dtype=dtype)

    @staticmethod
    def _to_tensor_type(data: List[Any], dtype):
        try:
            import torch  # pylint: disable=import-outside-toplevel
        except ImportError as e:
            raise ImportError(
                create_import_error_msg("torch", "extractor", "data module")
            ) from e
        return torch.tensor(data, dtype=dtype)
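# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative only; it mirrors the docstring
# examples above and assumes `Converter` and `Feature` are importable from
# `forte.data.converter`):
#
#     import numpy as np
#     from forte.data.converter import Converter, Feature
#
#     meta_data = {
#         "pad_value": 0,
#         "need_pad": True,
#         "dim": 1,
#         "dtype": np.int64,
#     }
#     features = [Feature(d, meta_data=meta_data) for d in [[1, 2], [3]]]
#     converter = Converter({"to_numpy": True, "to_torch": False})
#     data, masks = converter.convert(features)
#     # data  -> np.array([[1, 2], [3, 0]], dtype=np.int64)
#     # masks -> [np.array([[1, 1], [1, 0]], dtype=bool)]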