# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import List, Tuple, Any, Optional, Union, Dict, Sequence
import numpy as np
from forte.utils import create_import_error_msg
from forte.common.configuration import Config
from forte.common import ValidationError
from forte.data.converter.feature import Feature
from forte.data.types import MatrixLike
logger = logging.getLogger(__name__)
__all__ = ["Converter"]
class Converter:
    """
    This class has the functionality of converting a batch of
    :class:`~forte.data.converter.Feature` to a `MatrixLike` type which can
    be a Numpy array, a PyTorch `Tensor`, or a nested list.

    It can also perform padding for the given batch of
    :class:`~forte.data.converter.Feature` if user requested it. Please
    refer to the `request` parameter in
    :class:`~forte.train_preprocessor.TrainPreprocessor` for details.

    Args:
        config: An instance of `Dict` or
            :class:`~forte.common.configuration.Config` that provides all
            configurable options. See :meth:`default_configs` for available
            options and default values.
    """

    def __init__(self, config: Optional[Union[Dict, Config]] = None):
        # `allow_new_hparam=True` lets callers pass options beyond the
        # defaults without the Config constructor rejecting them.
        self._config = Config(
            config,
            default_hparams=self.default_configs(),
            allow_new_hparam=True,
        )
[docs] @staticmethod
def default_configs():
"""
Returns a dictionary of default hyper-parameters.
.. code-block:: python
{
"to_numpy": True,
"to_torch": True
}
Here:
- `"to_numpy"`: bool
Whether convert to `numpy.ndarray`.
Default is True.
- `"to_torch"`: bool
Whether convert to `torch.tensor`. Default is True.
.. note::
If `need_pad` in :class:`forte.data.converter.Feature`
is False and `to_numpy` and `to_torch` is True,
it will raise an exception if the target data cannot be
converted to `numpy.ndarray` or `torch.tensor`.
.. note::
If `need_pad` in :class:`forte.data.converter.Feature`
is True and `to_torch` is True, `to_torch` will overwrite the
effect of `to_numpy`.
"""
return {"to_numpy": True, "to_torch": True}
    @property
    def to_numpy(self) -> bool:
        """Whether :meth:`convert` outputs `numpy.ndarray` (the
        ``"to_numpy"`` config option)."""
        return self._config.to_numpy
    @property
    def to_torch(self) -> bool:
        """Whether :meth:`convert` outputs `torch.tensor` (the
        ``"to_torch"`` config option)."""
        return self._config.to_torch
@property
def state(self) -> Dict:
return {"to_numpy": self.to_numpy, "to_torch": self.to_torch}
def load_state(self, state: Dict):
self._config.to_numpy = state["to_numpy"]
self._config.to_torch = state["to_torch"]
[docs] def convert(
self, features: List[Feature]
) -> Tuple[MatrixLike, Sequence[MatrixLike]]:
"""
Convert a list of Features to matrix-like form, where
1. The outer most dimension will always be the batch dimension (i.e
`len(output) = len(feature_num)`).
2. The type can be:
2.1 A `List` of primitive `int` or another `List`
2.2 A `numpy.ndarray`
2.3 A `torch.Tensor`
If `need_pad` in :class:`forte.data.converter.Feature` is True, it
will pad all features with given `pad_value` stored inside
:class:`forte.data.converter.Feature`.
If `to_numpy` is True, it will try to convert data into
`numpy.ndarray`.
If `to_torch` is True, it will try to convert data into
`torch.tensor`.
Args:
features:
A list of :class:`forte.data.converter.Feature`
Returns:
A `Tuple` containing two elements.
1. The first element is either a `MatrixLike` type representing the
batch of data.
2. The second element is a `MatrixLike` type representing
masks along different feature dimensions.
Example 1:
.. code-block:: python
data = [[1,2,3], [4,5], [6,7,8,9]]
meta_data = {
"pad_value": 0,
"need_pad": True,
"dim": 1
"dtype": np.long
}
features = [Feature(i, meta_data=meta_data) for i in data]
converter = Converter(to_numpy=True,
to_torch=False)
output_data, masks = converter.convert(features)
# output_data is:
# np.array([[1,2,3,0], [4,5,0,0], [6,7,8,9]], dtype=np.long)
# masks is:
# [
# np.array([[1,1,1,0], [1,1,0,0], [1,1,1,1]],
# dtype=np.bool)
# ]
Example 2:
.. code-block:: python
data = [[[1,2,3], [4,5]], [[3]]]
meta_data = {
"pad_value": 0,
"need_pad": True,
"dim": 2
"dtype": np.long
}
features = [Feature(i, meta_data=meta_data) for i in data]
converter = Converter(to_numpy=True,
to_torch=False)
output_data, masks = converter.convert(features)
# output_data is:
# np.array([[[1,2,3], [4,5,0]], [[3,0,0], [0,0,0]]],
# dtype=np.long)
# masks is:
# [
# np.array([[1,1], [1,0]], dtype=np.bool),
# np.array([[[1,1,1], [1,1,0]],
# [[1,0,0], [0,0,0]]], dtype=np.bool)
# ]
Example 3:
.. code-block:: python
data = [[1,2,3,0], [4,5,0,0], [6,7,8,9]]
meta_data = {
"pad_value": 0
"need_pad": False,
"dim": 1
"dtype": np.long
}
features = [Feature(i, meta_data=meta_data) for i in data]
converter = Converter(need_pad=False)
output_data, _ = converter.convert(features)
# output_data is:
# torch.tensor([[1,2,3,0], [4,5,0,0], [6,7,8,9]], dtype=torch.long)
Example 4:
.. code-block:: python
data = [[1,2,3], [4,5], [6,7,8,9]]
meta_data = {
"pad_value": 0,
"need_pad": True,
"dim": 1
"dtype": np.long
}
features = [Feature(i, meta_data=meta_data) for i in data]
converter = Converter(to_torch=True)
output_data, masks = converter.convert(features)
# output_data is:
# torch.tensor([[1,2,3,0], [4,5,0,0], [6,7,8,9]], dtype=torch.long)
# masks is:
# [
# torch.tensor([[1,1,1,0], [1,1,0,0], [1,1,1,1]],
# dtype=np.bool)
# ]
"""
if self.to_torch:
try:
import torch # pylint: disable=import-outside-toplevel
except ImportError as e:
raise ImportError(
create_import_error_msg("torch", "extractor", "data module")
) from e
dtype: Optional[np.dtype] = None
need_pad: bool = features[0].need_pad
if need_pad and self.to_torch and not self.to_numpy:
logger.warning(
"need_pad is True and to_torch is True, "
"setting to_numpy to False will be ignored."
)
# Do padding if needed.
if need_pad:
dtype = self._padding(features)
# Collect a batch of data & masks from Features
data_list: List[List[Any]] = []
# batch_masks_per_example:
# (feature_num, feature_dim, feature_mask1, [feature_mask2, ...])
masks_per_example_list: List[List[Any]] = []
for feature in features:
padded_feature, mask_list = feature.data
data_list.append(padded_feature)
masks_per_example_list.append(mask_list)
# Switch the two outer most dimensions
# batch_list:
# (feature_dim, feature_num, feature_mask1, [feature_mask2, ...])
masks_list: List[List[Any]] = []
for i in range(features[0].dim):
curr_dim_masks = []
for mask in masks_per_example_list:
curr_dim_masks.append(mask[i])
masks_list.append(curr_dim_masks)
# Convert to target type
if not self.to_numpy and not self.to_torch:
return data_list, masks_list
# Note: to_torch == True overwrite to_numpy option
if self.to_torch:
data_tensor: torch.Tensor = self._to_tensor_type(data_list, dtype)
masks_tensor_list: List[torch.Tensor] = []
for batch_masks_dim_i in masks_list:
masks_tensor_list.append(
self._to_tensor_type(batch_masks_dim_i, np.bool)
)
return data_tensor, masks_tensor_list
if self.to_numpy:
data_np: np.ndarray = self._to_numpy_type(data_list, dtype)
masks_np_list: List[np.ndarray] = []
for batch_masks_dim_i in masks_list:
masks_np_list.append(
self._to_numpy_type(batch_masks_dim_i, np.bool)
)
return data_np, masks_np_list
# Control should not reach here
raise RuntimeError("Invalid converter internal state")
@staticmethod
def _padding(features: List[Feature]):
try:
import torch # pylint: disable=import-outside-toplevel
except ImportError as e:
raise ImportError(
create_import_error_msg(
"torch", "extractor", "the extractor system"
)
) from e
# BFS to pad each dimension
queue: List[Feature] = []
curr_max_len: int = -1
dtype: Optional[torch.dtype] = None
for feature in features:
if not dtype:
dtype = feature.dtype
else:
if dtype != feature.dtype:
raise ValidationError(
"The dtype should be same within a batch of Features"
)
if not feature.need_pad:
raise ValidationError(
"Inconsistent need pad flag for a batch of Features"
)
queue.append(feature)
curr_max_len = max(curr_max_len, len(feature))
while len(queue) > 0:
size: int = len(queue)
next_max_len = -1
while size > 0:
feature = queue.pop(0)
feature.pad(curr_max_len)
if not feature.leaf_feature:
for sub_feature in feature.sub_features:
next_max_len = max(next_max_len, len(sub_feature))
queue.append(sub_feature)
size -= 1
curr_max_len = next_max_len
return dtype
@staticmethod
def _to_numpy_type(data: List[Any], dtype) -> np.ndarray:
return np.array(data, dtype=dtype)
@staticmethod
def _to_tensor_type(data: List[Any], dtype):
try:
import torch # pylint: disable=import-outside-toplevel
except ImportError as e:
raise ImportError(
create_import_error_msg("torch", "extractor", "data module")
) from e
return torch.tensor(data, dtype=dtype)