
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implements CharExtractor, which is used to extract features
from the characters of a piece of text.
"""
import logging
from typing import Optional

from forte.common import ProcessorConfigError
from forte.data.data_pack import DataPack
from forte.data.converter.feature import Feature
from forte.data.base_extractor import BaseExtractor
from forte.data.ontology import Annotation

logger = logging.getLogger(__name__)

__all__ = ["CharExtractor"]


class CharExtractor(BaseExtractor):
    r"""CharExtractor extracts features from the text of entries.
    The text of each entry is split into individual characters.
    """
    @classmethod
    def default_configs(cls):
        r"""Returns a dictionary of default configuration parameters.

        Here:

        - "max_char_length": int
          The maximum number of characters kept for one token in the text.
          Default is None, which means no limit will be set.

        - "entry_type": str
          The fully qualified name of an annotation type entry. Characters
          will be extracted based on these entries. Default is
          `ft.onto.base_ontology.Token`, which means the characters of
          tokens will be extracted.
        """
        config = super().default_configs()
        config.update(
            {
                "max_char_length": None,
                "entry_type": "ft.onto.base_ontology.Token",
            }
        )
        return config
    def update_vocab(
        self, pack: DataPack, context: Optional[Annotation] = None
    ):
        r"""Add all characters into the vocabulary.

        Args:
            pack: The input data pack.
            context: The context is an Annotation entry where features
                will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is
                None.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )

        word: Annotation
        for word in pack.get(self.config.entry_type, context):
            # Register every character of the entry's text in the vocabulary.
            for char in word.text:  # type: ignore
                self.add(char)
    def extract(
        self, pack: DataPack, context: Optional[Annotation] = None
    ) -> Feature:
        r"""Extract the character feature of one instance.

        Args:
            pack: The datapack to extract features from.
            context: The context is an Annotation entry where features
                will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is
                None.

        Returns:
            A :class:`Feature` that contains the characters of each
            specified annotation.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )

        data = []
        entry: Annotation
        for entry in pack.get(self.config.entry_type, context):
            # Truncate each entry's text to at most `max_char_length`
            # characters when a limit is configured.
            if self.config.max_char_length is not None:
                max_char_length = min(
                    self.config.max_char_length,
                    len(entry.text),  # type: ignore
                )
            else:
                max_char_length = len(entry.text)  # type: ignore

            characters = entry.text[:max_char_length]  # type: ignore

            # Map characters to vocabulary indices when a vocabulary is
            # available; otherwise keep the raw characters.
            if self.vocab:
                data.append(
                    [self.element2repr(char) for char in characters]
                )
            else:
                data.append(list(characters))

        meta_data = {
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 2,
            "dtype": int if self.vocab else str,
        }
        return Feature(data=data, metadata=meta_data, vocab=self.vocab)
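

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; not part of the library module).
# It assumes that `BaseExtractor.initialize` accepts a plain config dict,
# that `DataPack.add_entry` is used to register annotations, and that
# `ft.onto.base_ontology.Token` is installed; adjust to the Forte version
# in use.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from ft.onto.base_ontology import Token

    pack = DataPack()
    pack.set_text("hello world")
    pack.add_entry(Token(pack, 0, 5))  # "hello"
    pack.add_entry(Token(pack, 6, 11))  # "world"

    extractor = CharExtractor()
    extractor.initialize({"max_char_length": 10})

    # Build the character vocabulary from the pack, then extract a 2-D
    # feature shaped [number of tokens x characters per token].
    extractor.update_vocab(pack)
    feature = extractor.extract(pack)
    print(feature)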