# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implements BioSeqTaggingExtractor, which is used to extract
features from tagging labels.
"""
import logging
from typing import Tuple, List, Dict, Union, Optional, Iterable, Type

from torch import Tensor

from forte.common import ProcessorConfigError
from forte.common.configuration import Config
from forte.data.base_extractor import BaseExtractor
from forte.data.converter.feature import Feature
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.data.ontology.core import Entry
from forte.datasets.conll.conll_utils import bio_tagging
from forte.utils import get_class

logger = logging.getLogger(__name__)

__all__ = ["BioSeqTaggingExtractor"]


class BioSeqTaggingExtractor(BaseExtractor):
    r"""BioSeqTaggingExtractor extracts the feature by performing BIO
    encoding on the attribute of an entry and aligning it to the
    tagging_unit entries. Most of the time, a user will not need to call
    this class explicitly; it will be called by the framework.
    """

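    # Illustrative example (assumed sample text, not taken from any dataset):
    # with `ft.onto.base_ontology.Token` as the tagging unit and
    # `EntityMention.ner_type` as the attribute, the sentence
    # "John lives in New York" is encoded as one (tag, indicator) pair per
    # token:
    #
    #   John  -> ("PER", "B")
    #   lives -> (None,  "O")
    #   in    -> (None,  "O")
    #   New   -> ("LOC", "B")
    #   York  -> ("LOC", "I")
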
    def initialize(self, config: Union[Dict, Config]):
        """
        Initialize the extractor based on the provided configuration.

        Args:
            config: The configuration of the extractor. It can be a `Dict` or
                :class:`~forte.common.configuration.Config`. See
                :meth:`default_configs` for available options and
                default values.
        """
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config=config)
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )
        if self.config.attribute is None:
            raise AttributeError(
                "attribute is required in BioSeqTaggingExtractor."
            )
        if not self.config.tagging_unit:
            raise AttributeError(
                "tagging_unit is required in BioSeqTaggingExtractor."
            )
        self._attribute: str = self.config.attribute
        self._tagging_unit: Type[Annotation] = get_class(
            self.config.tagging_unit
        )
        self._entry_type: Type[Annotation] = get_class(self.config.entry_type)

    @classmethod
    def default_configs(cls):
        r"""Returns a dictionary of default hyper-parameters.

        Here, the following parameters are added on top of those from the
        parent class:

        - entry_type (str): Required. The fully qualified name of the
          Annotation entry whose attribute is extracted. For example, for
          an NER task, it could be `ft.onto.base_ontology.EntityMention`.

        - attribute (str): Required. The attribute name of the entry from
          which labels are extracted.

        - tagging_unit (str): Required. The fully qualified name of the unit
          used for tagging. The tagging labels will be aligned to these
          units, e.g. `ft.onto.base_ontology.Token`.

        - pad_value (int): A customized value/representation to be used for
          padding. This value is only needed when `use_pad` is True.
          Default is -100, following the PyTorch convention.

        - is_bert (bool): Indicates whether a BERT model is used. If True,
          padding will be added to the beginning and end of a sentence,
          corresponding to the special tokens ([CLS], [SEP]) used in BERT.
          Default is False.

        For example, the config can be:

        .. code-block:: python

            {
                "entry_type": "ft.onto.base_ontology.EntityMention",
                "attribute": "ner_type",
                "tagging_unit": "ft.onto.base_ontology.Token"
            }

        The extractor will extract the BIO NER tags for instances.
        A possible feature can be:

        .. code-block:: python

            [[None, "O"], ["LOC", "B"], ["LOC", "I"], [None, "O"],
             [None, "O"], ["PER", "B"], [None, "O"]]

        """
        config = super().default_configs()
        config.update(
            {
                "entry_type": None,
                "attribute": None,
                "tagging_unit": "",
                "pad_value": -100,
                "is_bert": False,
            }
        )
        return config

    @classmethod
    def _bio_variance(cls, tag: str):
        r"""Return the BIO-augmented variants of a tag. For example, if
        `tag` is "person", the output would be
        `[("person", "B"), ("person", "I"), (None, "O")]`. Currently only
        the B, I, O labels are supported.

        Args:
            tag: Tag name.
        """
        return [(tag, "B"), (tag, "I"), (None, "O")]

    def predefined_vocab(self, predefined: Iterable):
        r"""Add predefined tags into the vocabulary, i.e. one can construct
        the tag vocabulary without exploring the training data.

        Args:
            predefined: A set of pre-defined tags.
        """
        for tag in predefined:
            for element in self._bio_variance(tag):
                self.add(element)

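    # Example usage of `predefined_vocab` with a hypothetical tag set:
    #
    #     extractor.predefined_vocab({"PER", "ORG", "LOC"})
    #
    # registers the ("PER", "B"), ("PER", "I"), ..., (None, "O") variants
    # without reading any training pack.
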
    def update_vocab(
        self, pack: DataPack, context: Optional[Annotation] = None
    ):
        r"""Add all the tags from one instance into the vocabulary.

        Args:
            pack: The datapack that contains the current instance.

            context: The context is an Annotation entry where features will
                be extracted within its range. If None, then the whole data
                pack will be used as the context. Default is None.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )
        anno: Annotation
        for anno in pack.get(self.config.entry_type, context):
            for tag_variance in self._bio_variance(
                getattr(anno, self._attribute)
            ):
                self.add(tag_variance)

    def extract(
        self, pack: DataPack, context: Optional[Annotation] = None
    ) -> Feature:
        r"""Extract the sequence tagging feature of one instance. If the
        vocabulary of this extractor is set, then the extracted tag sequence
        will be converted to tag ids (int).

        Args:
            pack: The datapack that contains the current instance.

            context: The context is an Annotation entry where features will
                be extracted within its range. If None, then the whole data
                pack will be used as the context. Default is None.

        Returns (Feature):
            A feature that contains the extracted BIO tag sequence and
            other metadata.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )
        instance_tagged: List[Tuple[Optional[str], str]] = bio_tagging(
            pack,
            self.config.tagging_unit,
            self.config.entry_type,
            self.config.attribute,
            context,
        )
        pad_value = self.get_pad_value()
        if self.vocab:
            # Use the vocabulary to map data into representation.
            vocab_mapped: List[Union[int, List[int]]] = []
            for pair in instance_tagged:
                vocab_mapped.append(self.element2repr(pair))
            raw_data: List = vocab_mapped
            if self.config.is_bert:
                raw_data = [pad_value] + raw_data + [pad_value]
            need_pad = self.vocab.use_pad
        else:
            # When the vocabulary is not available, use the original data.
            raw_data = instance_tagged
            need_pad = self.config.need_pad

        meta_data = {
            "need_pad": need_pad,
            "pad_value": pad_value,
            "dim": 1,
            "dtype": int if self.vocab else tuple,
        }
        return Feature(data=raw_data, metadata=meta_data, vocab=self.vocab)

    def pre_evaluation_action(
        self, pack: DataPack, context: Optional[Annotation] = None
    ):
        r"""This function is performed on the pack before the evaluation
        stage, allowing one to perform some actions beforehand. By default,
        this function will remove the tags in the instance. You can override
        this function if needed.

        Args:
            pack: The datapack to be processed.

            context: The context is an Annotation entry where data are
                extracted within its range. If None, then the whole data
                pack will be used as the context. Default is None.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )
        all_entries: List[Entry] = []
        entry: Entry
        for entry in pack.get(self.config.entry_type, context):
            all_entries.append(entry)

        for e in all_entries:
            pack.delete_entry(e)

    def add_to_pack(
        self,
        pack: DataPack,
        predictions: List[int],
        context: Optional[Annotation] = None,
    ):
        r"""Add the prediction results to the data pack. We make the
        following assumptions about the predictions:

        1. If we encounter an "I" whose tag differs from the previous tag,
           we treat this "I" as a "B" and start a new tag here.

        2. The predictions are truncated to the number of tagging-unit
           entries, which removes any trailing `<PAD>` elements.

        Args:
            pack: The datapack that contains the current instance.

            predictions: The output of the model, which contains the
                attribute label indices for one instance.

            context: The context is an Annotation entry where features will
                be extracted within its range. If None, then the whole data
                pack will be used as the context. Default is None.
        """
        if self.config is None:
            raise ProcessorConfigError(
                "Configuration for the extractor not found."
            )
        instance_tagging_unit: List[Annotation] = list(
            pack.get(self._tagging_unit, context)
        )

        if self.config.is_bert:
            predictions = predictions[1:-1]

        predictions = predictions[: len(instance_tagging_unit)]
        if isinstance(predictions, Tensor):
            predictions = predictions.cpu().numpy()

        tags = [self.id2element(x) for x in predictions]

        tag_start = None
        tag_end = None
        tag_type = None
        for entry, tag in zip(instance_tagging_unit, tags):
            if (
                tag[1] == "O"
                or tag[1] == "B"
                or (tag[1] == "I" and tag[0] != tag_type)
            ):
                if tag_type:
                    entity_mention = self._entry_type(pack, tag_start, tag_end)
                    setattr(entity_mention, self._attribute, tag_type)
                tag_start = entry.begin
                tag_end = entry.end
                tag_type = tag[0]
            else:
                tag_end = entry.end

        # Handle the final tag.
        if tag_type and tag_start and tag_end:
            entity_mention = self._entry_type(
                pack, tag_start, tag_end  # type: ignore
            )
            setattr(entity_mention, self._attribute, tag_type)

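# A minimal end-to-end sketch (assumptions: `train_pack` is an already
# populated DataPack containing `Sentence`, `Token` and `EntityMention`
# annotations from `ft.onto.base_ontology`; the config values follow the
# example in `default_configs`):
#
#     from ft.onto.base_ontology import Sentence
#
#     extractor = BioSeqTaggingExtractor()
#     extractor.initialize(
#         {
#             "entry_type": "ft.onto.base_ontology.EntityMention",
#             "attribute": "ner_type",
#             "tagging_unit": "ft.onto.base_ontology.Token",
#         }
#     )
#     # Build the tag vocabulary from the training pack, then extract one
#     # Feature per sentence; with the vocabulary set, tags come out as ids.
#     for sent in train_pack.get(Sentence):
#         extractor.update_vocab(train_pack, sent)
#     features = [
#         extractor.extract(train_pack, sent)
#         for sent in train_pack.get(Sentence)
#     ]
#     # At prediction time, write the model's token-level tag ids back into
#     # a pack, creating EntityMention entries:
#     #     extractor.add_to_pack(test_pack, predictions, sent)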