# Source code for forte.processors.misc.delete_overlap_entries

# Copyright 2021 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A processor to delete overlapping annotations."""

__all__ = [
    "DeleteOverlapEntry",
]

from typing import List

from forte.common import Resources
from forte.common.configuration import Config
from forte.common.exception import ProcessorConfigError
from forte.data import Span
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.processors.base import PackProcessor
from forte.utils import get_class


class DeleteOverlapEntry(PackProcessor):
    """
    A processor to delete overlapping annotations in a data pack. When
    overlapping, the first annotation (based on the iteration order) will be
    kept and the rest of them will be removed.

    The annotation type to examine is selected with the ``entry_type``
    config, see :meth:`default_configs`.
    """

    # pylint: disable=attribute-defined-outside-init,unused-argument
    def initialize(self, resources: Resources, configs: Config):
        """
        Resolve the configured ``entry_type`` name into a class and store it
        as ``self.entry_type``.

        Args:
            resources: Passed through to the parent ``initialize``.
            configs: Passed through to the parent ``initialize``; the
                resulting ``self.configs.entry_type`` is read afterwards.

        Raises:
            ProcessorConfigError: If ``entry_type`` is not set (falsy).
            AttributeError: If the resolved class is not a sub-class of
                ``Annotation``.
        """
        super().initialize(resources, configs)

        if not self.configs.entry_type:
            # The config key is `entry_type`; any Annotation sub-class is
            # accepted, not only entity mentions.
            raise ProcessorConfigError(
                "Please specify an entry type via the `entry_type` config!"
            )

        self.entry_type = get_class(self.configs.entry_type)

        if not issubclass(self.entry_type, Annotation):
            raise AttributeError(
                f"The entry type to delete [{self.entry_type}] "
                f"is not a sub-class of "
                f"'forte.data.ontology.top.Annotation' class."
            )

    def _process(self, input_pack: DataPack):
        """
        Delete every ``self.entry_type`` annotation in ``input_pack`` whose
        span overlaps the span of the most recently kept annotation.

        Args:
            input_pack: The data pack whose annotations are filtered in
                place.
        """
        # Only the most recently kept span is ever compared against, so a
        # single variable is enough.
        # NOTE(review): comparing against the last kept span only is
        # sufficient when `input_pack.get` yields annotations ordered by
        # span -- confirm against the DataPack implementation.
        last_kept_span = None

        # Materialize the entries first: the pack is mutated (entries are
        # deleted) while we walk over them.
        entries: List[Annotation] = list(input_pack.get(self.entry_type))

        for entry in entries:
            current_span = entry.span
            if last_kept_span is not None and self._is_overlap(
                last_kept_span, current_span
            ):
                input_pack.delete_entry(entry)
            else:
                last_kept_span = current_span

    def _is_overlap(self, interval1: Span, interval2: Span) -> bool:
        """
        Determine whether two intervals overlap with each other.

        The argument order does not matter: the intervals are first arranged
        so that the one with the smaller ``begin`` comes first.

        Args:
            interval1: One of the spans to compare.
            interval2: The other span to compare.

        Returns:
            True if the later-starting interval begins at or before the
            ``end`` of the earlier-starting one, False otherwise.
        """
        if interval2.begin < interval1.begin:
            interval1, interval2 = interval2, interval1
        # NOTE(review): the comparison is inclusive, so two spans that merely
        # touch (second.begin == first.end) count as overlapping. If
        # `Span.end` is an exclusive offset this may be stricter than
        # intended -- confirm before changing.
        return interval2.begin <= interval1.end

    @classmethod
    def default_configs(cls):
        """
        The ``entry_type`` config determines which type of annotation to be
        checked for overlap. This value should be the full name of a class
        that is a sub-class of :class:`~forte.data.ontology.top.Annotation`.

        The default is ``None``, which must be overridden: `initialize`
        raises a `ProcessorConfigError` when the value is not set, and an
        `AttributeError` when the named class is not a sub-class of
        ``Annotation``.

        Returns:
            A dictionary with the default configuration, i.e.
            ``{"entry_type": None}``.
        """
        return {
            "entry_type": None,
        }