Source code for forte.data.data_pack

# Copyright 2019 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import logging
from pathlib import Path
from typing import (
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Type,
    Union,
    Any,
    Set,
    Callable,
    Tuple,
    cast,
)

import numpy as np
from sortedcontainers import SortedList
from forte.common.exception import (
    ProcessExecutionException,
    UnknownOntologyClassException,
)
from forte.common.constants import TID_INDEX, BEGIN_ATTR_NAME, END_ATTR_NAME
from forte.data import data_utils_io
from forte.data.data_store import DataStore
from forte.data.entry_converter import EntryConverter
from forte.data.base_pack import BaseMeta, BasePack
from forte.data.index import BaseIndex
from forte.data.ontology.core import Entry
from forte.data.ontology.core import EntryType
from forte.data.ontology.top import (
    Annotation,
    Link,
    Group,
    Generics,
    AudioAnnotation,
    Payload,
    AudioPayload,
    TextPayload,
    ImagePayload,
    SinglePackEntries,
    AnnotationLikeEntries,
)

from forte.data.modality import Modality
from forte.data.span import Span
from forte.data.types import ReplaceOperationsType, DataRequest
from forte.utils import get_class, get_full_module_name

logger = logging.getLogger(__name__)

__all__ = ["Meta", "DataPack", "DataIndex"]


[docs]class Meta(BaseMeta): r"""Basic Meta information associated with each instance of :class:`~forte.data.data_pack.DataPack`. Args: pack_name: An name to identify the data pack, which is helpful in situation like serialization. It is suggested that the packs should have different doc ids. language: The language used by this data pack, default is English. span_unit: The unit used for interpreting the Span object of this data pack. Default is character. sample_rate: An integer specifying the sample rate of audio payload. Default is None. info: Store additional string based information that the user add. Attributes: pack_name: storing the provided `pack_name`. language: storing the provided `language`. sample_rate: storing the provided `sample_rate`. info: storing the provided `info`. record: Initialized as a dictionary. This is not a required field. The key of the record should be the entry type and values should be attributes of the entry type. All the information would be used for consistency checking purpose if the pipeline is initialized with `enforce_consistency=True`. """ def __init__( self, pack_name: Optional[str] = None, language: str = "eng", span_unit: str = "character", sample_rate: Optional[int] = None, info: Optional[Dict[str, str]] = None, ): super().__init__(pack_name) self.language = language self.span_unit = span_unit self.sample_rate: Optional[int] = sample_rate self.record: Dict[str, Set[str]] = {} self.info: Dict[str, str] if info is None: self.info = {} else: self.info = info
def as_entry_type(entry_type: Union[str, Type[EntryType]]): entry_type_: Type[EntryType] if isinstance(entry_type, str): entry_type_ = get_class(entry_type) if not issubclass(entry_type_, Entry): raise ValueError( f"The specified entry type [{entry_type}] " f"does not correspond to a " f"`forte.data.ontology.core.Entry` class" ) else: entry_type_ = entry_type return entry_type_ def as_sorted_error_check(entries: List[EntryType]) -> SortedList: """ Given a list of entries, return a sorted list of it. If unknown entry classes are seen during this process, a :class:`~forte.common.exception.UnknownOntologyClassException` exception will be thrown. Args: entries: A list of entries to be converted. Returns: Sorted list of the input entries. """ try: return SortedList(entries) except TypeError as e: for entry in entries: if isinstance(entry, Dict) and "py/object" in entry: entry_class = entry["py/object"] try: get_class(entry_class) except ValueError: raise UnknownOntologyClassException( f"Cannot deserialize ontology type {entry_class}, " f"make sure it is included in the PYTHONPATH." ) from e
[docs]class DataPack(BasePack[Entry, Link, Group]): # pylint: disable=too-many-public-methods, unused-private-member r"""A :class:`~forte.data.data_pack.DataPack` contains a piece of natural language text and a collection of NLP entries (annotations, links, and groups). The natural language text could be a document, paragraph or in any other granularity. Args: pack_name: A name for this data pack. """ def __init__(self, pack_name: Optional[str] = None): super().__init__(pack_name) self._data_store: DataStore = DataStore() self._entry_converter: EntryConverter = EntryConverter() self.text_payloads: List[Payload] = [] self.audio_payloads: List[Payload] = [] self.image_payloads: List[Payload] = [] self._index: DataIndex = DataIndex() def __getstate__(self): r""" In serialization, 1) will remove ``_entry_converter`` to save space. """ state = super().__getstate__() state.pop("_entry_converter") return state def __setstate__(self, state): r""" In deserialization, we 1) Perform pack version compatibility checking; 2) initialize the entry converter 3) initialize the indexes. 4) Obtain the pack ids. """ self._entry_converter = EntryConverter() super().__setstate__(state) for payload in ( self.text_payloads + self.audio_payloads + self.image_payloads ): payload.set_pack(self) self._index = DataIndex() self._index.update_basic_index(list(iter(self))) def __iter__(self): yield from self.annotations yield from self.links yield from self.groups yield from self.generics yield from self.audio_annotations def _init_meta(self, pack_name: Optional[str] = None) -> Meta: return Meta(pack_name) def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) @property def text(self) -> str: """ Get the first text data stored in the DataPack. If there is no text payload in the DataPack, it will return empty string. Args: text_payload_index: the index of the text payload. Defaults to 0. Raises: ValueError: raised when the index is out of bound of the text payload list. Returns: text data in the text payload. """ if len(self.text_payloads) > 0: return str(self.get_payload_data_at(Modality.Text, 0)) else: return "" @property def audio(self): r""" Return the audio data from the first audio payload in the DataPack. """ return self.get_payload_data_at(Modality.Audio, 0) @property def image(self): r""" Return the image data from the first image payload in the data pack. """ return self.get_image(0)
[docs] def get_image(self, index: int): """ Return the image data from the image payload at the specified index. Args: index: image payload index for retrieving the image data. Returns: image payload data at the specified index. """ return self.get_payload_data_at(Modality.Image, index)
@property def all_annotations(self) -> Iterator[Annotation]: """ An iterator of all annotations in this data pack. Returns: Iterator of all annotations, of type :class:`~forte.data.ontology.top.Annotation`. """ for entry in self._data_store.all_entries( "forte.data.ontology.top.Annotation" ): yield self.get_entry(tid=entry[TID_INDEX]) # type: ignore @property def num_annotations(self) -> int: """ Number of annotations in this data pack. Returns: (int) Number of the links. """ return self._data_store.num_entries( "forte.data.ontology.top.Annotation" ) @property def all_links(self) -> Iterator[Link]: """ An iterator of all links in this data pack. Returns: Iterator of all links, of type :class:`~forte.data.ontology.top.Link`. """ for entry in self._data_store.all_entries( "forte.data.ontology.top.Link" ): yield self.get_entry(tid=entry[TID_INDEX]) # type: ignore @property def num_links(self) -> int: """ Number of links in this data pack. Returns: Number of the links. """ return self._data_store.num_entries("forte.data.ontology.top.Link") @property def all_groups(self) -> Iterator[Group]: """ An iterator of all groups in this data pack. Returns: Iterator of all groups, of type :class:`~forte.data.ontology.top.Group`. """ for entry in self._data_store.all_entries( "forte.data.ontology.top.Group" ): yield self.get_entry(tid=entry[TID_INDEX]) # type: ignore @property def num_groups(self): """ Number of groups in this data pack. Returns: Number of groups. """ return self._data_store.num_entries("forte.data.ontology.top.Group") @property def all_generic_entries(self) -> Iterator[Generics]: """ An iterator of all generic entries in this data pack. Returns: Iterator of generic """ for entry in self._data_store.all_entries( "forte.data.ontology.top.Generics" ): yield self.get_entry(tid=entry[TID_INDEX]) # type: ignore @property def num_generics_entries(self): """ Number of generics entries in this data pack. Returns: Number of generics entries. """ return self._data_store.num_entries("forte.data.ontology.top.Generics") @property def all_audio_annotations(self) -> Iterator[AudioAnnotation]: """ An iterator of all audio annotations in this data pack. Returns: Iterator of all audio annotations, of type :class:`~forte.data.ontology.top.AudioAnnotation`. """ for entry in self._data_store.all_entries( "forte.data.ontology.top.AudioAnnotation" ): yield self.get_entry(tid=entry[TID_INDEX]) # type: ignore @property def num_audio_annotations(self): """ Number of audio annotations in this data pack. Returns: Number of audio annotations. """ return self._data_store.num_entries( "forte.data.ontology.top.AudioAnnotation" ) @property def annotations(self): """ A SortedList container of all annotations in this data pack. Returns: SortedList of all annotations, of type :class:`~forte.data.ontology.top.Annotation`. """ return SortedList(self.all_annotations) @property def generics(self): """ A SortedList container of all generic entries in this data pack. Returns: SortedList of generics """ return SortedList(self.all_generic_entries) @property def audio_annotations(self): """ A SortedList container of all audio annotations in this data pack. Returns: SortedList of all audio annotations, of type :class:`~forte.data.ontology.top.AudioAnnotation`. """ return SortedList(self.all_audio_annotations) @property def links(self): """ A List container of all links in this data pack. Returns: List of all links, of type :class:`~forte.data.ontology.top.Link`. """ return SortedList(self.all_links) @property def groups(self): """ A List container of all groups in this data pack. Returns: List of all groups, of type :class:`~forte.data.ontology.top.Group`. """ return SortedList(self.all_groups) @groups.setter def groups(self, val): self._groups = val
[docs] def get_payload_at( self, modality: IntEnum, payload_index: int ): # -> Union[TextPayload, AudioPayload, ImagePayload]: """ Get Payload of requested modality at the requested payload index. Args: modality: data modality among "text", "audio", "image" payload_index: the zero-based index of the Payload in this DataPack's Payload entries of the requested modality. Raises: ValueError: raised when the requested modality is not supported. Returns: Payload entry containing text data, image or audio data. """ supported_modality = [enum.name for enum in Modality] payloads_length = -1 try: if modality == Modality.Text: payloads_length = len(self.text_payloads) payload = self.text_payloads[payload_index] elif modality == Modality.Audio: payloads_length = len(self.audio_payloads) payload = self.audio_payloads[payload_index] elif modality == Modality.Image: payloads_length = len(self.image_payloads) payload = self.image_payloads[payload_index] else: raise ValueError( f"Provided modality {modality.name} is not supported." "Please provide one of modality among" f" {supported_modality}." ) except IndexError as e: raise ProcessExecutionException( f"payload index ({payload_index}) " f"is larger or equal to {modality.name} payload list" f" length ({payloads_length}). " f"Please input a {modality.name} payload index less than it." ) from e return payload
[docs] def get_payload_data_at( self, modality: IntEnum, payload_index: int ) -> Union[str, np.ndarray]: """ Get Payload of requested modality at the requested payload index. Args: modality: data modality among "text", "audio", "image" payload_index: the zero-based index of the Payload in this DataPack's Payload entries of the requested modality. Raises: ValueError: raised when the requested modality is not supported. Returns: different data types for different data modalities. 1. str data for text data. 2. Numpy array for image and audio data. """ return self.get_payload_at(modality, payload_index).cache
[docs] def get_span_text( self, begin: int, end: int, text_payload_index: int = 0 ) -> str: r"""Get the text in the data pack contained in the span. Args: begin: begin index to query. end: end index to query. text_payload_index: the zero-based index of the TextPayload in this DataPack's TextPayload entries. Defaults to 0. Returns: The text within this span. """ return cast( str, self.get_payload_data_at(Modality.Text, text_payload_index) )[begin:end]
[docs] def get_span_audio( self, begin: int, end: int, audio_payload_index=0 ) -> np.ndarray: r"""Get the audio in the data pack contained in the span. `begin` and `end` represent the starting and ending indices of the span in audio payload respectively. Each index corresponds to one sample in audio time series. Args: begin: begin index to query. end: end index to query. audio_payload_index: the zero-based index of the AudioPayload in this DataPack's AudioPayload entries. Defaults to 0. Returns: The audio within this span. """ return cast( np.ndarray, self.get_payload_data_at(Modality.Audio, audio_payload_index)[ begin:end ], )
[docs] def add_text(self, text): """ Add a text payload to this data pack. Args: text: Text to be added. """ ip = TextPayload(self) ip.cache = text
[docs] def set_text( self, text: str, replace_func: Optional[Callable[[str], ReplaceOperationsType]] = None, text_payload_index: int = 0, ): """ Set text for TextPayload at a specified index or add a new TextPayload in the DataPack. Raises: ValueError: raised when the text payload index is out of range. Args: text: the input text to be assigned to this pack. replace_func: function that replace text. Defaults to None. text_payload_index: the zero-based index of to locate a TextPayload in this DataPack, default 0. This allows one to set multiple texts per DataPack. A DataPack by default contains one such TextPayload, if the `text_payload_index` is larger than 0, then more than one TextPayload need to be added before this, otherwise """ span_ops = [] if replace_func is None else replace_func(text) # The spans should be mutually exclusive ( text, replace_back_operations, processed_original_spans, orig_text_len, ) = data_utils_io.modify_text_and_track_ops(text, span_ops) # temporary solution for backward compatibility # past API use this method to add a single text in the datapack if ( self._data_store.num_entries("forte.data.ontology.top.TextPayload") == 0 and text_payload_index == 0 ): # Create a new TextPayload. tp = TextPayload(self) else: tp = self.get_payload_at(Modality.Text, text_payload_index) tp.cache = text tp.replace_back_operations = replace_back_operations tp.processed_original_spans = processed_original_spans tp.orig_text_len = orig_text_len
[docs] def set_audio( self, audio: np.ndarray, sample_rate: int, audio_payload_index: int = 0, ): r""" Set audio for AudioPayload at a specified index or add a new AudioPayload in the DataPack. Raises: ValueError: raised when the audio payload index is out of range. Args: audio: A numpy array storing the audio waveform. sample_rate: An integer specifying the sample rate. audio_payload_index: the zero-based index of the AudioPayload in this DataPack's AudioPayload entries. Defaults to 0, and it adds a new audio payload if there is no audio payload in the data pack. """ # temporary solution for backward compatibility # past API use this method to add a single audio in the datapack if ( self._data_store.num_entries("forte.data.ontology.top.AudioPayload") == 0 and audio_payload_index == 0 ): logging.warning( "audio_payload_index is set to zero," "and there is not existing AudioPayload" " in the DataPack." "An `AudioPayload` will be added into the DataPack." "However, we encourage user to" " use DataPack.add_audio() function instead." ) ap = AudioPayload(self) else: ap = self.get_payload_at(Modality.Audio, audio_payload_index) ap.cache = audio ap.sample_rate = sample_rate
[docs] def add_audio(self, audio): r""" Add an AudioPayload storing the audio given in the parameters. Args: audio: A numpy array storing the audio. """ ip = AudioPayload(self) ip.cache = audio
[docs] def add_image(self, image): r""" Add an ImagePayload storing the image given in the parameters. Args: image: A numpy array storing the image. """ ip = ImagePayload(self) ip.cache = image
[docs] def set_image( self, image, image_payload_index: int = 0, ): r"""Set the image payload of the :class:`~forte.data.data_pack.DataPack` object. Args: image: A numpy array storing the image. image_payload_index: the zero-based index of the ImagePayload in this DataPack's ImagePayload entries. Defaults to 0. """ # temporary solution for backward compatibility # past API use this method to add a single image in the datapack if ( self._data_store.num_entries("forte.data.ontology.top.ImagePayload") == 0 and image_payload_index == 0 ): ip = ImagePayload(self) logging.warning( "image_payload_index is set to zero," "and there is not existing ImagePayload" " in the DataPack." "An `ImagePayload` will be added into the DataPack." "However, we encourage user to" " use DataPack.add_image() function instead." ) else: ip = self.get_payload_at(Modality.Image, image_payload_index) ip.cache = image
[docs] def get_original_text(self, text_payload_index: int = 0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. Args: text_payload_index: the zero-based index of the TextPayload in this DataPack's entries. Defaults to 0. Returns: Original text after applying the `replace_back_operations` of :class:`~forte.data.data_pack.DataPack` object to the modified text """ tp = self.get_payload_at(Modality.Text, text_payload_index) original_text, _, _, _ = data_utils_io.modify_text_and_track_ops( tp.cache, tp.replace_back_operations ) return original_text
[docs] def get_original_span( self, input_processed_span: Span, align_mode: str = "relaxed" ): r"""Function to obtain span of the original text that aligns with the given span of the processed text. Args: input_processed_span: Span of the processed text for which the corresponding span of the original text is desired. align_mode: The strictness criteria for alignment in the ambiguous cases, that is, if a part of input_processed_span spans a part of the inserted span, then align_mode controls whether to use the span fully or ignore it completely according to the following possible values: - "strict" - do not allow ambiguous input, give ValueError. - "relaxed" - consider spans on both sides. - "forward" - align looking forward, that is, ignore the span towards the left, but consider the span towards the right. - "backward" - align looking backwards, that is, ignore the span towards the right, but consider the span towards the left. Returns: Span of the original text that aligns with input_processed_span Example: * Let o-up1, o-up2, ... and m-up1, m-up2, ... denote the unprocessed spans of the original and modified string respectively. Note that each o-up would have a corresponding m-up of the same size. * Let o-pr1, o-pr2, ... and m-pr1, m-pr2, ... denote the processed spans of the original and modified string respectively. Note that each o-p is modified to a corresponding m-pr that may be of a different size than o-pr. * Original string: <--o-up1--> <-o-pr1-> <----o-up2----> <----o-pr2----> <-o-up3-> * Modified string: <--m-up1--> <----m-pr1----> <----m-up2----> <-m-pr2-> <-m-up3-> * Note that `self.inverse_original_spans` that contains modified processed spans and their corresponding original spans, would look like - [(o-pr1, m-pr1), (o-pr2, m-pr2)] .. code-block:: python >> data_pack = DataPack() >> original_text = "He plays in the park" >> data_pack.set_text(original_text,\ >> lambda _: [(Span(0, 2), "She"))] >> data_pack.text "She plays in the park" >> input_processed_span = Span(0, len("She plays")) >> orig_span = data_pack.get_original_span(input_processed_span) >> data_pack.get_original_text()[orig_span.begin: orig_span.end] "He plays" """ assert align_mode in ["relaxed", "strict", "backward", "forward"] req_begin = input_processed_span.begin req_end = input_processed_span.end def get_original_index( input_index: int, is_begin_index: bool, mode: str ) -> int: r""" Args: input_index: begin or end index of the input span is_begin_index: if the index is the begin index of the input span or the end index of the input span mode: alignment mode Returns: Original index that aligns with input_index """ processed_original_spans = self.get_payload_at( Modality.Text, 0 ).processed_original_spans if len(processed_original_spans) == 0: return input_index len_processed_text = len(self.text) orig_index = None prev_end = 0 for ( inverse_span, original_span, ) in processed_original_spans: # check if the input_index lies between one of the unprocessed # spans if prev_end <= input_index < inverse_span.begin: increment = original_span.begin - inverse_span.begin orig_index = input_index + increment # check if the input_index lies between one of the processed # spans elif inverse_span.begin <= input_index < inverse_span.end: # look backward - backward shift of input_index if is_begin_index and mode in ["backward", "relaxed"]: orig_index = original_span.begin if not is_begin_index and mode == "backward": orig_index = original_span.begin - 1 # look forward - forward shift of input_index if is_begin_index and mode == "forward": orig_index = original_span.end if not is_begin_index and mode in ["forward", "relaxed"]: orig_index = original_span.end - 1 # break if the original index is populated if orig_index is not None: break prev_end = inverse_span.end if orig_index is None: # check if the input_index lies between the last unprocessed # span inverse_span, original_span = processed_original_spans[-1] if inverse_span.end <= input_index < len_processed_text: increment = original_span.end - inverse_span.end orig_index = input_index + increment else: # check if there input_index is not valid given the # alignment mode or lies outside the processed string raise ValueError( f"The input span either does not adhere " f"to the {align_mode} alignment mode or " f"lies outside to the processed string." ) return orig_index orig_begin = get_original_index(req_begin, True, align_mode) orig_end = get_original_index(req_end - 1, False, align_mode) + 1 return Span(orig_begin, orig_end)
[docs] @classmethod def deserialize( cls, data_source: Union[Path, str], serialize_method: str = "json", zip_pack: bool = False, ) -> "DataPack": """ Deserialize a Data Pack from a string. This internally calls the internal :meth:`~forte.data.base_pack.BasePack._deserialize()` function from :class:`~forte.data.base_pack.BasePack`. Args: data_source: The path storing data source. serialize_method: The method used to serialize the data, this should be the same as how serialization is done. The current options are `json`, `jsonpickle` and `pickle`. The default method is `json`. zip_pack: Boolean value indicating whether the input source is zipped. Returns: An data pack object deserialized from the string. """ return cls._deserialize(data_source, serialize_method, zip_pack) # type: ignore
def _add_entry(self, entry: Union[EntryType, int]) -> EntryType: r"""Force add an :class:`~forte.data.ontology.core.Entry` object to the :class:`~forte.data.data_pack.DataPack` object. Allow duplicate entries in a pack. Args: entry: An :class:`~forte.data.ontology.core.Entry` object to be added to the pack. Returns: The input entry itself """ return self.__add_entry_with_check(entry) def __add_entry_with_check(self, entry: Union[EntryType, int]) -> EntryType: r"""Internal method to add an :class:`~forte.data.ontology.core.Entry` object to the :class:`~forte.data.DataPack` object. Args: entry: An :class:`~forte.data.ontology.core.Entry` object to be added to the datapack. Returns: The input entry itself """ if isinstance(entry, int): # If entry is a TID, convert it to the class object. entry = self._entry_converter.get_entry_object(tid=entry, pack=self) # type: ignore if isinstance(entry, Annotation): begin, end = entry.begin, entry.end if begin < 0: raise ValueError( f"The begin {begin} is smaller than 0, this " f"is not a valid begin." ) if end > len(self.text): if len(self.text) == 0: raise ValueError( f"The end {end} of span is greater than the text " f"length {len(self.text)}, which is invalid. The text " f"length is 0, so it may be the case the you haven't " f"set text for the data pack. Please set the text " f"before calling `add_entry` on the annotations." ) else: pack_ref = entry.pack.pack_id raise ValueError( f"The end {end} of span is greater than the text " f"length {len(self.text)}, which is invalid. The " f"problematic entry is of type {entry.__class__} " f"at [{begin}:{end}], in pack {pack_ref}." ) # update the data pack index if needed self._index.update_basic_index([entry]) # type: ignore if self._index.link_index_on and isinstance(entry, Link): self._index.update_link_index([entry]) if self._index.group_index_on and isinstance(entry, Group): self._index.update_group_index([entry]) self._index.deactivate_coverage_index() self._pending_entries.pop(entry.tid) # type: ignore return entry # type: ignore
[docs] def delete_entry(self, entry: EntryType): r"""Delete an :class:`~forte.data.ontology.core.Entry` object from the :class:`~forte.data.data_pack.DataPack`. This find out the entry in the index and remove it from the index. Note that entries will only appear in the index if `add_entry` (or _add_entry_with_check) is called. Please note that deleting a entry do not guarantee the deletion of the related entries. Args: entry: An :class:`~forte.data.ontology.core.Entry` object to be deleted from the pack. """ super().delete_entry(entry=entry) self._index.deactivate_coverage_index()
@classmethod def validate_link(cls, entry: EntryType) -> bool: return isinstance(entry, Link) @classmethod def validate_group(cls, entry: EntryType) -> bool: return isinstance(entry, Group)
[docs] def get_data( self, context_type: Union[str, Type[Annotation], Type[AudioAnnotation]], request: Optional[DataRequest] = None, skip_k: int = 0, payload_index: int = 0, ) -> Iterator[Dict[str, Any]]: r"""Fetch data from entries in the data_pack of type `context_type`. Data includes `"span"`, annotation-specific default data fields and specific data fields by `"request"`. Annotation-specific data fields means: - `"text"` for ``Type[Annotation]`` - `"audio"` for ``Type[AudioAnnotation]`` Currently, we do not support Groups and Generics in the request. Example: .. code-block:: python requests = { base_ontology.Sentence: { "component": ["dummy"], "fields": ["speaker"], }, base_ontology.Token: ["pos", "sense"], base_ontology.EntityMention: { }, } pack.get_data(base_ontology.Sentence, requests) Args: context_type: The granularity of the data context, which could be any :class:`~forte.data.ontology.top.Annotation` or :class:`~forte.data.ontology.top.AudioAnnotation` type. Behaviors under different context_type varies: - str type will be converted into either :class:`~forte.data.ontology.top.Annotation` type or :class:`~forte.data.ontology.top.AudioAnnotation` type. - ``Type[Annotation]``: the default data field for getting context data is :attr:`text`. This function iterates :attr:`all_annotations` to search target entry data. - ``Type[AudioAnnotation]``: the default data field for getting context data is :attr:`audio` which stores audio data in numpy arrays. This function iterates :attr:`all_audio_annotations` to search target entry data. request: The entry types and fields User wants to request. The keys of the requests dict are the required entry types and the value should be either: - a list of field names or - a dict which accepts three keys: `"fields"`, `"component"`, and `"unit"`. - By setting `"fields"` (list), users specify the requested fields of the entry. If "fields" is not specified, only the default fields will be returned. - By setting `"component"` (list), users can specify the components by which the entries are generated. If `"component"` is not specified, will return entries generated by all components. - By setting `"unit"` (string), users can specify a unit by which the annotations are indexed. Note that for all annotation types, `"span"` fields and annotation-specific data fields are returned by default. For all link types, `"child"` and `"parent"` fields are returned by default. skip_k: Will skip the first `skip_k` instances and generate data from the (`offset` + 1)th instance. payload_index: the zero-based index of the Payload in this DataPack's Payload entries of a particular modality. The modality is dependent on ``context_type``. Defaults to 0. Returns: A data generator, which generates one piece of data (a dict containing the required entries, fields, and context). """ context_type_: Union[Type[Annotation], Type[AudioAnnotation]] if isinstance(context_type, str): context_type_ = get_class(context_type) if not issubclass(context_type_, Entry): raise ValueError( f"The provided `context_type` [{context_type_}] " f"is not a subclass to the" f"`forte.data.ontology.top.Annotation` class" ) else: context_type_ = context_type annotation_types: Dict[ Union[Type[Annotation], Type[AudioAnnotation]], Union[Dict, List] ] = {} link_types: Dict[Type[Link], Union[Dict, List]] = {} group_types: Dict[Type[Group], Union[Dict, List]] = {} generics_types: Dict[Type[Generics], Union[Dict, List]] = {} audio_annotation_types: Dict[ Type[AudioAnnotation], Union[Dict, List] ] = {} if request is not None: for key_, value in request.items(): key = as_entry_type(key_) if issubclass(key, Annotation): annotation_types[key] = value elif issubclass(key, Link): link_types[key] = value elif issubclass(key, Group): group_types[key] = value elif issubclass(key, Generics): generics_types[key] = value elif issubclass(key, AudioAnnotation): audio_annotation_types[key] = value context_args = annotation_types.get(context_type_) context_components, _, context_fields = self._parse_request_args( context_type_, context_args ) valid_context_ids: Set[int] = self._index.query_by_type_subtype( context_type_ ) if context_components: valid_component_id: Set[int] = set() for component in context_components: valid_component_id |= self.get_ids_by_creator(component) valid_context_ids &= valid_component_id def get_annotation_list( c_type: Union[Type[Annotation], Type[AudioAnnotation]] ): r"""Get an annotation list of a given context type. Args: c_type: The granularity of the data context, which could be any :class:`~forte.data.ontology.top.Annotation` type. Raises: NotImplementedError: raised when the given context type is not implemented. Returns: List(Union[Annotation, AudioAnnotation]): a list of annotations which is a copy of `self.annotations` and it enables modifications of `self.annotations` while iterating through its copy. """ if issubclass(c_type, Annotation): return list(self.annotations) elif issubclass(c_type, AudioAnnotation): return list(self.audio_annotations) else: raise NotImplementedError( f"Context type is set to {c_type}," " but currently we only support" " [Annotation, AudioAnnotation]." ) def get_context_data( c_type: Union[Type[Annotation], Type[AudioAnnotation]], context: Union[Annotation, AudioAnnotation], payload_index: int, ): r""" Get context-specific data of a given context type and context. Args: c_type: The granularity of the data context, which could be any :class:`~forte.data.ontology.top.Annotation` type. context: context that contains data to be extracted. payload_index: the zero-based index of the Payload in this DataPack's Payload entries of a particular modality. The modality is dependent on ``c_type``. Raises: NotImplementedError: raised when the given context type is not implemented. Returns: str: context data. """ if issubclass(c_type, Annotation): return self.get_payload_data_at(Modality.Text, payload_index)[ context.begin : context.end ] elif issubclass(c_type, AudioAnnotation): return self.get_payload_data_at(Modality.Audio, payload_index)[ context.begin : context.end ] else: raise NotImplementedError( f"Context type is set to {context_type}" "but currently we only support" "[Annotation, AudioAnnotation]" ) skipped = 0 for context in get_annotation_list(context_type_): if context.tid not in valid_context_ids or not isinstance( context, context_type_ ): continue if skipped < skip_k: skipped += 1 continue data: Dict[str, Any] = {} data["context"] = get_context_data( context_type_, context, payload_index ) data["offset"] = context.begin for field in context_fields: data[field] = getattr(context, field) if annotation_types: for a_type, a_args in annotation_types.items(): if issubclass(a_type, context_type_): continue if a_type.__name__ in data: raise KeyError( f"Requesting two types of entries with the " f"same class name {a_type.__name__} at the " f"same time is not allowed" ) data[ a_type.__name__ ] = self._generate_annotation_entry_data( a_type, a_args, data, context ) if audio_annotation_types: for a_type, a_args in audio_annotation_types.items(): if a_type.__name__ in data: raise KeyError( f"Requesting two types of entries with the " f"same class name {a_type.__name__} at the " f"same time is not allowed" ) data[ a_type.__name__ ] = self._generate_annotation_entry_data( a_type, a_args, data, context ) if link_types: for l_type, l_args in link_types.items(): if l_type.__name__ in data: raise KeyError( f"Requesting two types of entries with the " f"same class name {l_type.__name__} at the " f"same time is not allowed" ) data[l_type.__name__] = self._generate_link_entry_data( l_type, l_args, data, context ) # TODO: Getting Group based on range is not done yet. if group_types: raise NotImplementedError( "Querying groups based on ranges is " "currently not supported." ) if generics_types: raise NotImplementedError( "Querying generic types based on ranges is " "currently not supported." ) yield data
def _parse_request_args(self, a_type, a_args): # request which fields generated by which component components = None unit = None fields = set() if isinstance(a_args, dict): components = a_args.get("component") # pylint: disable=isinstance-second-argument-not-valid-type # TODO: until fix: https://github.com/PyCQA/pylint/issues/3507 if components is not None and not isinstance(components, Iterable): raise TypeError( "Invalid request format for 'components'. " "The value of 'components' should be of an iterable type." ) unit = a_args.get("unit") if unit is not None and not isinstance(unit, str): raise TypeError( "Invalid request format for 'unit'. " "The value of 'unit' should be a string." ) a_args = a_args.get("fields", set()) # pylint: disable=isinstance-second-argument-not-valid-type # TODO: disable until fix: https://github.com/PyCQA/pylint/issues/3507 if isinstance(a_args, Iterable): fields = set(a_args) elif a_args is not None: raise TypeError( f"Invalid request format for '{a_type}'. " f"The request should be of an iterable type or a dict." ) fields.add("tid") return components, unit, fields def _generate_annotation_entry_data( self, a_type: Union[Type[Annotation], Type[AudioAnnotation]], a_args: Union[Dict, Iterable], data: Dict, cont: Optional[Annotation], ) -> Dict: components, unit, fields = self._parse_request_args(a_type, a_args) a_dict: Dict[str, Any] = {} a_dict["span"] = [] # For AudioAnnotation, since the data is single numpy array # we don't initialize an empty list for a_dict["audio"] if issubclass(a_type, Annotation): a_dict["text"] = [] elif issubclass(a_type, AudioAnnotation): a_dict["audio"] = [] for field in fields: a_dict[field] = [] unit_begin = 0 if unit is not None: if unit not in data: raise KeyError( f"{unit} is missing in data. You need to " f"request {unit} before {a_type}." ) a_dict["unit_span"] = [] cont_begin = cont.begin if cont else 0 annotation: Union[Type[Annotation], Type[AudioAnnotation]] for annotation in self.get(a_type, cont, components): # type: ignore # we provide span, text (and also tid) by default a_dict["span"].append((annotation.begin, annotation.end)) if isinstance(annotation, Annotation): a_dict["text"].append(annotation.text) elif isinstance(annotation, AudioAnnotation): a_dict["audio"].append(annotation.audio) else: raise NotImplementedError( f"Annotation is set to {annotation}" "but currently we only support" "instances of [Annotation, " "AudioAnnotation] and their subclass." ) for field in fields: if field in ("span", "text", "audio"): continue if field == "context_span": a_dict[field].append( ( annotation.begin - cont_begin, annotation.end - cont_begin, ) ) continue a_dict[field].append(getattr(annotation, field)) if unit is not None: while not self._index.in_span( data[unit]["tid"][unit_begin], annotation.span, ): unit_begin += 1 unit_span_begin = unit_begin unit_span_end = unit_span_begin + 1 while self._index.in_span( data[unit]["tid"][unit_span_end], annotation.span, ): unit_span_end += 1 a_dict["unit_span"].append((unit_span_begin, unit_span_end)) for key, value in a_dict.items(): a_dict[key] = np.array(value) return a_dict def _generate_link_entry_data( self, a_type: Type[Link], a_args: Union[Dict, Iterable], data: Dict, cont: Optional[Annotation], ) -> Dict: components, unit, fields = self._parse_request_args(a_type, a_args) if unit is not None: raise ValueError(f"Link entries cannot be indexed by {unit}.") a_dict: Dict[str, Any] = {} for field in fields: a_dict[field] = [] a_dict["parent"] = [] a_dict["child"] = [] link: Link for link in self.get(a_type, cont, components): parent_type = link.ParentType.__name__ child_type = link.ChildType.__name__ if parent_type not in data: raise KeyError( f"The Parent entry of {a_type} is not requested." f" You should also request {parent_type} with " f"{a_type}" ) if child_type not in data: raise KeyError( f"The child entry of {a_type} is not requested." f" You should also request {child_type} with " f"{a_type}" ) a_dict["parent"].append( np.where(data[parent_type]["tid"] == link.parent)[0][0] ) a_dict["child"].append( np.where(data[child_type]["tid"] == link.child)[0][0] ) for field in fields: if field in ("parent", "child"): continue a_dict[field].append(getattr(link, field)) for key, value in a_dict.items(): a_dict[key] = np.array(value) return a_dict
[docs] def build_coverage_for( self, context_type: Type[Union[Annotation, AudioAnnotation]], covered_type: Type[EntryType], ): """ User can call this function to build coverage index for specific types. The index provide a in-memory mapping from entries of `context_type` to the entries "covered" by it. See :class:`forte.data.data_pack.DataIndex` for more details. Args: context_type: The context/covering type. covered_type: The entry to find under the context type. """ if self._index.coverage_index(context_type, covered_type) is None: self._index.build_coverage_index(self, context_type, covered_type)
[docs] def covers( self, context_entry: Union[Annotation, AudioAnnotation], covered_entry: EntryType, ) -> bool: """ Check if the `covered_entry` is covered (in span) of the `context_type`. See :meth:`~forte.data.data_pack.DataIndex.in_span` and :meth:`~forte.data.data_pack.DataIndex.in_audio_span` for the definition of `in span`. Args: context_entry: The context entry. covered_entry: The entry to be checked on whether it is in span of the context entry. Returns (bool): True if in span. """ return covered_entry.tid in self._index.get_covered( self, context_entry, covered_entry.__class__ )
[docs] def get( # type: ignore self, entry_type: Union[str, Type[EntryType]], range_annotation: Optional[ Union[Annotation, AudioAnnotation, int] ] = None, components: Optional[Union[str, Iterable[str]]] = None, include_sub_type: bool = True, get_raw: bool = False, ) -> Iterable[EntryType]: r"""This function is used to get data from a data pack with various methods. Depending on the provided arguments, the function will perform several different filtering of the returned data. The ``entry_type`` is mandatory, where all the entries matching this type will be returned. The sub-types of the provided entry type will be also returned if ``include_sub_type`` is set to True (which is the default behavior). The ``range_annotation`` controls the search area of the sub-types. An entry `E` will be returned if :meth:`~forte.data.data_pack.DataIndex.in_span` or :meth:`~forte.data.data_pack.DataIndex.in_audio_span` returns True. If this function is called frequently with queries related to the ``range_annotation``, please consider to build the coverage index regarding the related entry types. User can call :meth:`build_coverage_for(context_type, covered_type)` in order to build a mapping between a pair of entry types and target entries that are covered in ranges specified by outer entries. The ``components`` list will filter the results by the `component` (i.e the creator of the entry). If ``components`` is provided, only the entries created by one of the ``components`` will be returned. Example: .. code-block:: python # Iterate through all the sentences in the pack. for sentence in input_pack.get(Sentence): # Take all tokens from a sentence created by NLTKTokenizer. token_entries = input_pack.get( entry_type=Token, range_annotation=sentence, component='NLTKTokenizer') ... In the above code snippet, we get entries of type ``Token`` within each ``sentence`` which were generated by ``NLTKTokenizer``. You can consider build coverage index between ``Token`` and ``Sentence`` if this snippet is frequently used: .. code-block:: python # Build coverage index between `Token` and `Sentence` input_pack.build_coverage_for( context_type=Sentence covered_type=Token ) After building the index from the snippet above, you will be able to retrieve the tokens covered by sentence much faster. Args: entry_type: The type of entries requested. range_annotation: The range of entries requested. This value can be given by an entry object or the ``tid`` of that entry. If `None`, will return valid entries in the range of whole data pack. components: The component (creator) generating the entries requested. If `None`, will return valid entries generated by any component. include_sub_type: whether to consider the sub types of the provided entry type. Default `True`. get_raw: boolean to indicate if the entry should be returned in its primitive form as opposed to an object. False by default Yields: Each `Entry` found using this method. """ # Convert entry_type to str entry_type_ = ( get_full_module_name(entry_type) if not isinstance(entry_type, str) else entry_type ) # pylint: disable=protected-access # Check if entry_type_ represents a valid entry if not self._data_store._is_subclass(entry_type_, Entry): raise ValueError( f"The specified entry type [{entry_type}] " f"does not correspond to a " f"`forte.data.ontology.core.Entry` class" ) def require_annotations(entry_class=Annotation) -> bool: if self._data_store._is_subclass(entry_type_, entry_class): return True curr_class: Type[EntryType] = as_entry_type(entry_type_) if issubclass(curr_class, Link): return issubclass( curr_class.ParentType, entry_class ) and issubclass(curr_class.ChildType, entry_class) if issubclass(curr_class, Group): return issubclass(curr_class.MemberType, entry_class) return False # If we don't have any annotations but the items to check requires them, # then we simply yield from an empty list. # changed form using len(annotations) to num_annotations directly for # improving the performance. if ( self.num_annotations == 0 and isinstance(range_annotation, Annotation) and require_annotations(Annotation) ) or ( self.num_audio_annotations == 0 and isinstance(range_annotation, AudioAnnotation) and require_annotations(AudioAnnotation) ): yield from [] return # If the ``entry_type`` and `range_annotation` are for different types of # payload, then we yield from an empty list with a warning. if ( require_annotations(Annotation) and isinstance(range_annotation, AudioAnnotation) ) or ( require_annotations(AudioAnnotation) and isinstance(range_annotation, Annotation) ): logger.warning( "Incompatible combination of ``entry_type`` and " "`range_annotation` found in the input of `DataPack.get()`" " method. An empty iterator will be returned when inputs " "contain multi-media entries. Please double check the input " "arguments and make sure they are associated with the same type" " of payload (i.e., either text or audio)." ) yield from [] return # If range_annotation is specified, we record its begin and # end index range_begin: int range_end: int if range_annotation is not None: if isinstance(range_annotation, AnnotationLikeEntries): range_begin = range_annotation.begin range_end = range_annotation.end else: # range_annotation is given by the tid of the entry it # represents range_raw = self._data_store.transform_data_store_entry( self.get_entry_raw(range_annotation) ) range_begin = range_raw[BEGIN_ATTR_NAME] range_end = range_raw[END_ATTR_NAME] try: for entry_data in self._data_store.get( type_name=entry_type_, include_sub_type=include_sub_type, range_span=range_annotation # type: ignore and (range_begin, range_end), ): # Filter by components if components is not None: if not self.is_created_by( entry_data[TID_INDEX], components ): continue entry: Union[Entry, Dict[str, Any]] if get_raw: entry = self._data_store.transform_data_store_entry( entry_data ) else: entry = self.get_entry(tid=entry_data[TID_INDEX]) # Filter out incompatible audio span comparison for Links and Groups if ( self._data_store._is_subclass( entry_type_, (Link, Group) ) and isinstance(range_annotation, AudioAnnotation) and not self._index.in_audio_span( entry, range_annotation.span ) ): continue yield entry # type: ignore except ValueError: # type_name does not exist in DataStore yield from []
[docs] def update(self, datapack: "DataPack"): r"""Update the attributes and properties of the current DataPack with another DataPack. Args: datapack: A reference datapack to update """ # TODO: Not recommended to directly update __dict__. Should find a # better solution. self.__dict__.update(datapack.__dict__)
def _save_entry_to_data_store(self, entry: Entry): r"""Save an existing entry object into DataStore""" self._entry_converter.save_entry_object(entry=entry, pack=self) if isinstance(entry, Payload): if entry.modality == Modality.Text: self.text_payloads.append(entry) elif entry.modality == Modality.Audio: self.audio_payloads.append(entry) elif entry.modality == Modality.Image: self.image_payloads.append(entry) def _get_entry_from_data_store(self, tid: int) -> Entry[Any]: r"""Generate a class object from entry data in DataStore""" return self._entry_converter.get_entry_object(tid=tid, pack=self)
[docs]class DataIndex(BaseIndex[Entry]): r"""A set of indexes used in :class:`~forte.data.data_pack.DataPack`, note that this class is used by the `DataPack` internally. #. :attr:`entry_index`, the index from each ``tid`` to the corresponding entry #. :attr:`type_index`, the index from each type to the entries of that type #. :attr:`component_index`, the index from each component to the entries generated by that component #. :attr:`link_index`, the index from child (:attr:`link_index["child_index"]`)and parent (:attr:`link_index["parent_index"]`) nodes to links #. :attr:`group_index`, the index from group members to groups. #. :attr:`_coverage_index`, the index that maps from an annotation to the entries it covers. :attr:`_coverage_index` is a dict of dict, where the key is a tuple of the outer entry type and the inner entry type. The outer entry type should be an annotation type. The value is a dict, where the key is the ``tid`` of the outer entry, and the value is a set of ``tid`` that are covered by the outer entry. We say an Annotation A covers an entry E if one of the following condition is met: 1. E is of Annotation type, and that E.begin >= A.begin, E.end <= E.end 2. E is of Link type, and both E's parent and child node are Annotation that are covered by A. """ def __init__(self): super().__init__() self._coverage_index: Dict[ Tuple[Type[Union[Annotation, AudioAnnotation]], Type[Entry]], Dict[int, Set[int]], ] = {} self._coverage_index_valid = True def remove_entry(self, entry: Entry): super().remove_entry(entry) self.deactivate_coverage_index() @property def coverage_index_is_valid(self): return self._coverage_index_valid def activate_coverage_index(self): self._coverage_index_valid = True def deactivate_coverage_index(self): self._coverage_index_valid = False
[docs] def coverage_index( self, outer_type: Type[Union[Annotation, AudioAnnotation]], inner_type: Type[EntryType], ) -> Optional[Dict[int, Set[int]]]: r"""Get the coverage index from ``outer_type`` to ``inner_type``. Args: outer_type: an annotation or `AudioAnnotation` type. inner_type: an entry type. Returns: If the coverage index does not exist, return `None`. Otherwise, return a dict. """ if not self.coverage_index_is_valid: return None return self._coverage_index.get((outer_type, inner_type))
[docs] def get_covered( self, data_pack: DataPack, context_annotation: Union[Annotation, AudioAnnotation], inner_type: Type[EntryType], ) -> Set[int]: """ Get the entries covered by a certain context annotation Args: data_pack: The data pack to search for. context_annotation: The context annotation to search in. inner_type: The inner type to be searched for. Returns: Entry ID of type `inner_type` that is covered by `context_annotation`. """ context_type = context_annotation.__class__ if self.coverage_index(context_type, inner_type) is None: self.build_coverage_index(data_pack, context_type, inner_type) assert self._coverage_index is not None return self._coverage_index.get((context_type, inner_type), {}).get( context_annotation.tid, set() )
[docs] def build_coverage_index( self, data_pack: DataPack, outer_type: Type[Union[Annotation, AudioAnnotation]], inner_type: Type[EntryType], ): r"""Build the coverage index from ``outer_type`` to ``inner_type``. Args: data_pack: The data pack to build coverage for. outer_type: an annotation or `AudioAnnotation` type. inner_type: an entry type, can be Annotation, Link, Group, `AudioAnnotation`. """ if not issubclass( inner_type, (Annotation, Link, Group, AudioAnnotation) ): raise ValueError(f"Do not support coverage index for {inner_type}.") if not self.coverage_index_is_valid: self._coverage_index = {} # prevent the index from being used during construction self.deactivate_coverage_index() # TODO: tests and documentations for the edge cases are missing. i.e. we # are not clear about what would happen if the covered annotation # is the same as the covering annotation, or if their spans are the # same. self._coverage_index[(outer_type, inner_type)] = {} for range_annotation in data_pack.get_entries_of(outer_type): if isinstance(range_annotation, (Annotation, AudioAnnotation)): entries = data_pack.get(inner_type, range_annotation) entry_ids = {e.tid for e in entries} self._coverage_index[(outer_type, inner_type)][ range_annotation.tid ] = entry_ids self.activate_coverage_index()
[docs] def have_overlap( self, entry1: Union[Annotation, int, AudioAnnotation], entry2: Union[Annotation, int, AudioAnnotation], ) -> bool: r"""Check whether the two annotations have overlap in span. Args: entry1: An :class:`Annotation` or :class:`AudioAnnotation` object to be checked, or the ``tid`` of the Annotation. entry2: Another :class:`Annotation` or :class:`AudioAnnotation` object to be checked, or the ``tid`` of the Annotation. """ entry1_: Union[Annotation, AudioAnnotation] = ( self._entry_index[entry1] # type: ignore if isinstance(entry1, (int, np.integer)) else entry1 ) entry2_: Union[Annotation, AudioAnnotation] = ( self._entry_index[entry2] # type: ignore if isinstance(entry2, (int, np.integer)) else entry2 ) if not isinstance(entry1_, (Annotation, AudioAnnotation)): raise TypeError( f"'entry1' should be an instance of Annotation or `AudioAnnotation`," f" but get {type(entry1)}" ) if not isinstance(entry2_, (Annotation, AudioAnnotation)): raise TypeError( f"'entry2' should be an instance of Annotation or `AudioAnnotation`," f" but get {type(entry2)}" ) if ( isinstance(entry1_, Annotation) and isinstance(entry2_, AudioAnnotation) ) or ( isinstance(entry1_, AudioAnnotation) and isinstance(entry2_, Annotation) ): raise TypeError( "'entry1' and 'entry2' should be the same type of entry, " f"but get type(entry1)={type(entry1_)}, " f"typr(entry2)={type(entry2_)}" ) return not ( entry1_.begin >= entry2_.end or entry1_.end <= entry2_.begin )
[docs] def in_span(self, inner_entry: Union[int, Entry], span: Span) -> bool: r"""Check whether the ``inner entry`` is within the given ``span``. The criterion are as followed: Annotation entries: they are considered in a span if the begin is not smaller than `span.begin` and the end is not larger than `span.end`. Link entries: if the parent and child of the links are both `Annotation` type, this link will be considered in span if both parent and child are :meth:`~forte.data.data_pack.DataIndex.in_span` of the provided `span`. If either the parent and the child is not of type `Annotation`, this function will always return `False`. Group entries: if the child type of the group is `Annotation` type, then the group will be considered in span if all the elements are :meth:`~forte.data.data_pack.DataIndex.in_span` of the provided `span`. If the child type is not `Annotation` type, this function will always return `False`. Other entries (i.e Generics and `AudioAnnotation`): they will not be considered :meth:`~forte.data.data_pack.DataIndex.in_span` of any spans. The function will always return `False`. Args: inner_entry: The inner entry object to be checked whether it is within ``span``. The argument can be the entry id or the entry object itself. span: A :class:`~forte.data.span.Span` object to be checked. We will check whether the ``inner_entry`` is within this span. Returns: True if the `inner_entry` is considered to be in span of the provided span. """ # The reason of this check is that the get_data method will use numpy # integers. This might create problems when other unexpected integers # are used. if isinstance(inner_entry, (int, np.integer)): inner_entry = self._entry_index[inner_entry] inner_begin = -1 inner_end = -1 if isinstance(inner_entry, Annotation): inner_begin = inner_entry.begin inner_end = inner_entry.end elif isinstance(inner_entry, Link): if not issubclass(inner_entry.ParentType, Annotation): return False if not issubclass(inner_entry.ChildType, Annotation): return False child = inner_entry.get_child() parent = inner_entry.get_parent() if not isinstance(child, Annotation) or not isinstance( parent, Annotation ): # Cannot check in_span for non-annotations. return False child_: Annotation = child parent_: Annotation = parent inner_begin = min(child_.begin, parent_.begin) inner_end = max(child_.end, parent_.end) elif isinstance(inner_entry, Group): if not issubclass(inner_entry.MemberType, Annotation): return False for mem in inner_entry.get_members(): mem_: Annotation = mem # type: ignore if inner_begin == -1: inner_begin = mem_.begin inner_begin = min(inner_begin, mem_.begin) inner_end = max(inner_end, mem_.end) else: # Generics, AudioAnnotation, or other user defined types will not # be check here. return False return inner_begin >= span.begin and inner_end <= span.end
[docs] def in_audio_span(self, inner_entry: Union[int, Entry], span: Span) -> bool: r"""Check whether the ``inner entry`` is within the given audio span. This method is identical to :meth::meth:`~forte.data.data_pack.DataIndex.in_span` except that it operates on the audio payload of datapack. The criterion are as followed: `AudioAnnotation` entries: they are considered in a span if the begin is not smaller than `span.begin` and the end is not larger than `span.end`. Link entries: if the parent and child of the links are both `AudioAnnotation` type, this link will be considered in span if both parent and child are :meth:`~forte.data.data_pack.DataIndex.in_span` of the provided `span`. If either the parent and the child is not of type `AudioAnnotation`, this function will always return `False`. Group entries: if the child type of the group is `AudioAnnotation` type, then the group will be considered in span if all the elements are :meth:`~forte.data.data_pack.DataIndex.in_span` of the provided `span`. If the child type is not `AudioAnnotation` type, this function will always return `False`. Other entries (i.e Generics and Annotation): they will not be considered :meth:`~forte.data.data_pack.DataIndex.in_span` of any spans. The function will always return `False`. Args: inner_entry: The inner entry object to be checked whether it is within ``span``. The argument can be the entry id or the entry object itself. span: A :class:`~forte.data.span.Span` object to be checked. We will check whether the ``inner_entry`` is within this span. Returns: True if the `inner_entry` is considered to be in span of the provided span. """ # The reason of this check is that the get_data method will use numpy # integers. This might create problems when other unexpected integers # are used. if isinstance(inner_entry, (int, np.integer)): inner_entry = self._entry_index[inner_entry] inner_begin = -1 inner_end = -1 if isinstance(inner_entry, AudioAnnotation): inner_begin = inner_entry.begin inner_end = inner_entry.end elif isinstance(inner_entry, Link): if not ( issubclass(inner_entry.ParentType, AudioAnnotation) and issubclass(inner_entry.ChildType, AudioAnnotation) ): return False child = inner_entry.get_child() parent = inner_entry.get_parent() if not isinstance(child, AudioAnnotation) or not isinstance( parent, AudioAnnotation ): # Cannot check in_span for non-AudioAnnotation. return False child_: AudioAnnotation = child parent_: AudioAnnotation = parent inner_begin = min(child_.begin, parent_.begin) inner_end = max(child_.end, parent_.end) elif isinstance(inner_entry, Group): if not issubclass(inner_entry.MemberType, AudioAnnotation): return False for mem in inner_entry.get_members(): mem_: AudioAnnotation = mem # type: ignore if inner_begin == -1: inner_begin = mem_.begin inner_begin = min(inner_begin, mem_.begin) inner_end = max(inner_end, mem_.end) else: # Generics, Annotation, or other user defined types will not be # check here. return False return inner_begin >= span.begin and inner_end <= span.end