Source code for forte.processors.misc.vocabulary_processor

# Copyright 2019 The Forte Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from abc import ABC
from typing import Counter as CounterType, Dict, List, Optional
from asyml_utilities.special_tokens import SpecialTokens
from forte.processors.base import PackProcessor

# Public API of this module: the classes defined below.
__all__ = [
    "Alphabet",
    "VocabularyProcessor",
]

class Alphabet:
    """
    A two-way mapping between instances (e.g. tokens) and integer indices,
    pre-populated with the reserved special tokens (PAD, BOS, EOS, UNK).

    Args:
        name: The name of the alphabet. It is used as the default file stem
            by `save` and `load`.
        word_cnt: Optional counter of words. When provided, every word in it
            is added to the vocabulary and the alphabet is then closed
            (`keep_growing` becomes False).
        keep_growing: If True, new instances not found during `get_index`
            will be added to the vocabulary.
        ignore_case_in_query: If it's True, Alphabet will try to query the
            lower-cased input from its vocabulary if it cannot find the
            input in its keys. Only consulted when the alphabet is closed.
        other_embeddings: Optional dictionary stored on the instance as
            `other_embeddings`; this class does not otherwise use it.
    """

    def __init__(
        self,
        name,
        word_cnt: Optional[CounterType[str]] = None,
        keep_growing: bool = True,
        ignore_case_in_query: bool = True,
        other_embeddings: Optional[Dict] = None,
    ):
        self.__name = name
        self.reserved_tokens = SpecialTokens

        # instance -> index, and index -> instance (list position).
        self.instance2index: Dict = {}
        self.instances: List = []

        # Reserved tokens always occupy the first indices, in this order.
        for sp in [
            self.reserved_tokens.PAD,
            self.reserved_tokens.BOS,
            self.reserved_tokens.EOS,
            self.reserved_tokens.UNK,
        ]:
            self.instance2index[sp] = len(self.instance2index)
            self.instances.append(sp)

        self.pad_id = self.instance2index[self.reserved_tokens.PAD]  # 0
        self.bos_id = self.instance2index[self.reserved_tokens.BOS]  # 1
        self.eos_id = self.instance2index[self.reserved_tokens.EOS]  # 2
        self.unk_id = self.instance2index[self.reserved_tokens.UNK]  # 3

        self.keep_growing = keep_growing
        self.ignore_case_in_query = ignore_case_in_query
        self.other_embeddings = other_embeddings

        if word_cnt is not None:
            for word in word_cnt:
                self.add(word)
            # A vocabulary built from counts is frozen afterwards.
            self.close()

    def add(self, instance):
        """Add `instance` with the next free index; no-op if already known."""
        if instance not in self.instance2index:
            self.instance2index[instance] = len(self.instance2index)
            self.instances.append(instance)

    def get_index(self, instance):
        """
        Args:
            instance: the input token

        Returns:
            the index of the queried token in the dictionary. `None` maps
            to the PAD index. An unknown token is added (and its new index
            returned) while the alphabet is growing; otherwise the
            lower-cased token is tried when `ignore_case_in_query` is set,
            and the UNK index is returned as the last resort.
        """
        if instance is None:
            return self.instance2index[self.reserved_tokens.PAD]

        try:
            return self.instance2index[instance]
        except KeyError:
            pass

        if self.keep_growing:
            self.add(instance)
            return self.instance2index[instance]

        unk_index = self.instance2index[self.reserved_tokens.UNK]
        if not self.ignore_case_in_query:
            return unk_index
        try:
            return self.instance2index[instance.lower()]
        except (KeyError, AttributeError):
            # AttributeError: the instance has no `lower()` (e.g. it is not
            # a string); treat it like any other unknown instance instead
            # of crashing the lookup.
            return unk_index

    def get_instance(self, index):
        """
        Return the instance stored at `index`.

        Negative indices follow normal list indexing of `self.instances`.

        Raises:
            IndexError: if `index` is out of range.
        """
        try:
            return self.instances[index]
        except IndexError as e:
            raise IndexError("unknown index: %d" % index) from e

    def size(self):
        """Return the number of instances, reserved tokens included."""
        return len(self.instances)

    def items(self):
        """Return the `(instance, index)` view of the vocabulary."""
        return self.instance2index.items()

    def close(self):
        """Stop `get_index` from adding unknown instances."""
        self.keep_growing = False

    def open(self):
        """Let `get_index` add unknown instances again."""
        self.keep_growing = True

    def get_content(self):
        """Return the serializable state (the live objects, not copies)."""
        return {
            "instance2index": self.instance2index,
            "instances": self.instances,
        }

    def __from_json(self, data):
        # Inverse of `get_content`: adopt both records from parsed JSON.
        self.instances = data["instances"]
        self.instance2index = data["instance2index"]

    def save(self, output_directory, name=None):
        """
        Save both alphabet records to the given directory.

        The records are written as JSON to `<output_directory>/<name>.json`.

        Args:
            output_directory: Directory to save model and weights. It is
                created (including parents) if it does not exist.
            name: The alphabet saving name, optional. Defaults to the name
                given at construction.
        """
        saving_name = name if name else self.__name
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_directory, exist_ok=True)
        with open(
            os.path.join(output_directory, saving_name + ".json"),
            "w",
            encoding="utf-8",
        ) as out:
            json.dump(
                self.get_content(),
                out,
                indent=4,
            )

    def load(self, input_directory, name=None):
        """
        Load the alphabet records from `<input_directory>/<name>.json` and
        close the alphabet.

        Args:
            input_directory: Directory containing the saved JSON file.
            name: The alphabet loading name, optional. Defaults to the name
                given at construction.
        """
        loading_name = name if name else self.__name
        with open(
            os.path.join(input_directory, loading_name + ".json"),
            encoding="utf-8",
        ) as f:
            self.__from_json(json.load(f))
        self.keep_growing = False
class VocabularyProcessor(PackProcessor, ABC):
    """
    Build vocabulary from the input DataPack, write the result into the
    shared resources.
    """

    def __init__(self) -> None:
        super().__init__()
        # Starts at 0; presumably a minimum-count threshold for keeping a
        # word -- NOTE(review): confirm against the subclasses that read it.
        self.min_frequency: int = 0