# Source code for concepts.benchmark.common.vocab

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : vocab.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 02/26/2023
#
# This file is part of Project Concepts.
# Distributed under terms of the MIT license.

from typing import Optional, Union, Iterable, Sequence, List

import torch

import jacinle.io as io
import jaclearn.embedding.constant as const
from jacinle.utils.tqdm import tqdm

__all__ = ['Vocab', 'gen_vocab', 'gen_vocab_from_words']


[docs] class Vocab(object): """A simple vocabulary class."""
[docs] def __init__(self, word2idx=None): """Initialize the vocabulary. Args: word2idx: a dictionary mapping words to indices. If not specified, the vocabulary will be empty. """ self.word2idx = word2idx if word2idx is not None else dict() self._idx2word = None
[docs] @classmethod def from_json(cls, json_file: str) -> 'Vocab': """Load a vocabulary from a json file.""" return cls(io.load_json(json_file))
[docs] @classmethod def from_dataset(cls, dataset, keys: Sequence[str], extra_words: Optional[Sequence[str]] = None, single_word: bool = False) -> 'Vocab': """Generate a vocabulary from a dataset. Args: dataset: the dataset to generate the vocabulary from. keys: the keys to retrieve from the dataset items. extra_words: additional words to add to the vocabulary. single_word: whether to treat the values of the keys as single words. """ return gen_vocab(dataset, keys, extra_words=extra_words, cls=cls, single_word=single_word)
[docs] @classmethod def from_list(cls, dataset: list, extra_words: Optional[Sequence[str]] = None, single_word: bool = False) -> 'Vocab': """Generate a vocabulary from a list of strings. Args: dataset: the list of strings to generate the vocabulary from. extra_words: additional words to add to the vocabulary. single_word: whether to treat the values of the keys as single words. """ return gen_vocab(dataset, extra_words=extra_words, cls=cls, single_word=single_word)
[docs] def dump_json(self, json_file: str): """Dump the vocabulary to a json file.""" io.dump_json(json_file, self.word2idx)
[docs] def check_json_consistency(self, json_file: str) -> bool: """Check whether the vocabulary is consistent with a json file.""" rhs = io.load_json(json_file) for k, v in self.word2idx.items(): if not (k in rhs and rhs[k] == v): return False return True
[docs] def words(self) -> Iterable[str]: return self.word2idx.keys()
@property def idx2word(self) -> dict: """A dictionary mapping indices to words. This is a lazy property. It will be automatically recomputed when the length of the vocabulary changes.""" if self._idx2word is None or len(self.word2idx) != len(self._idx2word): self._idx2word = {v: k for k, v in self.word2idx.items()} return self._idx2word
[docs] def __len__(self) -> int: """Return the size of the vocabulary.""" return len(self.word2idx)
[docs] def __iter__(self) -> Iterable[str]: """Return an iterator over the words in the vocabulary.""" return iter(self.word2idx.keys())
[docs] def add(self, word: str): """Add a word to the vocabulary. Alias of :meth:`add_word`.""" self.add_word(word)
[docs] def add_word(self, word: str): """Add a word to the vocabulary.""" self.word2idx[word] = len(self.word2idx)
[docs] def map(self, word: str) -> int: """Map a word to its index. If the word is not in the vocabulary, return the index of the unknown token.""" return self.word2idx.get( word, self.word2idx.get(const.EBD_UNKNOWN, -1) )
[docs] def map_sequence(self, sequence: Sequence[str], add_be: bool = False) -> List[int]: """Map a sequence of words to a sequence of indices. If the argument `add_be` is True, the begin-of-sentence and end-of-sentence tokens will be added to the sequence. Args: sequence: the sequence of words to map. add_be: whether to add the begin-of-sentence and end-of-sentence tokens to the sequence. Returns: a list of indices. """ if isinstance(sequence, str): sequence = sequence.split() sequence = [self.map(w) for w in sequence] if add_be: sequence.insert(0, self.word2idx[const.EBD_BOS]) sequence.append(self.word2idx[const.EBD_EOS]) return sequence
[docs] def map_fields(self, feed_dict: dict, fields: Sequence[str]) -> dict: """Map the content in a specified set of fields in a dictionary to indices. The argument `fields` is a list of keys in the dictionary to map. This function will modify the dictionary in-place. Args: feed_dict: the dictionary of fields to map. fields: the list of keys to map. Returns: a dictionary of mapped fields. """ feed_dict = feed_dict.copy() for k in fields: if k in feed_dict: feed_dict[k] = self.map(feed_dict[k]) return feed_dict
[docs] def invmap_sequence(self, sequence: Union[Sequence[int], torch.Tensor], proc_be: bool = False) -> List[str]: """Map a sequence of indices to a sequence of words. If the argument `proc_be` is True, the begin-of-sentence and end-of-sentence tokens will be removed from the sequence. Args: sequence: the sequence of indices to map. proc_be: whether to remove the begin-of-sentence and end-of-sentence tokens from the sequence. Returns: a list of words. """ if torch.is_tensor(sequence): sequence = sequence.detach().cpu().tolist() str_sequence = [self.idx2word[int(x)] for x in sequence] if proc_be: if str_sequence[0] == const.EBD_BOS: str_sequence = str_sequence[1:] if str_sequence[-1] == const.EBD_EOS: str_sequence = str_sequence[:-1] return str_sequence
def gen_vocab(dataset: Sequence, keys: Optional[Iterable[str]] = None, extra_words: Optional[Iterable[str]] = None, cls: type = None, single_word: bool = False):
    """Generate a Vocabulary instance from a dataset.

    By default, this function will retrieve the data using the `get_metainfo` function, or it will
    fall back to `dataset[i]` if the function does not exist. The function should return a
    dictionary. Users can specify a list of keys that will be returned by the `get_metainfo`
    function. This function will split the string indexed by these keys and add tokens to the
    vocabulary. If the argument `keys` is not specified, this function assumes the return of
    `get_metainfo` to be a string.

    By default, this function will add four additional tokens: EBD_PAD, EBD_BOS, EBD_EOS, and
    EBD_UNK. Users can specify additional extra tokens using the extra_words argument.

    Args:
        dataset: the dataset to generate the vocabulary from. It can be a list of strings or a
            dataset instance.
        keys: the keys to retrieve from the dataset items. If not specified, the dataset is assumed
            to be a list of strings.
        extra_words: additional words to add to the vocabulary.
        cls: the class of the Vocabulary instance to generate. Defaults to :class:`Vocab`.
        single_word: whether to treat the entries in the dataset as single words. Default to False.
            When set to False, the entries should either be a list of strings or a single string
            (in which case it will be split by spaces).

    Returns:
        a vocabulary instance of class `cls`.
    """
    if cls is None:
        cls = Vocab

    all_words = set()
    # Hoisted loop-invariant: whether the dataset exposes `get_metainfo`.
    has_metainfo = hasattr(dataset, 'get_metainfo')
    # Fix: the stock tqdm expects an iterable, but `tqdm(len(dataset))` passed a bare int.
    # Wrapping in range() is correct regardless of whether the jacinle wrapper special-cases ints.
    for i in tqdm(range(len(dataset)), desc='Building the vocab'):
        metainfo = dataset.get_metainfo(i) if has_metainfo else dataset[i]
        if keys is None:
            # No keys: each item is a whitespace-separated string of tokens.
            for w in metainfo.split():
                all_words.add(w)
        else:
            for k in keys:
                if single_word:
                    all_words.add(str(metainfo[k]))
                elif isinstance(metainfo[k], str):
                    for w in metainfo[k].split():
                        all_words.add(w)
                else:
                    # Already an iterable of tokens.
                    for w in metainfo[k]:
                        all_words.add(w)

    vocab = cls()
    # Index 0 is reserved for the all-zeros (padding) embedding.
    vocab.add(const.EBD_ALL_ZEROS)
    # Sort for a deterministic word-to-index assignment.
    for w in sorted(all_words):
        vocab.add(w)
    for w in [const.EBD_UNKNOWN, const.EBD_BOS, const.EBD_EOS]:
        vocab.add(w)
    if extra_words is not None:
        for w in extra_words:
            vocab.add(w)
    return vocab
def gen_vocab_from_words(words: Sequence[str], extra_words: Optional[Iterable[str]] = None, cls: type = None):
    """Generate a Vocabulary instance from a list of words.

    The resulting vocabulary places the all-zeros (padding) token at index 0, followed by the input
    words in sorted order, the UNK/BOS/EOS special tokens, and finally any `extra_words`.
    """
    # Assemble the full token sequence first, then insert in one pass.
    tokens = [const.EBD_ALL_ZEROS]
    tokens.extend(sorted(words))
    tokens.extend((const.EBD_UNKNOWN, const.EBD_BOS, const.EBD_EOS))
    if extra_words is not None:
        tokens.extend(extra_words)

    vocab = (Vocab if cls is None else cls)()
    for token in tokens:
        vocab.add(token)
    return vocab