
I want all special tokens to always be available. How do I do this?

My first attempt at giving them to my tokenizer:

from transformers import AutoTokenizer, PreTrainedTokenizerFast


def does_t5_have_sep_token():
    tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained('t5-small')
    assert isinstance(tokenizer, PreTrainedTokenizerFast)
    print(tokenizer)
    print(f'{len(tokenizer)=}')
    # print(f'{tokenizer.all_special_tokens=}')
    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.all_special_tokens=}')

    special_tokens_dict = {'additional_special_tokens': ['<bos>', '<cls>', '<s>'] + tokenizer.all_special_tokens }
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.all_special_tokens=}')



if __name__ == '__main__':
    does_t5_have_sep_token()
    print('Done\a')

but it feels hacky.

refs:

seems useful: https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/model#transformers.PreTrainedModel.resize_token_embeddings


I want to add the standard tokens by registering the right "standard tokens". The solution provided didn't work for me, since .bos_token is still None. See:

tokenizer.bos_token=None
tokenizer.cls_token=None
tokenizer.sep_token=None
tokenizer.mask_token=None
tokenizer.eos_token='</s>'
tokenizer.unk_token='<unk>'
tokenizer.bos_token_id=None
tokenizer.cls_token_id=None
tokenizer.sep_token_id=None
tokenizer.mask_token_id=None
tokenizer.eos_token_id=1
tokenizer.unk_token_id=2
tokenizer.all_special_tokens=['</s>', '<unk>', '<pad>', '<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53>', '<extra_id_54>', '<extra_id_55>', '<extra_id_56>', '<extra_id_57>', '<extra_id_58>', '<extra_id_59>', '<extra_id_60>', '<extra_id_61>', '<extra_id_62>', '<extra_id_63>', '<extra_id_64>', '<extra_id_65>', '<extra_id_66>', '<extra_id_67>', '<extra_id_68>', '<extra_id_69>', '<extra_id_70>', '<extra_id_71>', '<extra_id_72>', '<extra_id_73>', '<extra_id_74>', '<extra_id_75>', '<extra_id_76>', '<extra_id_77>', '<extra_id_78>', '<extra_id_79>', '<extra_id_80>', '<extra_id_81>', '<extra_id_82>', '<extra_id_83>', '<extra_id_84>', '<extra_id_85>', '<extra_id_86>', '<extra_id_87>', '<extra_id_88>', '<extra_id_89>', '<extra_id_90>', '<extra_id_91>', '<extra_id_92>', '<extra_id_93>', '<extra_id_94>', '<extra_id_95>', '<extra_id_96>', '<extra_id_97>', '<extra_id_98>', '<extra_id_99>']
Using bos_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using mask_token, but it is not set yet.

code:

from transformers import AutoTokenizer, PreTrainedTokenizerFast


def does_t5_have_sep_token():
    """

    https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/model#transformers.PreTrainedModel.resize_token_embeddings
    """
    import torch
    from transformers import AutoModelForSeq2SeqLM

    tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained('t5-small')
    assert isinstance(tokenizer, PreTrainedTokenizerFast)
    print(tokenizer)
    print(f'{len(tokenizer)=}')

    print()
    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.all_special_tokens=}')
    print()

    # special_tokens_dict = {'additional_special_tokens': ['<bos>', '<cls>', '<s>'] + tokenizer.all_special_tokens}
    # num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    tokenizer.add_tokens([f"_{n}" for n in range(1, 100)], special_tokens=True)
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    assert isinstance(model, torch.nn.Module)
    model.resize_token_embeddings(len(tokenizer))
    # tokenizer.save_pretrained('pathToExtendedTokenizer/')
    # tokenizer = T5Tokenizer.from_pretrained("sandbox/t5_models/pretrained/tokenizer/")

    print()
    print(f'{tokenizer.bos_token=}')
    print(f'{tokenizer.cls_token=}')
    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.mask_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.unk_token=}')
    print(f'{tokenizer.bos_token_id=}')
    print(f'{tokenizer.cls_token_id=}')
    print(f'{tokenizer.sep_token_id=}')
    print(f'{tokenizer.mask_token_id=}')
    print(f'{tokenizer.eos_token_id=}')
    print(f'{tokenizer.unk_token_id=}')
    print(f'{tokenizer.all_special_tokens=}')
    print()



if __name__ == '__main__':
    does_t5_have_sep_token()
    print('Done\a')
– Charlie Parker

3 Answers


I do not entirely understand what you're trying to accomplish, but here are some notes that might help:

T5 documentation shows that T5 has only three special tokens (</s>, <unk> and <pad>). You can also see this in the T5Tokenizer class definition. I am confident this is because the original T5 model was trained only with these special tokens (no BOS, no MASK, no CLS).

Running, e.g.,

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-small')
print(tokenizer.all_special_tokens)

will show you these three tokens as well as the <extra_id_*> tokens.

Is there a reason you want the other tokens like BOS?

(Edit, to answer your comments: I really think you would benefit from reading the linked documentation at Hugging Face. The point of a pretrained model is to take advantage of what has already been done. T5 does not use BOS or CLS in the way you seem to be imagining. Maybe you can get it to work, but IMO it makes more sense to adapt the task you want to solve to the T5 approach.)

</s> is the sep token and is already available.

As I understand it, for the T5 model, masking (for the sake of ignoring loss) is implemented using the attention_mask. If, on the other hand, you want to "fill in the blank", then the <extra_id_*> sentinels are used to indicate to the model that it should predict the missing span (this is how the semi-supervised pretraining is done). See the section on training in the documentation.
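
For example, a minimal sketch of that "fill in the blank" format, following the unsupervised training example in the T5 docs (the sentences are just illustrative):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Masked spans in the input are replaced by sentinel tokens; the target
# reconstructs each span, prefixed by the matching sentinel.
input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids

loss = model(input_ids=input_ids, labels=labels).loss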

BOS is similar: T5 is not trained to use a BOS token. For example (again from the documentation):

Note that T5 uses the pad_token_id as the decoder_start_token_id, so when doing generation without using generate(), make sure you start it with the pad_token_id.
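
A minimal sketch of what that looks like when stepping the decoder by hand (greedy decoding with t5-small; the loop is illustrative, not taken from the docs):

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

input_ids = tokenizer("translate English to German: Hello", return_tensors="pt").input_ids

# T5 has no BOS token: decoding starts from the pad token instead.
assert model.config.decoder_start_token_id == tokenizer.pad_token_id
decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]])

with torch.no_grad():
    for _ in range(20):
        logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=-1)
        if next_token.item() == tokenizer.eos_token_id:
            break

print(tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True))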

T5 does not use the CLS token. If you want to do classification, you should finetune on a new task (or find a corresponding one done in pretraining), training the model to generate a word (or words) that corresponds to the classifications you want; a short sketch follows the quoted documentation below. (Again from the documentation:)

Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format:
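
For instance, a rough sketch of classification done the T5 way (the "sst2 sentence:" prefix mirrors the task formatting from the T5 paper; treat the exact prefix and labels as illustrative):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Classification is framed as text-to-text: the model is finetuned to
# generate the label word, so no CLS token is needed.
input_ids = tokenizer("sst2 sentence: this movie was surprisingly good", return_tensors="pt").input_ids
labels = tokenizer("positive", return_tensors="pt").input_ids

loss = model(input_ids=input_ids, labels=labels).loss  # a finetuning step would backprop this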

– jroz
  • I want to fine tune my own T5 and want those tokens, especially the sentence separator `` token `sep_token`. I'd like to have them all. – Charlie Parker Aug 11 '22 at 18:25
  • what is the string rep of the mask token? I want to add all special tokens. Is it ``? – Charlie Parker Aug 11 '22 at 18:28
  • what do you think of this answer: https://stackoverflow.com/a/73361984/1601580 – Charlie Parker Aug 15 '22 at 13:56
  • It is true that your approach will add tokens, but as I wrote above, T5 pretraining does not use the ones that you are adding. Huggingface documentation shows how to use T5 for various tasks, and (I think) none of those tasks should require introducing BOS, MASK, etc. Also, as I said, the sep token should already be set and is not . If it's not showing up when you load the tokenizer there might be some other issue – jroz Aug 19 '22 at 21:04

I think this is correct. Please correct me if I'm wrong:

from typing import Union

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


def add_special_all_special_tokens(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
    """
    Add bos/cls/sep/mask special tokens if the tokenizer doesn't define them, e.g.:

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
        # i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
    """
    original_len: int = len(tokenizer)
    special_tokens_dict: dict = {}
    if tokenizer.bos_token is None:
        special_tokens_dict['bos_token'] = "<bos>"
    if tokenizer.cls_token is None:
        special_tokens_dict['cls_token'] = "<cls>"
    if tokenizer.sep_token is None:
        special_tokens_dict['sep_token'] = "<s>"
    if tokenizer.mask_token is None:
        special_tokens_dict['mask_token'] = "<mask>"
    # special_tokens_dict = {'additional_special_tokens': new_special_tokens + tokenizer.all_special_tokens}
    num_new_tokens: int = tokenizer.add_special_tokens(special_tokens_dict)
    # These asserts assume the tokenizer started without bos/cls/sep/mask (as t5-small does).
    assert tokenizer.bos_token == "<bos>"
    assert tokenizer.cls_token == "<cls>"
    assert tokenizer.sep_token == "<s>"
    assert tokenizer.mask_token == "<mask>"
    msg = f"Error, not equal: {len(tokenizer)=}, {original_len + num_new_tokens=}"
    assert len(tokenizer) == original_len + num_new_tokens, msg
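
A possible usage sketch (assuming a T5 tokenizer and model; resize_token_embeddings is needed because the helper adds new entries to the vocabulary):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

add_special_all_special_tokens(tokenizer)
# The embedding matrix must be resized to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

print(tokenizer.bos_token, tokenizer.cls_token, tokenizer.sep_token, tokenizer.mask_token)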

Here is the docstring from the HF source that inspired my answer:

    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
        matrix of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Using `add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
        `'</s>'`).

        Args:
            special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assign the index of the `unk_token` to them).

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        ```python
        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2Model.from_pretrained("gpt2")

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
        ```"""

It is in HF's tokenization_utils_base.py.

– Charlie Parker

I think the right answer is here: https://stackoverflow.com/a/73361984/1601580

Link-only answers can go stale, so here is the code:

from typing import Union

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


def add_special_all_special_tokens(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
    """
    Add bos/cls/sep/mask special tokens if the tokenizer doesn't define them, e.g.:

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
        # i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
    """
    original_len: int = len(tokenizer)
    special_tokens_dict: dict = {}
    if tokenizer.bos_token is None:
        special_tokens_dict['bos_token'] = "<bos>"
    if tokenizer.cls_token is None:
        special_tokens_dict['cls_token'] = "<cls>"
    if tokenizer.sep_token is None:
        special_tokens_dict['sep_token'] = "<s>"
    if tokenizer.mask_token is None:
        special_tokens_dict['mask_token'] = "<mask>"
    # special_tokens_dict = {'additional_special_tokens': new_special_tokens + tokenizer.all_special_tokens}
    num_new_tokens: int = tokenizer.add_special_tokens(special_tokens_dict)
    # These asserts assume the tokenizer started without bos/cls/sep/mask (as t5-small does).
    assert tokenizer.bos_token == "<bos>"
    assert tokenizer.cls_token == "<cls>"
    assert tokenizer.sep_token == "<s>"
    assert tokenizer.mask_token == "<mask>"
    err_msg = f"Error, not equal: {len(tokenizer)=}, {original_len + num_new_tokens=}"
    assert len(tokenizer) == original_len + num_new_tokens, err_msg

Feedback is always welcome.

– Charlie Parker