Using transformers version 2.4.8, I'm trying to tokenize a list of texts like this:
from transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained(transformer_model_name)
data = tokenizer(
    text_data.tolist(),
    verbose=True,
    add_special_tokens=True,     # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=True,  # not strictly needed for this task, but returned anyway
    padding='max_length',        # pad sequences shorter than max_length with 0 tokens
    truncation=True,             # truncate sequences longer than max_length
    max_length=max_length,
)
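For context, here is a self-contained sketch of what I expect the call to do (the checkpoint name, sample texts, and max_length here are placeholders, not my actual config, and it assumes a transformers version where the tokenizer __call__ API is available):

from transformers import ElectraTokenizer

# Placeholder checkpoint; my real model name comes from a config file.
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
sample_texts = ["first example sentence", "second example sentence"]

encoded = tokenizer(
    sample_texts,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    padding='max_length',
    truncation=True,
    max_length=32,
)
print(encoded.keys())  # expecting input_ids, token_type_ids, attention_mask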
But I'm getting this error:
Traceback (most recent call last):
  File "/home/marzi/workspace/nlp_classification/src/test/test_with_label.py", line 128, in <module>
    main()
  File "/home/marzi/workspace/nlp_classification/src/test/test_with_label.py", line 64, in main
    test_deep_model(data, input_config)
  File "/home/marzi/workspace/nlp_classification/src/test/test_with_label.py", line 85, in test_deep_model
    preprocessor = get_preprocessor_for_test(test_deep_config, embedding, data)
  File "/home/marzi/workspace/nlp_classification/src/test/test_with_label.py", line 107, in get_preprocessor_for_test
    preprocessor.prepare_data_for_test(the_data)
  File "/home/marzi/workspace/nlp_classification/src/data/preprocess.py", line 33, in prepare_data_for_test
    self._prepare([], the_data, [x for x in range(len(the_data.data))])
  File "/home/marzi/workspace/nlp_classification/src/data/transformer_data_preprocess.py", line 19, in _prepare
    self.x_test, self.x_test_dict = self._tokenize_using_bert(text2seq, test_index, the_data)
  File "/home/marzi/workspace/nlp_classification/src/data/transformer_data_preprocess.py", line 27, in _tokenize_using_bert
    max_length=self.max_sequence_length)
  File "/home/marzi/workspace/nlp_classification/src/data/text_to_sequence.py", line 43, in tokenize_batch_using_bert
    max_length=max_length, # truncates if len(s) > max_length
  File "/home/marzi/anaconda3/envs/nc_gpu/lib/python3.6/site-packages/transformers/tokenization_utils_base.py", line 2327, in __call__
    **kwargs,
  File "/home/marzi/anaconda3/envs/nc_gpu/lib/python3.6/site-packages/transformers/tokenization_utils_base.py", line 2512, in batch_encode_plus
    **kwargs,
  File "/home/marzi/anaconda3/envs/nc_gpu/lib/python3.6/site-packages/transformers/tokenization_utils.py", line 563, in _batch_encode_plus
    verbose=verbose,
  File "/home/marzi/anaconda3/envs/nc_gpu/lib/python3.6/site-packages/transformers/tokenization_utils.py", line 626, in _batch_prepare_for_model
    return_attention_mask=return_attention_mask,
  File "/home/marzi/anaconda3/envs/nc_gpu/lib/python3.6/site-packages/transformers/tokenization_utils_base.py", line 2621, in pad
    "You should supply an encoding or a list of encodings to this method "
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []
There are no empty items in text_data, and I have no idea what causes this error.
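This is roughly how I checked the input (a sketch; the Series here is a stand-in, since in my code text_data is loaded elsewhere):

import pandas as pd

# Stand-in for my real data; in the actual code text_data comes from a file.
text_data = pd.Series(["some text", "more text"])

texts = text_data.tolist()
print(len(texts))                                            # non-zero length
print(all(isinstance(t, str) and t.strip() for t in texts))  # True: no empty or non-string items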