@@ -3782,44 +3782,93 @@ def _xlmroberta_set_vocab(self) -> None:
         from sentencepiece import sentencepiece_model_pb2 as model

         tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+
+        tokenizer_json = {}
+        tokenizer_config_json = {}
         if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'

-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")

-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)
+
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+
+            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)

-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE

-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+
+            for token_id in range(vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                toktype = SentencePieceTokenTypes.NORMAL
+                if token_id == unk_token_id:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif token_id in tokenizer.all_special_ids:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif token_id in added_vocab.values():
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                # No reliable way to detect this, but jina doesn't have any
+                # elif tokenizer.IsByte(token_id):
+                #     toktype = SentencePieceTokenTypes.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype

         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -3829,15 +3878,16 @@ def _xlmroberta_set_vocab(self) -> None:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]

         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
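
Not part of the patch: a minimal standalone sketch of the fallback technique the first hunk adds, reading a Unigram vocab directly from an HF `tokenizer.json` when `sentencepiece.bpe.model` is absent. The `read_unigram_vocab` helper and the model directory path are illustrative assumptions, not code from this commit.

```python
import json
from base64 import b64decode
from pathlib import Path


def read_unigram_vocab(model_dir: Path):
    # Hypothetical helper, not from the patch: load the HF fast-tokenizer definition.
    with open(model_dir / "tokenizer.json", "r", encoding="utf-8") as fp:
        tokenizer_json = json.load(fp)

    # The normalizer's precompiled charsmap is stored base64-encoded in
    # tokenizer.json, whereas sentencepiece.bpe.model carries it as raw bytes.
    charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

    # For a Unigram model, "vocab" is a list of [piece, log_prob] pairs, so token
    # text and score can be read directly, mirroring IdToPiece()/GetScore().
    vocab = tokenizer_json["model"]["vocab"]
    tokens = [piece.encode("utf-8") for piece, _score in vocab]
    scores = [float(score) for _piece, score in vocab]
    return tokens, scores, charsmap


if __name__ == "__main__":
    # Placeholder path for illustration only.
    tokens, scores, charsmap = read_unigram_vocab(Path("path/to/model"))
    print(f"{len(tokens)} pieces, {len(charsmap)} charsmap bytes")
```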