@@ -67,14 +67,20 @@ class Model:
     dir_model_card: Path
     remote_hf_model_id: str | None
 
+    # for vision encoders
+    mmproj: bool
+    ignore_vision: bool = False  # subclasses may overwrite this
+    mtmd_model: MultimodalModel | None = None
+
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
-                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
+                 mmproj: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -109,6 +115,7 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.mmproj = mmproj
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -125,6 +132,28 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
+        # vision encoder
+        if mmproj:
+            vision_hparams = self.hparams.get("vision_config")
+            if vision_hparams is None:
+                raise ValueError("Vision config not found in model config")
+            elif self.ignore_vision:
+                raise ValueError("Vision config found, but mmproj conversion for this model is not supported yet")
+            else:
+                self.mtmd_model = MultimodalModel(
+                    hparams=vision_hparams,
+                    ftype=self.ftype,
+                    fname_out=self.fname_out,
+                    endianess=self.endianess,
+                    use_temp_file=self.use_temp_file,
+                )
+
+    @classmethod
+    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
+        stem, suffix = path.stem, path.suffix
+        new_name = f"{prefix}{stem}{suffix}"
+        return path.with_name(new_name)
+
     @classmethod
     def __init_subclass__(cls):
         # can't use an abstract property, because overriding it without type errors
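
Note: the new `add_prefix_to_filename` helper prepends a tag while keeping the full original file name, including the type suffix embedded in the stem. A minimal sketch of the behavior, using a hypothetical output name:

```python
from pathlib import Path

# what Model.add_prefix_to_filename() does, restated on a hypothetical name
path = Path("gemma-3-4b-it-F16.gguf")
stem, suffix = path.stem, path.suffix            # "gemma-3-4b-it-F16", ".gguf"
print(path.with_name(f"mmproj-{stem}{suffix}"))  # mmproj-gemma-3-4b-it-F16.gguf
```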
@@ -272,8 +301,13 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_key_length(head_dim)
             self.gguf_writer.add_value_length(head_dim)
 
-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
+        if not self.mmproj:
+            self.gguf_writer.add_file_type(self.ftype)
+            logger.info(f"gguf: file type = {self.ftype}")
+        else:
+            assert self.mtmd_model is not None
+            self.mtmd_model.set_gguf_parameters(n_embd_text=n_embd)
+            logger.info(f"mmproj: file type = {self.mtmd_model.ftype}")
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -311,6 +345,10 @@ def prepare_tensors(self):
                         break
 
             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # skip adding tensor if we're working with a vision model
+                if self.mmproj:
+                    continue
+
                 # TODO: why do we squeeze here?
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
@@ -455,12 +493,18 @@ def prepare_metadata(self, vocab_only: bool):
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
 
     def write(self):
-        self.prepare_tensors()
-        self.prepare_metadata(vocab_only=False)
-        self.gguf_writer.write_header_to_file(path=self.fname_out)
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.write_tensors_to_file(progress=True)
-        self.gguf_writer.close()
+        if self.mtmd_model is not None:
+            self.prepare_tensors()
+            self.prepare_metadata(vocab_only=False)
+            logger.info("Writing vision model")
+            self.mtmd_model.write()
+        else:
+            self.prepare_tensors()
+            self.prepare_metadata(vocab_only=False)
+            self.gguf_writer.write_header_to_file(path=self.fname_out)
+            self.gguf_writer.write_kv_data_to_file()
+            self.gguf_writer.write_tensors_to_file(progress=True)
+            self.gguf_writer.close()
 
     def write_vocab(self):
         if len(self.gguf_writer.tensors) != 1:
@@ -485,7 +529,10 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     @staticmethod
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+            hparams = json.load(f)
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        return hparams
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
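
Note: with this change every model sees a flattened view of multimodal configs: keys from `text_config` are hoisted to the root, which is what lets the per-model `__init__` overrides in `Llama4Model` and `Gemma3Model` be deleted further down. A quick illustration with hypothetical values:

```python
# hypothetical Gemma-3-style config.json contents
config = {
    "architectures": ["Gemma3ForConditionalGeneration"],
    "text_config": {"hidden_size": 2560, "num_hidden_layers": 34},
    "vision_config": {"hidden_size": 1152, "image_size": 896},
}
hparams = {**config, **config["text_config"]}  # same merge as load_hparams()
assert hparams["hidden_size"] == 2560          # text keys now live at the root
assert "vision_config" in hparams              # vision config kept for MultimodalModel
```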
@@ -1024,6 +1071,101 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
             self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
 
 
+# for converting mmproj file
+class MultimodalModel:
+    hparams: dict
+    dir_model: Path
+    ftype: gguf.LlamaFileType
+    fname_out: Path
+    tensor_map: gguf.TensorNameMap
+    gguf_writer: gguf.GGUFWriter
+
+    def __init__(self, hparams: dict, ftype: gguf.LlamaFileType, fname_out: Path, endianess: gguf.GGUFEndian, use_temp_file: bool):
+        self.hparams = hparams
+        self.ftype = ftype
+        self.fname_out = fname_out
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
+        self.gguf_writer = gguf.GGUFWriter(path=None,
+                                           arch="clip",
+                                           endianess=endianess,
+                                           use_temp_file=use_temp_file)
+
+    def set_gguf_parameters(self, n_embd_text: int):
+        """Function to be called by Model.set_gguf_parameters()"""
+        self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, n_embd_text)
+        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+
+        # vision config
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        if key is not None:
+            return self.hparams[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
+    def get_quantization(self, mapped_name: str, data_torch: Tensor) -> gguf.GGMLQuantizationType:
+        is_1d = len(data_torch.shape) == 1
+        is_embd = "_embd" in mapped_name
+        can_quantize = not is_1d and not is_embd
+        data_qtype = gguf.GGMLQuantizationType.F32
+        if can_quantize:
+            if self.ftype == gguf.LlamaFileType.ALL_F32:
+                data_qtype = gguf.GGMLQuantizationType.F32
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                data_qtype = gguf.GGMLQuantizationType.F16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                data_qtype = gguf.GGMLQuantizationType.BF16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                data_qtype = gguf.GGMLQuantizationType.Q8_0
+            else:
+                raise ValueError(f"Unsupported file type: {self.ftype}")
+        return data_qtype
+
+    def add_tensor(self, original_name: str, data_torch: Tensor) -> None:
+        """Function to be called inside Model.modify_tensors()"""
+        # name mapping
+        new_name = self.tensor_map.get_name(key=original_name, try_suffixes=(".weight", ".bias"))
+        if new_name is None:
+            raise ValueError(f"Can not map tensor {original_name!r}")
+
+        # process data
+        # old_dtype = data_torch.dtype
+        data_qtype = self.get_quantization(new_name, data_torch)
+        data = data_torch.numpy()
+        try:
+            data = gguf.quants.quantize(data, data_qtype)
+        except Exception as e:
+            logger.error(f"Error quantizing tensor '{new_name}': {e}, fallback to F16")
+            data_qtype = gguf.GGMLQuantizationType.F16
+            data = gguf.quants.quantize(data, data_qtype)
+
+        # reverse shape to make it similar to the internal ggml dimension order
+        # TODO: we don't print old_dtype because it's not correct, to be fixed later
+        old_dtype = ""
+        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
+        logger.info(f"{f'%-32s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+
+        # add tensor
+        self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
+
+    def write(self):
+        """Function to be called by Model.write()"""
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.write_tensors_to_file(progress=True)
+        self.gguf_writer.close()
+
+
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
     model_arch = gguf.MODEL_ARCH.GPTNEOX
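
Note: `get_quantization()` keeps 1D tensors (norms, biases) and anything with `_embd` in its mapped name at full F32 precision, and only down-converts the rest to the requested file type. A rough sketch of that decision, restated outside the class under an assumed F16 conversion (the tensor names are hypothetical):

```python
import gguf
import torch

# restating MultimodalModel.get_quantization() for ftype == MOSTLY_F16
def pick_qtype(mapped_name: str, t: torch.Tensor) -> gguf.GGMLQuantizationType:
    can_quantize = t.ndim != 1 and "_embd" not in mapped_name
    return gguf.GGMLQuantizationType.F16 if can_quantize else gguf.GGMLQuantizationType.F32

print(pick_qtype("v.blk.0.attn_q.weight", torch.zeros(64, 64)))  # F16: 2D, not an embedding
print(pick_qtype("v.blk.0.ln1.bias", torch.zeros(64)))           # F32: 1D stays full precision
print(pick_qtype("v.patch_embd.weight", torch.zeros(64, 64)))    # F32: "_embd" stays full precision
```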
@@ -1781,20 +1923,13 @@ def prepare_tensors(self):
 @Model.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
-    has_vision: bool = False
     undo_permute = False
+    ignore_vision = True
 
     # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config"
     # same with llama, but we need to merge the text_config into the root level of hparams
     def __init__(self, *args, **kwargs):
-        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
-        if "text_config" in hparams:
-            hparams = {**hparams, **hparams["text_config"]}
-        kwargs["hparams"] = hparams
         super().__init__(*args, **kwargs)
-        if "vision_config" in hparams:
-            logger.info("Has vision encoder, but it will be ignored")
-            self.has_vision = True
         # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
         self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
         self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
@@ -1824,7 +1959,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
             name += ".weight"
             data_torch = data_torch.transpose(-1, -2)
 
-        if "multi_modal_projector" in name or "vision_model" in name:
+        if "multi_modal_projector" in name or "vision_model" in name:
             return []
         return super().modify_tensors(data_torch, name, bid)
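
Note: because `Llama4Model` sets `ignore_vision = True`, the guard added in `Model.__init__` above turns a `--mmproj` request for this architecture into an early, explicit failure instead of a silently broken output file. A sketch of the behavior, with a hypothetical local checkpoint path:

```python
from pathlib import Path

# hypothetical invocation; the error text comes from Model.__init__ above
try:
    model = Llama4Model(Path("./Llama-4-Scout-17B-16E"), gguf.LlamaFileType.MOSTLY_F16,
                        Path("out.gguf"), mmproj=True)
except ValueError as e:
    print(e)  # Vision config found, but mmproj conversion for this model is not supported yet
```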
@@ -3474,24 +3609,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
 class Gemma3Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA3
-    has_vision: bool = False
-
-    # we need to merge the text_config into the root level of hparams
-    def __init__(self, *args, **kwargs):
-        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
-        if "text_config" in hparams:
-            hparams = {**hparams, **hparams["text_config"]}
-        kwargs["hparams"] = hparams
-        super().__init__(*args, **kwargs)
-        if "vision_config" in hparams:
-            logger.info("Has vision encoder, but it will be ignored")
-            self.has_vision = True
 
     def write(self):
         super().write()
-        if self.has_vision:
-            logger.info("NOTE: this script only convert the language model to GGUF")
-            logger.info("      for the vision model, please use gemma3_convert_encoder_to_gguf.py")
 
     def set_vocab(self):
         self._set_vocab_sentencepiece()
@@ -3524,15 +3644,42 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
+        if self.mtmd_model is not None:
+            self.mtmd_model.set_gguf_parameters(n_embd_text=hparams["hidden_size"])
+            vgguf = self.mtmd_model.gguf_writer
+            vgguf.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
+            # default values below are taken from HF transformers code
+            vgguf.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, self.mtmd_model.hparams.get("layer_norm_eps", 1e-6))
+            vgguf.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
+            vgguf.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
+            vgguf.add_bool(gguf.Keys.ClipVision.USE_GELU, True)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
         if name.startswith("language_model."):
             name = name.replace("language_model.", "")
+
         elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
-                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):  # this is for old HF model, should be removed later
-            # ignore vision tensors
-            return []
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
+            if self.mmproj:
+                assert self.mtmd_model is not None
+                # process vision tensors
+                name = name.replace("_weight", ".weight")
+                if "fc1" in name:
+                    name = name.replace("fc1", "fc2")
+                else:
+                    name = name.replace("fc2", "fc1")
+
+                # correct the norm value; only this "soft_emb_norm" needs correcting, as it's part of the Gemma projector
+                # (the other norm values belong to the SigLIP model and are already correct)
+                # ref code: Gemma3RMSNorm
+                if "soft_emb_norm.weight" in name:
+                    logger.info(f"Correcting norm value for '{name}'")
+                    data_torch = data_torch + 1
+
+                self.mtmd_model.add_tensor(name, data_torch)
+            return []  # vision tensor already handled
 
         # remove OOV (out-of-vocabulary) rows in token_embd
         if "embed_tokens.weight" in name:
@@ -5554,6 +5701,10 @@ def parse_args() -> argparse.Namespace:
         "--remote", action="store_true",
         help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.",
     )
+    parser.add_argument(
+        "--mmproj", action="store_true",
+        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
+    )
 
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
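
Note: with this flag, the same script can be run twice against one checkpoint, once for the text model and once for the projector, e.g. `python convert_hf_to_gguf.py ./gemma-3-4b-it --outtype f16` followed by `python convert_hf_to_gguf.py ./gemma-3-4b-it --outtype f16 --mmproj` (hypothetical local path); the second run writes `mmproj-<name>.gguf`, as handled in `main()` below.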
@@ -5633,6 +5784,10 @@ def main() -> None:
 
     hparams = Model.load_hparams(dir_model)
 
+    if args.mmproj:
+        if "mmproj" not in fname_out.name:
+            fname_out = Model.add_prefix_to_filename(fname_out, "mmproj-")
+
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
         model_architecture = hparams["architectures"][0]
@@ -5649,7 +5804,8 @@ def main() -> None:
                                      split_max_tensors=args.split_max_tensors,
                                      split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                      small_first_shard=args.no_tensor_first_split,
-                                     remote_hf_model_id=str(args.model) if args.remote else None)
+                                     remote_hf_model_id=str(args.model) if args.remote else None,
+                                     mmproj=args.mmproj)
 
         if args.vocab_only:
             logger.info("Exporting model vocab...")