gguf : support big endian platform (#3552) · nomic-ai/llama.cpp@10e6c38 · GitHub

Commit 10e6c38

chenqiny authored and ggerganov committed
gguf : support big endian platform (ggml-org#3552)
* check whether the platform is s390x; if so, do not include immintrin.h

* support s390x big endian

* support the --bigendian option for s390x
  1. verified with baichuan7b-chat with float 16 on s390x
  2. verified with baichuan7b-chat
  3. verified with chinese-alpaca-2-13b-f16

* update format based on editor-config checker result

* Update convert-baichuan-hf-to-gguf.py

* 1. check in ggml.c whether the file's endianness matches the host
  2. update the GGUF version
  3. change get_pack_prefix to a property
  4. update the information log

* always use "GGUF" as the beginning of a GGUF file

* Compare "GGUF" with the file header char by char
  1. set GGUF_MAGIC to the string "GGUF" instead of an int value
  2. compare "GGUF" char by char to ensure its byte order
  3. move the byte-swap code from convert.py to gguf.py write_tensor_data

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent fd77ecf commit 10e6c38
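The Python-side mechanism behind most of this diff is the byte-order prefix of struct.pack format strings: "<" forces little-endian output and ">" forces big-endian output, independent of the host CPU. A minimal standalone sketch of that mechanism (illustration only, not code from this commit):

import struct

GGUF_VERSION = 3

def pack_prefix(big_endian: bool) -> str:
    # "<" = little endian, ">" = big endian, regardless of host byte order
    return ">" if big_endian else "<"

print(struct.pack(f"{pack_prefix(False)}I", GGUF_VERSION).hex())  # 03000000
print(struct.pack(f"{pack_prefix(True)}I", GGUF_VERSION).hex())   # 00000003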

File tree

convert-baichuan-hf-to-gguf.py
convert.py
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
ggml.c
ggml.h
gguf-py/gguf/gguf.py
k_quants.c
tests/test-double-float.cpp

8 files changed: +83 -48 lines changed

convert-baichuan-hf-to-gguf.py

Lines changed: 7 additions & 1 deletion
@@ -73,6 +73,7 @@ def parse_args() -> argparse.Namespace:
         "ftype", type=int, choices=[0, 1], default=1, nargs='?',
         help="output format - use 0 for float32, 1 for float16",
     )
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
     return parser.parse_args()

 args = parse_args()
@@ -83,6 +84,11 @@ def parse_args() -> argparse.Namespace:
     print(f'Error: {args.model} is not a directory', file = sys.stderr)
     sys.exit(1)

+endianess = gguf.GGUFEndian.LITTLE
+if args.bigendian:
+    endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
+print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 # ftype == 0 -> float32
 # ftype == 1 -> float16
@@ -110,7 +116,7 @@ def parse_args() -> argparse.Namespace:
 num_parts = count_model_parts(dir_model)
 print(f"num_parts:{num_parts}\n")
 ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

 print("gguf: get model metadata")
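For context, the new flag only selects which GGUFEndian value is handed to the writer; the default stays little endian, so existing conversions are unchanged. A self-contained sketch of the wiring (GGUFEndian is re-declared here so the snippet runs on its own):

import argparse
from enum import IntEnum

class GGUFEndian(IntEnum):  # mirrors gguf.GGUFEndian from this commit
    LITTLE = 0
    BIG = 1

parser = argparse.ArgumentParser()
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
args = parser.parse_args(["--bigendian"])  # simulating a run targeting s390x

endianess = GGUFEndian.BIG if args.bigendian else GGUFEndian.LITTLE
print(f"gguf: Conversion Endianess {endianess.name}")  # BIG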

convert.py

Lines changed: 12 additions & 8 deletions
@@ -818,8 +818,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


 class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -890,10 +890,10 @@ def close(self) -> None:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)

         # meta data
         of.add_meta_arch(params)
@@ -918,10 +918,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
             return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)

         # meta data
         of.add_meta_arch(params)
@@ -1138,8 +1138,9 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

+    args = parser.parse_args(args_in)
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
@@ -1153,6 +1154,9 @@ def main(args_in: list[str] | None = None) -> None:
     if args.dump:
         do_dump_model(model_plus)
         return
+    endianess = gguf.GGUFEndian.LITTLE
+    if args.bigendian:
+        endianess = gguf.GGUFEndian.BIG

     params = Params.load(model_plus)
     if params.n_ctx == -1:
@@ -1200,7 +1204,7 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
     print(f"Wrote {outfile}")
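The endianess parameter defaults to GGUFEndian.LITTLE at every layer, so existing call sites that do not pass it keep producing the same little-endian files. A minimal sketch of this default-threading pattern with stand-in classes (not the real converter API):

from enum import IntEnum

class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1

class Writer:  # stand-in for gguf.GGUFWriter
    def __init__(self, endianess=GGUFEndian.LITTLE):
        self.endianess = endianess

class OutputFile:  # stand-in for convert.py's OutputFile
    def __init__(self, endianess=GGUFEndian.LITTLE):
        self.gguf = Writer(endianess=endianess)

    @staticmethod
    def write_all(endianess=GGUFEndian.LITTLE):
        return OutputFile(endianess=endianess).gguf.endianess

assert OutputFile.write_all() is GGUFEndian.LITTLE  # old callers: behavior unchanged
assert OutputFile.write_all(GGUFEndian.BIG) is GGUFEndian.BIG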

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 1 addition & 1 deletion
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
     if (file.size < 4) {
         return false;
     }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
     return magic == GGUF_MAGIC;
 }
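Reading the magic as four raw bytes and comparing them as a string sidesteps byte order entirely: "GGUF" on disk is the same byte sequence on every platform. A Python equivalent of the check (hypothetical helper, not part of the commit):

def is_gguf_file(path: str) -> bool:
    # Compare raw bytes, not a native integer, so the test is endian-neutral.
    with open(path, "rb") as f:
        return f.read(4) == b"GGUF"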

ggml.c

Lines changed: 11 additions & 8 deletions
@@ -19659,7 +19659,7 @@ struct gguf_kv {
 };

 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv; // GGUFv2
@@ -19729,7 +19729,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

-    ctx->header.magic = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
     ctx->header.n_tensors = 0;
     ctx->header.n_kv = 0;
@@ -19755,16 +19755,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // offset from start of file
     size_t offset = 0;

-    uint32_t magic = 0;
+    char magic[4];

     // check the magic before making allocations
     {
         gguf_fread_el(file, &magic, sizeof(magic), &offset);

-        if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            fclose(file);
-            return NULL;
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
         }
     }

@@ -19774,7 +19776,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     // read the header
     {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+

         ctx->kv = NULL;
         ctx->infos = NULL;
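The reason the integer comparison had to go: the old constant 0x46554747 equals the bytes "GGUF" only when they are read as a little-endian uint32_t; on a big-endian host the same four bytes decode to 0x47475546 and every valid file would be rejected. A standalone Python illustration:

import struct

data = b"GGUF"  # first four bytes of every GGUF file

print(hex(struct.unpack("<I", data)[0]))  # 0x46554747 (little-endian view)
print(hex(struct.unpack(">I", data)[0]))  # 0x47475546 (big-endian view)

# Byte-wise comparison, as the new loop does, is order-independent:
assert all(b == m for b, m in zip(data, b"GGUF"))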

ggml.h

Lines changed: 3 additions & 2 deletions
@@ -231,8 +231,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3

 #define GGUF_DEFAULT_ALIGNMENT 32

gguf-py/gguf/gguf.py

Lines changed: 46 additions & 27 deletions
@@ -19,9 +19,10 @@
 #

 GGUF_MAGIC             = 0x46554747
-GGUF_VERSION           = 2
+GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32

+
 # general
 KEY_GENERAL_ARCHITECTURE         = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -491,6 +492,10 @@ class GGMLQuantizationType(IntEnum):
     Q6_K = 14
     Q8_K = 15

+class GGUFEndian(IntEnum):
+    LITTLE = 0
+    BIG = 1
+

 class GGUFValueType(IntEnum):
     UINT8 = 0
@@ -538,18 +543,41 @@ class GGUFWriter:
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
     tensors: list[tuple[np.ndarray[Any, Any], int]]

-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+    @property
+    def pack_prefix(self):
+        if self.endianess==GGUFEndian.LITTLE:
+            return "<"
+        else:
+            return ">"
+
+    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
         self.fout = open(path, "wb")
         self.arch = arch
+        self.endianess = endianess
+        self._simple_value_packing = {
+            GGUFValueType.UINT8:   f"{self.pack_prefix}B",
+            GGUFValueType.INT8:    f"{self.pack_prefix}b",
+            GGUFValueType.UINT16:  f"{self.pack_prefix}H",
+            GGUFValueType.INT16:   f"{self.pack_prefix}h",
+            GGUFValueType.UINT32:  f"{self.pack_prefix}I",
+            GGUFValueType.INT32:   f"{self.pack_prefix}i",
+            GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+            GGUFValueType.UINT64:  f"{self.pack_prefix}Q",
+            GGUFValueType.INT64:   f"{self.pack_prefix}q",
+            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
+            GGUFValueType.BOOL:    "?" ,
+        }
         self.add_architecture()
         self.use_temp_file = use_temp_file
         self.tensors = []
+        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+        print(f"This gguf file is for {endianess_str} only")

     def write_header_to_file(self):
         self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
         self.flush()
         # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -621,40 +649,27 @@ def add_array(self, key: str, val: Sequence[Any]):
         self.add_key(key)
         self.add_val(val, GGUFValueType.ARRAY)

-    _simple_value_packing = {
-        GGUFValueType.UINT8:   "<B",
-        GGUFValueType.INT8:    "<b",
-        GGUFValueType.UINT16:  "<H",
-        GGUFValueType.INT16:   "<h",
-        GGUFValueType.UINT32:  "<I",
-        GGUFValueType.INT32:   "<i",
-        GGUFValueType.FLOAT32: "<f",
-        GGUFValueType.UINT64:  "<Q",
-        GGUFValueType.INT64:   "<q",
-        GGUFValueType.FLOAT64: "<d",
-        GGUFValueType.BOOL:    "?" ,
-    }
     def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
         if vtype is None:
             vtype = GGUFValueType.get_type(val)

         if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
             self.kv_data_count += 1

         pack_fmt = self._simple_value_packing.get(vtype)
         if pack_fmt is not None:
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -668,22 +683,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

     def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+        if self.endianess == GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
             fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             fp.seek(0)
@@ -709,6 +726,8 @@ def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
             fp.write(bytes([0] * pad))

     def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.endianess==GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         self.write_padding(self.fout, self.fout.tell())
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
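On the data side, the writer byte-swaps each tensor in place before serializing it, so a big-endian reader recovers the original values. Note that numpy's byteswap(inplace=True) mutates the caller's array, which suits this one-shot conversion flow. A standalone illustration of what the swap does:

import numpy as np

t = np.array([1.0, 2.0], dtype=np.float32)
before = t.tobytes()
t.byteswap(inplace=True)  # reverse the byte order of every element, in place
after = t.tobytes()

# Each 4-byte element is mirrored; the dtype itself is unchanged.
assert after == b"".join(before[i:i+4][::-1] for i in range(0, len(before), 4))
assert t.dtype == np.float32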

k_quants.c

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
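immintrin.h is an x86-only header, so toolchains on IBM Z (where GCC and Clang predefine __s390__) do not ship it, and the include must be guarded just as it already was for RISC-V. A quick way to see which case applies on a given machine (illustrative check, not from the commit):

import platform
print(platform.machine())  # e.g. "s390x" on IBM Z, "x86_64" on Intel/AMD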

tests/test-double-float.cpp

Lines changed: 2 additions & 0 deletions
@@ -4,7 +4,9 @@

 #undef NDEBUG
 #include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
+#endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>

0 commit comments