From 289637a6a32db14bb7c39b08892aa9ff854eac8b Mon Sep 17 00:00:00 2001
From: Dmitry Wolf
Date: Wed, 15 Mar 2023 21:25:01 +0300
Subject: [PATCH 1/5] streaming conversion without pytorch

---
 convert-pth-to-ggml.py | 92 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index d2557500af094..75e182cc1d34d 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -17,11 +17,16 @@
 # and vocabulary.
 #
 
+from collections import defaultdict
 import sys
 import json
 import struct
 import numpy as np
-import torch
+from tqdm import tqdm
+import zipfile
+import pickle
+import concurrent.futures
+
 from sentencepiece import SentencePieceProcessor
 
 if len(sys.argv) < 3:
@@ -73,19 +78,22 @@ def get_n_parts(dim):
 
 n_parts = get_n_parts(hparams["dim"])
 
-print(hparams)
-print('n_parts = ', n_parts)
+print(f'Model params.json: {hparams}')
+print(f'Parts to process: {n_parts}')
+
 
-for p in range(n_parts):
-    print('Processing part ', p)
+def get_fname(p):
+    fname = "/consolidated.0" + str(p) + ".pth"
+    return fname
 
-    #fname_model = sys.argv[1] + "/consolidated.00.pth"
-    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
+def process_part(p):
+    fname = get_fname(p)
+    fname_model = sys.argv[1] + fname
     fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
     if (p > 0):
         fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
 
-    model = torch.load(fname_model, map_location="cpu")
+    print(f"Processing part {fname}")
 
     fout = open(fname_out, "wb")
 
@@ -123,7 +131,58 @@ def get_n_parts(dim):
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
 
-    for k, v in model.items():
+
+    def load_model(fname):
+        class Tensor():
+            def __init__(self, shape, dtype, loadinfo):
+                self.shape = shape
+                self.dtype = dtype
+                self.loadinfo = loadinfo
+                # print(shape, dtype)
+
+            def numpy(self):
+                fname_model, base_name, storage_offset, k, shape, dtype = self.loadinfo
+                with zipfile.ZipFile(fname_model, 'r') as myzip:
+                    with myzip.open(f'{base_name}/data/{k}') as myfile:
+                        bytes_size = np.dtype(self.dtype).itemsize
+                        myfile.seek(storage_offset * bytes_size, 1)
+                        ret = np.empty(shape, dtype=dtype)
+                        myfile.readinto(ret.data)
+                        return ret
+
+        def my_unpickle(datapkl, fname_model, base_name):
+            def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
+                storage_type = storage[1]
+                obj_key = storage[2]
+                return Tensor(shape=size, dtype=storage_type, loadinfo=(
+                    fname_model, base_name, storage_offset,
+                    obj_key, size, storage_type
+                ))
+
+            class MyUnpickler(pickle.Unpickler):
+                def find_class(self, *p):
+                    if p == ('torch', 'HalfStorage'): return np.float16
+                    if p == ('torch', 'FloatStorage'): return np.float32
+                    if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor
+                    if p == ('collections', 'OrderedDict'): return dict
+                    raise ValueError(f'Unrecognized pickle {p}')
+
+                def persistent_load(self, pid):
+                    return pid
+
+            return MyUnpickler(datapkl).load()
+
+        with zipfile.ZipFile(fname, 'r') as myzip:
+            base_name = myzip.namelist()[0].split('/', 1)[0]
+            # print(myzip.namelist())
+            with myzip.open(f'{base_name}/data.pkl') as myfile:
+                model = my_unpickle(myfile, fname, base_name)
+        return model
+
+    model = load_model(fname_model)
+
+    for k, v in (t := tqdm(model.items())):
+        t.set_description(f"Processing {k} with shape {tuple(v.shape)} and type {np.dtype(v.dtype)}")
         name = k
         shape = v.shape
 
@@ -131,11 +190,11 @@ def get_n_parts(dim):
         if name[-5:] == "freqs":
             continue
 
-        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
+        # print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
 
         #data = tf.train.load_variable(dir_model, name).squeeze()
         data = v.numpy().squeeze()
-        n_dims = len(data.shape);
+        n_dims = len(data.shape)
 
         # for efficiency - transpose some matrices
         # "model/h.*/attn/c_attn/w"
@@ -154,7 +213,7 @@ def get_n_parts(dim):
         # default type is fp16
         ftype_cur = 1
         if ftype == 0 or n_dims == 1:
-            print("  Converting to float32")
+            # print("  Converting to float32")
             data = data.astype(np.float32)
             ftype_cur = 0
 
@@ -163,7 +222,7 @@ def get_n_parts(dim):
         fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
         for i in range(n_dims):
             fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-        fout.write(sname)
+        fout.write(sname)
 
         # data
         data.tofile(fout)
@@ -175,3 +234,10 @@ def get_n_parts(dim):
 
     print("Done. Output file: " + fname_out + ", (part ", p, ")")
     print("")
+
+with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+    futures = {executor.submit(process_part, p) for p in range(n_parts)}
+    for f in (concurrent.futures.as_completed(futures)):
+        if f.exception() is not None: raise f.exception()
+
+print("All done.")
\ No newline at end of file
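
A note on the technique in patch 1: a PyTorch .pth checkpoint is an ordinary zip archive holding a pickled metadata object (<base>/data.pkl) plus one raw binary storage file per tensor (<base>/data/<key>), which is why the tensors can be decoded with numpy alone. Below is a minimal standalone sketch of the same trick that only lists the tensors in a checkpoint. The checkpoint path is a hypothetical example, the restricted unpickler mirrors the one the patch adds, and it assumes (as the patch does) that every checkpoint entry is a tensor and that sizes are pickled as plain tuples, which holds for the LLaMA consolidated files.

import pickle
import zipfile

import numpy as np

STORAGE_DTYPES = {'HalfStorage': np.float16, 'FloatStorage': np.float32}

def describe_checkpoint(path):
    with zipfile.ZipFile(path) as z:
        base = z.namelist()[0].split('/', 1)[0]

        class StubUnpickler(pickle.Unpickler):
            def find_class(self, module, name):
                if module == 'torch' and name in STORAGE_DTYPES:
                    return STORAGE_DTYPES[name]
                if (module, name) == ('torch._utils', '_rebuild_tensor_v2'):
                    # 'storage' arrives as the persistent-load tuple
                    # ('storage', dtype, key, location, numel); keep only
                    # what is needed to find the raw bytes inside the zip
                    return lambda storage, offset, size, stride, *rest: (
                        storage[2], np.dtype(storage[1]), tuple(size))
                if (module, name) == ('collections', 'OrderedDict'):
                    return dict
                raise ValueError(f'refusing to unpickle {module}.{name}')

            def persistent_load(self, pid):
                return pid  # hand the raw tuple to the rebuild hook above

        with z.open(f'{base}/data.pkl') as f:
            tensors = StubUnpickler(f).load()

    for name, (key, dtype, shape) in tensors.items():
        print(f'{name}: data/{key} dtype={dtype} shape={shape}')

describe_checkpoint('models/7B/consolidated.00.pth')  # hypothetical path
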
From bb997650be33b49519a23927d5e354f7defb4943 Mon Sep 17 00:00:00 2001
From: Dmitry Wolf
Date: Wed, 15 Mar 2023 22:38:28 +0300
Subject: [PATCH 2/5] separate threads for r/w ops

---
 convert-pth-to-ggml.py | 62 ++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 75e182cc1d34d..0556149582885 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,6 +1,5 @@
 # Convert a LLaMA model checkpoint to a ggml compatible file
 #
-# Load the model using Torch
 # Iterate over all variables and write them to a binary file.
 #
 # For each variable, write the following:
@@ -26,6 +25,9 @@
 import zipfile
 import pickle
 import concurrent.futures
+import io
+import threading
+import queue
 
 from sentencepiece import SentencePieceProcessor
 
@@ -138,24 +140,22 @@ def __init__(self, shape, dtype, loadinfo):
                 self.shape = shape
                 self.dtype = dtype
                 self.loadinfo = loadinfo
-                # print(shape, dtype)
 
             def numpy(self):
-                fname_model, base_name, storage_offset, k, shape, dtype = self.loadinfo
-                with zipfile.ZipFile(fname_model, 'r') as myzip:
-                    with myzip.open(f'{base_name}/data/{k}') as myfile:
-                        bytes_size = np.dtype(self.dtype).itemsize
-                        myfile.seek(storage_offset * bytes_size, 1)
-                        ret = np.empty(shape, dtype=dtype)
-                        myfile.readinto(ret.data)
-                        return ret
-
-        def my_unpickle(datapkl, fname_model, base_name):
+                myzip, base_name, storage_offset, k, shape, dtype = self.loadinfo
+                with myzip.open(f'{base_name}/data/{k}') as myfile:
+                    bytes_size = np.dtype(self.dtype).itemsize
+                    myfile.seek(storage_offset * bytes_size, 1)
+                    ret = np.empty(shape, dtype=dtype)
+                    myfile.readinto(ret.data)
+                    return ret
+
+        def my_unpickle(datapkl, myzip, base_name):
             def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
                 storage_type = storage[1]
                 obj_key = storage[2]
                 return Tensor(shape=size, dtype=storage_type, loadinfo=(
-                    fname_model, base_name, storage_offset,
+                    myzip, base_name, storage_offset,
                     obj_key, size, storage_type
                 ))
 
@@ -172,15 +172,24 @@ def persistent_load(self, pid):
 
             return MyUnpickler(datapkl).load()
 
-        with zipfile.ZipFile(fname, 'r') as myzip:
-            base_name = myzip.namelist()[0].split('/', 1)[0]
-            # print(myzip.namelist())
-            with myzip.open(f'{base_name}/data.pkl') as myfile:
-                model = my_unpickle(myfile, fname, base_name)
+        myzip = zipfile.ZipFile(fname, 'r')
+        base_name = myzip.namelist()[0].split('/', 1)[0]
+        with myzip.open(f'{base_name}/data.pkl') as myfile:
+            model = my_unpickle(myfile, myzip, base_name)
         return model
 
     model = load_model(fname_model)
 
+    q = queue.Queue(maxsize=2)
+
+    def writer():
+        while True:
+            item = q.get()
+            fout.write(item.getvalue())
+            q.task_done()
+
+    threading.Thread(target=writer, daemon=True).start()
+
     for k, v in (t := tqdm(model.items())):
         t.set_description(f"Processing {k} with shape {tuple(v.shape)} and type {np.dtype(v.dtype)}")
         name = k
@@ -190,8 +199,6 @@ def persistent_load(self, pid):
         if name[-5:] == "freqs":
             continue
 
-        # print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
 
-        #data = tf.train.load_variable(dir_model, name).squeeze()
         data = v.numpy().squeeze()
         n_dims = len(data.shape)
 
@@ -217,23 +224,26 @@ def persistent_load(self, pid):
             data = data.astype(np.float32)
             ftype_cur = 0
 
+        memout = io.BytesIO()
         # header
         sname = name.encode('utf-8')
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        memout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
         for i in range(n_dims):
-            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-        fout.write(sname)
+            memout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+        memout.write(sname)
 
         # data
-        data.tofile(fout)
+        memout.write(data.tobytes())
+        # data.tofile(memout)
+        q.put(memout)
+
+    q.join()
 
-    # I hope this deallocates the memory ..
     model = None
     fout.close()
 
     print("Done. Output file: " + fname_out + ", (part ", p, ")")
-    print("")
 
 with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
     futures = {executor.submit(process_part, p) for p in range(n_parts)}
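
The structure patch 2 introduces: the main thread does the slow zip reads and serializes each tensor into an in-memory buffer, while a dedicated writer thread drains a bounded queue into the output file, so read and write I/O overlap. queue.Queue(maxsize=2) provides backpressure — q.put() blocks once two buffers are pending, capping memory use. A self-contained sketch of that producer/consumer pattern, with an io.BytesIO standing in for the output file:

import io
import queue
import threading

q = queue.Queue(maxsize=2)   # small bound => producer blocks when writer lags
out = io.BytesIO()           # stands in for the real output file

def writer():
    while True:
        chunk = q.get()      # blocks until the producer enqueues a buffer
        out.write(chunk)
        q.task_done()

threading.Thread(target=writer, daemon=True).start()

for i in range(10):                  # producer: "read and convert" work
    q.put(f'chunk {i}\n'.encode())   # blocks while two chunks are in flight

q.join()                     # wait until every enqueued buffer is written
print(out.getvalue().decode())
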
From 0deb075a3f8bf5445f4148a2d75c8a179d42da5b Mon Sep 17 00:00:00 2001
From: Dmitry Wolf
Date: Wed, 15 Mar 2023 22:57:59 +0300
Subject: [PATCH 3/5] clean

---
 convert-pth-to-ggml.py | 90 +++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 46 deletions(-)

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 0556149582885..e239b85496fe4 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -84,6 +84,50 @@ def get_n_parts(dim):
 print(f'Parts to process: {n_parts}')
 
 
+def load_model(fname):
+    class Tensor():
+        def __init__(self, shape, dtype, loadinfo):
+            self.shape = shape
+            self.dtype = dtype
+            self.loadinfo = loadinfo
+
+        def numpy(self):
+            myzip, base_name, storage_offset, k, shape, dtype = self.loadinfo
+            with myzip.open(f'{base_name}/data/{k}') as myfile:
+                bytes_size = np.dtype(self.dtype).itemsize
+                myfile.seek(storage_offset * bytes_size, 1)
+                ret = np.empty(shape, dtype=dtype)
+                myfile.readinto(ret.data)
+                return ret
+
+    def my_unpickle(datapkl, myzip, base_name):
+        def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
+            storage_type = storage[1]
+            obj_key = storage[2]
+            return Tensor(shape=size, dtype=storage_type, loadinfo=(
+                myzip, base_name, storage_offset,
+                obj_key, size, storage_type
+            ))
+
+        class MyUnpickler(pickle.Unpickler):
+            def find_class(self, *p):
+                if p == ('torch', 'HalfStorage'): return np.float16
+                if p == ('torch', 'FloatStorage'): return np.float32
+                if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor
+                if p == ('collections', 'OrderedDict'): return dict
+                raise ValueError(f'Unrecognized pickle {p}')
+
+            def persistent_load(self, pid):
+                return pid
+
+        return MyUnpickler(datapkl).load()
+
+    myzip = zipfile.ZipFile(fname, 'r')
+    base_name = myzip.namelist()[0].split('/', 1)[0]
+    with myzip.open(f'{base_name}/data.pkl') as myfile:
+        model = my_unpickle(myfile, myzip, base_name)
+    return model
+
 def get_fname(p):
     fname = "/consolidated.0" + str(p) + ".pth"
     return fname
@@ -133,51 +177,6 @@ def process_part(p):
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
 
-
-    def load_model(fname):
-        class Tensor():
-            def __init__(self, shape, dtype, loadinfo):
-                self.shape = shape
-                self.dtype = dtype
-                self.loadinfo = loadinfo
-
-            def numpy(self):
-                myzip, base_name, storage_offset, k, shape, dtype = self.loadinfo
-                with myzip.open(f'{base_name}/data/{k}') as myfile:
-                    bytes_size = np.dtype(self.dtype).itemsize
-                    myfile.seek(storage_offset * bytes_size, 1)
-                    ret = np.empty(shape, dtype=dtype)
-                    myfile.readinto(ret.data)
-                    return ret
-
-        def my_unpickle(datapkl, myzip, base_name):
-            def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
-                storage_type = storage[1]
-                obj_key = storage[2]
-                return Tensor(shape=size, dtype=storage_type, loadinfo=(
-                    myzip, base_name, storage_offset,
-                    obj_key, size, storage_type
-                ))
-
-            class MyUnpickler(pickle.Unpickler):
-                def find_class(self, *p):
-                    if p == ('torch', 'HalfStorage'): return np.float16
-                    if p == ('torch', 'FloatStorage'): return np.float32
-                    if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor
-                    if p == ('collections', 'OrderedDict'): return dict
-                    raise ValueError(f'Unrecognized pickle {p}')
-
-                def persistent_load(self, pid):
-                    return pid
-
-            return MyUnpickler(datapkl).load()
-
-        myzip = zipfile.ZipFile(fname, 'r')
-        base_name = myzip.namelist()[0].split('/', 1)[0]
-        with myzip.open(f'{base_name}/data.pkl') as myfile:
-            model = my_unpickle(myfile, myzip, base_name)
-        return model
-
     model = load_model(fname_model)
 
     q = queue.Queue(maxsize=2)
@@ -234,7 +233,6 @@ def writer():
 
         # data
         memout.write(data.tobytes())
-        # data.tofile(memout)
         q.put(memout)
 
     q.join()

Output file: " + fname_out + ", (part ", p, ")") - print("") with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: futures = {executor.submit(process_part, p) for p in range(n_parts)} From 0deb075a3f8bf5445f4148a2d75c8a179d42da5b Mon Sep 17 00:00:00 2001 From: Dmitry Wolf Date: Wed, 15 Mar 2023 22:57:59 +0300 Subject: [PATCH 3/5] clean --- convert-pth-to-ggml.py | 90 +++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index 0556149582885..e239b85496fe4 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -84,6 +84,50 @@ def get_n_parts(dim): print(f'Parts to process: {n_parts}') +def load_model(fname): + class Tensor(): + def __init__(self, shape, dtype, loadinfo): + self.shape = shape + self.dtype = dtype + self.loadinfo = loadinfo + + def numpy(self): + myzip, base_name, storage_offset, k, shape, dtype = self.loadinfo + with myzip.open(f'{base_name}/data/{k}') as myfile: + bytes_size = np.dtype(self.dtype).itemsize + myfile.seek(storage_offset * bytes_size, 1) + ret = np.empty(shape, dtype=dtype) + myfile.readinto(ret.data) + return ret + + def my_unpickle(datapkl, myzip, base_name): + def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None): + storage_type = storage[1] + obj_key = storage[2] + return Tensor(shape=size, dtype=storage_type, loadinfo=( + myzip, base_name, storage_offset, + obj_key, size, storage_type + )) + + class MyUnpickler(pickle.Unpickler): + def find_class(self, *p): + if p == ('torch', 'HalfStorage'): return np.float16 + if p == ('torch', 'FloatStorage'): return np.float32 + if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor + if p == ('collections', 'OrderedDict'): return dict + raise ValueError(f'Unrecognized pickle {p}') + + def persistent_load(self, pid): + return pid + + return MyUnpickler(datapkl).load() + + myzip = zipfile.ZipFile(fname, 'r') + base_name = myzip.namelist()[0].split('/', 1)[0] + with myzip.open(f'{base_name}/data.pkl') as myfile: + model = my_unpickle(myfile, myzip, base_name) + return model + def get_fname(p): fname = "/consolidated.0" + str(p) + ".pth" return fname @@ -133,51 +177,6 @@ def process_part(p): fout.write(struct.pack("i", len(text))) fout.write(text) - - def load_model(fname): - class Tensor(): - def __init__(self, shape, dtype, loadinfo): - self.shape = shape - self.dtype = dtype - self.loadinfo = loadinfo - - def numpy(self): - myzip, base_name, storage_offset, k, shape, dtype = self.loadinfo - with myzip.open(f'{base_name}/data/{k}') as myfile: - bytes_size = np.dtype(self.dtype).itemsize - myfile.seek(storage_offset * bytes_size, 1) - ret = np.empty(shape, dtype=dtype) - myfile.readinto(ret.data) - return ret - - def my_unpickle(datapkl, myzip, base_name): - def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None): - storage_type = storage[1] - obj_key = storage[2] - return Tensor(shape=size, dtype=storage_type, loadinfo=( - myzip, base_name, storage_offset, - obj_key, size, storage_type - )) - - class MyUnpickler(pickle.Unpickler): - def find_class(self, *p): - if p == ('torch', 'HalfStorage'): return np.float16 - if p == ('torch', 'FloatStorage'): return np.float32 - if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor - if p == ('collections', 'OrderedDict'): return dict - raise ValueError(f'Unrecognized pickle {p}') - - def persistent_load(self, pid): 
From 5f0ada21bf8ac8a69231f28cc40b1aa59bfcaeb4 Mon Sep 17 00:00:00 2001
From: Dmitry Wolf
Date: Thu, 16 Mar 2023 00:30:07 +0300
Subject: [PATCH 5/5] fix

---
 convert-pth-to-ggml.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index db30a7e1ffca3..bef3618cb7281 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -184,12 +184,15 @@ def process_part(p):
     def writer():
         while True:
             item = q.get()
+            if item is None:
+                q.task_done()
+                break
             fout.write(item.getvalue())
             q.task_done()
 
     threading.Thread(target=writer, daemon=True).start()
 
-    for k, v in (t := tqdm(model.items())):
+    for k, v in (t := tqdm(model.items(), bar_format="{r_bar} {percentage:3.0f}% |{bar:50} | {desc}")):
         t.set_description(f"Processing {k} with shape {tuple(v.shape)} and type {np.dtype(v.dtype)}")
         name = k
         shape = v.shape
@@ -235,6 +238,7 @@ def writer():
         memout.write(data.tobytes())
         q.put(memout)
 
+    q.put(None)
     q.join()
 
     model = None
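
Patch 5 closes the loop on writer-thread shutdown: without a sentinel, the daemon writer from patch 2 blocks on q.get() forever after its part finishes. Enqueueing None after the last buffer lets the thread acknowledge the sentinel and exit cleanly before the output file is closed. A self-contained sketch of the sentinel pattern, with integers standing in for the serialized buffers:

import queue
import threading

q = queue.Queue(maxsize=2)
written = []

def writer():
    while True:
        item = q.get()
        if item is None:      # sentinel: producer is finished
            q.task_done()     # account for it so q.join() can return
            break
        written.append(item)  # stands in for fout.write(...)
        q.task_done()

t = threading.Thread(target=writer)
t.start()

for i in range(5):
    q.put(i)
q.put(None)   # signal shutdown after the last real item

q.join()      # every item, including the sentinel, has been processed
t.join()      # the writer thread has actually exited
print(written)  # [0, 1, 2, 3, 4]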