8000 Streaming conversion with no torch by diimdeep · Pull Request #176 · ggml-org/llama.cpp · GitHub
[go: up one dir, main page]

Skip to content

Streaming conversion with no torch #176

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
streaming conversion without pytorch
  • Loading branch information
diimdeep committed Mar 15, 2023
commit 289637a6a32db14bb7c39b08892aa9ff854eac8b
92 changes: 79 additions & 13 deletions convert-pth-to-ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@
# and vocabulary.
#

from collections import defaultdict
import sys
import json
import struct
import numpy as np
import torch
from tqdm import tqdm
import zipfile
import pickle
import concurrent.futures

from sentencepiece import SentencePieceProcessor

if len(sys.argv) < 3:
Expand Down Expand Up @@ -73,19 +78,22 @@ def get_n_parts(dim):

n_parts = get_n_parts(hparams["dim"])

print(hparams)
print('n_parts = ', n_parts)
print(f'Model params.json: {hparams}')
print(f'Parts to process: {n_parts}')


for p in range(n_parts):
print('Processing part ', p)
def get_fname(p):
    """Return the relative filename of checkpoint part *p*.

    LLaMA checkpoints are split into consolidated.00.pth,
    consolidated.01.pth, ... with the part index always zero-padded to
    two digits.  Formatting with ``{p:02d}`` keeps the original output
    for p < 10 and also produces correct names (e.g. consolidated.10.pth)
    for part indices >= 10, where the old literal ``"0" + str(p)``
    concatenation would have emitted "consolidated.010.pth".
    """
    return f"/consolidated.{p:02d}.pth"

#fname_model = sys.argv[1] + "/consolidated.00.pth"
fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
def process_part(p):
fname = get_fname(p)
fname_model = sys.argv[1] + fname
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
if (p > 0):
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)

model = torch.load(fname_model, map_location="cpu")
print(f"Processing part {fname}")

fout = open(fname_out, "wb")

Expand Down Expand Up @@ -123,19 +131,70 @@ def get_n_parts(dim):
fout.write(struct.pack("i", len(text)))
fout.write(text)

for k, v in model.items():

def load_model(fname):
    """Load a PyTorch checkpoint zip without importing torch.

    A ``.pth`` file is a zip archive containing ``<base>/data.pkl`` (the
    pickled state dict) plus one raw-bytes entry per tensor under
    ``<base>/data/<key>``.  This re-implements just enough of torch's
    unpickling to map each state-dict entry to a lazy ``Tensor`` proxy;
    no tensor data is read until ``.numpy()`` is called, which keeps
    peak memory low (streaming conversion).
    """
    class Tensor():
        # Lazy stand-in for torch.Tensor: remembers where the raw data
        # lives inside the zip and materializes it on demand.
        def __init__(self, shape, dtype, loadinfo):
            self.shape = shape
            self.dtype = dtype
            self.loadinfo = loadinfo
            # print(shape, dtype)

        def numpy(self):
            """Read this tensor's raw bytes from the zip into a numpy array."""
            fname_model, base_name, storage_offset, k, shape, dtype = self.loadinfo
            with zipfile.ZipFile(fname_model, 'r') as myzip:
                with myzip.open(f'{base_name}/data/{k}') as myfile:
                    # storage_offset is in elements; convert to bytes and
                    # skip forward from the current position (whence=1).
                    bytes_size = np.dtype(self.dtype).itemsize
                    myfile.seek(storage_offset * bytes_size, 1)
                    ret = np.empty(shape, dtype=dtype)
                    # NOTE(review): readinto on a zip member may return a
                    # short read; presumably entries here are stored
                    # uncompressed/small enough that it fills fully — verify.
                    myfile.readinto(ret.data)
                    return ret

    def my_unpickle(datapkl, fname_model, base_name):
        # Substitute for torch._utils._rebuild_tensor_v2: instead of
        # building a real tensor, capture the location info needed to
        # load it later.  stride/requires_grad/backward_hooks are
        # accepted for signature compatibility but ignored (assumes
        # checkpoint tensors are contiguous — TODO confirm).
        def my_rebuild_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
            # `storage` is the persistent-load pid tuple returned below;
            # index 1 is the storage dtype class, index 2 the data key.
            storage_type = storage[1]
            obj_key = storage[2]
            return Tensor(shape=size, dtype=storage_type, loadinfo=(
                fname_model, base_name, storage_offset,
                obj_key, size, storage_type
            ))

        class MyUnpickler(pickle.Unpickler):
            # Whitelist-and-remap the only globals a LLaMA checkpoint
            # pickle references; anything else is rejected outright.
            def find_class(self, *p):
                if p == ('torch', 'HalfStorage'): return np.float16
                if p == ('torch', 'FloatStorage'): return np.float32
                if p == ('torch._utils', '_rebuild_tensor_v2'): return my_rebuild_tensor
                if p == ('collections', 'OrderedDict'): return dict
                raise ValueError(f'Unrecognized pickle {p}')

            # torch storages are saved via persistent IDs; return the pid
            # tuple untouched so my_rebuild_tensor can inspect it.
            def persistent_load(self, pid):
                return pid

        return MyUnpickler(datapkl).load()

    with zipfile.ZipFile(fname, 'r') as myzip:
        # All archive members share one top-level directory name.
        base_name = myzip.namelist()[0].split('/', 1)[0]
        # print(myzip.namelist())
        with myzip.open(f'{base_name}/data.pkl') as myfile:
            model = my_unpickle(myfile, fname, base_name)
    return model

model = load_model(fname_model)

for k, v in (t := tqdm(model.items())):
t.set_description(f"Processing {k} with shape {tuple(v.shape)} and type {np.dtype(v.dtype)}")
name = k
shape = v.shape

# skip layers.X.attention.inner_attention.rope.freqs
if name[-5:] == "freqs":
continue

print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
# print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

#data = tf.train.load_variable(dir_model, name).squeeze()
data = v.numpy().squeeze()
n_dims = len(data.shape);
n_dims = len(data.shape)

# for efficiency - transpose some matrices
# "model/h.*/attn/c_attn/w"
Expand All @@ -154,7 +213,7 @@ def get_n_parts(dim):
# default type is fp16
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
# print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0

Expand All @@ -163,7 +222,7 @@ def get_n_parts(dim):
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
fout.write(sname);
fout.write(sname)

# data
data.tofile(fout)
Expand All @@ -175,3 +234,10 @@ def get_n_parts(dim):

print("Done. Output file: " + fname_out + ", (part ", p, ")")
print("")

# Convert all checkpoint parts concurrently.  Two workers overlap one
# part's zip reading with another's output writing; more would raise
# peak memory without helping an I/O-bound job.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(process_part, p) for p in range(n_parts)}
    for future in concurrent.futures.as_completed(futures):
        # result() re-raises a worker exception with its traceback;
        # the original called f.exception() twice to the same effect.
        future.result()

print("All done.")
0