Various script cleanups/fixes + convert merges and special token handling by KerfuffleV2 · Pull Request #2842 · ggml-org/llama.cpp

Various script cleanups/fixes + convert merges and special token handling #2842


Merged · 17 commits · Aug 30, 2023

Changes from 1 commit
Uniform args parsing and vocab only mode for convert examples
KerfuffleV2 committed Aug 29, 2023
commit 58fa4dc87012a142be9e57090fa556a72da436ed
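
This commit replaces the ad hoc sys.argv handling in the convert examples with a shared argparse interface. For orientation, a condensed sketch of the parser added in the diff below, with a sample invocation (the model path is a placeholder, not from the PR):

import argparse
from pathlib import Path

parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], default=1,
                    help="output format - use 0 for float32, 1 for float16")

# e.g. extract only the vocab from a local model directory, f16 output:
args = parser.parse_args(["--vocab-only", "models/gpt-neox-20b", "1"])
assert args.vocab_only and args.ftype == 1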
5 changes: 2 additions & 3 deletions convert-falcon-hf-to-gguf.py
@@ -33,11 +33,10 @@ def bytes_to_unicode():
bs.append(b)
cs.append(2**8+n)
n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
+    return dict(zip(bs, (chr(n) for n in cs)))


-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
117 changes: 63 additions & 54 deletions convert-gptneox-hf-to-gguf.py
@@ -8,6 +8,7 @@
import json
import numpy as np
import torch
+import argparse

from typing import Any, List
from pathlib import Path
@@ -37,7 +38,7 @@ def bytes_to_unicode():
return dict(zip(bs, (chr(n) for n in cs)))


-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
@@ -48,16 +49,21 @@ def count_model_parts(dir_model: str) -> int:
return num_parts


-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print(" ftype == 0 -> float32")
-    print(" ftype == 1 -> float16")
-    sys.exit(1)
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+    return parser.parse_args()

+args = parse_args()

-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
@@ -66,19 +72,15 @@ def count_model_parts(dir_model: str) -> int:
# map from ftype to string
ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-
-        sys.exit(1)
-
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+last_dir)
print("gguf: loading model "+dir_model.name)

-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)

if hparams["architectures"][0] != "GPTNeoXForCausalLM":
@@ -96,7 +98,7 @@ def count_model_parts(dir_model: str) -> int:

block_count = hparams["num_hidden_layers"]

-gguf_writer.add_name(last_dir)
+gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
@@ -112,45 +114,49 @@ def count_model_parts(dir_model: str) -> int:

tokens: List[bytearray] = []
Collaborator Author:

These changes shut up the type warnings, but I'm not 100% sure they're correct. The alternative would be to leave it List[str] and then convert the token text to a string. I assume it's already UTF-8 bytes, so there probably isn't a functional difference.
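
(For reference, the List[str] alternative mentioned above might look like this hypothetical helper, which recovers the raw bytes and then decodes them so the element type stays str; a sketch, not code from the PR:)

from typing import Dict

def token_as_str(encoded: str, byte_decoder: Dict[str, int]) -> str:
    # Map each GPT-2 byte-encoded character back to its raw byte,
    # then decode the result as UTF-8 text.
    raw = bytes(byte_decoder[c] for c in encoded)
    return raw.decode('utf-8', errors='replace')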

Collaborator:

The types are correct this way, because when we get to tokens.append(text), 'text' is explicitly a bytearray.

Collaborator Author:

> The types are correct this way, because when we get to tokens.append(text), 'text' is explicitly a bytearray.

Right. I meant my change fixes the type warning, but the part I wasn't sure about was whether text actually is supposed to be a bytearray there, and what is supposed to get submitted to gguf to write out the tokens. The type annotation for the add_token_list method is also just List and doesn't specify what the element is supposed to be.

Contributor:

The decision was made to accept several types as input to the token list, depending on the type of the tokenizer output (spm vs bpe):
https://github.com/ggerganov/llama.cpp/blob/dd0dc366dab10e8df28d3924e7f313b5c695e908/gguf-py/gguf/gguf.py#L373-L375

Collaborator Author (@KerfuffleV2, Aug 28, 2023):

> The decision was made to accept several types as input to the token list

So we'd want

    def add_token_list(self, tokens: Union[List[str], List[bytes], List[bytearray]]):

Correct? Or would you want to allow the items to be non-homogeneous, like List[Union[str, bytes, bytearray]]?

Contributor:

What are the differences between the Python types str, bytes, and bytearray? If they all resolve to the same thing when written to disk, then any of them could be used.

Contributor:

Make it as simple as it can be, as long as there is no difference in how the tokens are written to disk.

Collaborator:

str is unicode text which will be encoded as utf-8 before being written to disk. bytes and bytearray are binary data. Those two are subclasses of ByteString, but we can't use that because it also allows memoryview. The least repetitive way to write this would be to use a TypeVar:

StrOrBytes = TypeVar('StrOrBytes', str, bytes, bytearray)
# ...
    def add_token_list(self, tokens: Sequence[StrOrBytes]):
        # ...
    def add_token_merges(self, merges: Sequence[StrOrBytes]):
        # ...
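
(To illustrate the difference this choice makes, here is a hypothetical type-checking example, not part of the PR: a constrained TypeVar forces each call site to commit to one element type, while List[Union[str, bytes, bytearray]] would also admit mixed lists:)

from typing import Sequence, TypeVar

StrOrBytes = TypeVar('StrOrBytes', str, bytes, bytearray)

def add_token_list(tokens: Sequence[StrOrBytes]) -> None:
    ...  # placeholder body; the real method serializes the tokens

add_token_list(["hello", "world"])   # OK: Sequence[str]
add_token_list([b"he", b"wo"])       # OK: Sequence[bytes]
add_token_list(["hello", b"world"])  # mypy error: no single type
                                     # satisfies the constraint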

Collaborator Author:

Should merges actually support all of those?


-if Path(dir_model + "/tokenizer.json").is_file():
-    # gpt2 tokenizer
-    gguf_writer.add_tokenizer_model("gpt2")
+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)

+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")

-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer_json = json.load(f)
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)

print("gguf: get gpt2 tokenizer vocab")
print("gguf: get gpt2 tokenizer vocab")

-    vocab_size = len(tokenizer_json["model"]["vocab"])
+vocab_size = len(tokenizer_json["model"]["vocab"])

-    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)

-    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-    byte_encoder = bytes_to_unicode()
-    byte_decoder = {v: k for k, v in byte_encoder.items()}
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}

-    for i in range(vocab_size):
-        if i in reverse_vocab:
-            try:
-                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-            except KeyError:
-                text = bytearray()
-                for c in reverse_vocab[i]:
-                    if ord(c) < 256: # single byte character
-                        text.append(byte_decoder[ord(c)])
-                    else: # multibyte special token character
-                        text.extend(c.encode('utf-8'))
-        else:
-            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-            pad_token = f"[PAD{i}]".encode("utf8")
-            text = bytearray(pad_token)
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256: # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else: # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)

-        tokens.append(text)
+    tokens.append(text)

-    gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_list(tokens)

-special_vocab = gguf.SpecialVocab(Path(dir_model), load_merges = True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
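
(Aside on the vocab loop above: the fallback amounts to a two-step decoding strategy. A minimal self-contained sketch of that intent, assuming a GPT-2 style byte_decoder whose keys are the table characters and whose values are byte values; the sketch keys by character rather than by ord(c):)

def decode_token(encoded: str, byte_decoder: dict) -> bytearray:
    # Fast path: every character is in the byte-to-unicode table
    # and maps back to exactly one raw byte.
    try:
        return bytearray(byte_decoder[c] for c in encoded)
    except KeyError:
        pass
    # Fallback for special tokens: keep table characters as single
    # bytes and pass any other character through as UTF-8.
    text = bytearray()
    for c in encoded:
        if c in byte_decoder:
            text.append(byte_decoder[c])
        else:
            text.extend(c.encode('utf-8'))
    return text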

# TENSORS
@@ -168,6 +174,8 @@ def count_model_parts(dir_model: str) -> int:
)

for part_name in part_names:
+    if args.vocab_only:
+        break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

@@ -216,10 +224,11 @@ def count_model_parts(dir_model: str) -> int:
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()

gguf_writer.close()

print("gguf: model successfully exported to '" + fname_out + "'")
print(f"gguf: model successfully exported to '{fname_out}'")
print("")