@@ -1,10 +1,16 @@
 import ctypes
+import multiprocessing
 
 import numpy as np
-import pytest
 from scipy.special import log_softmax
 
+from huggingface_hub import hf_hub_download
+
+import pytest
+
 import llama_cpp
+import llama_cpp._internals as internals
+
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"
 
@@ -46,248 +52,118 @@ def test_llama_cpp_tokenization():
     assert text == llama.detokenize(tokens)
 
 
-@pytest.fixture
-def mock_llama(monkeypatch):
-    def setup_mock(llama: llama_cpp.Llama, output_text: str):
-        n_ctx = llama.n_ctx()
-        n_vocab = llama.n_vocab()
-        output_tokens = llama.tokenize(
-            output_text.encode("utf-8"), add_bos=True, special=True
-        )
-        logits = (ctypes.c_float * (n_vocab * n_ctx))()
-        for i in range(n_ctx):
-            output_idx = i + 1  # logits for first tokens predict second token
-            if output_idx < len(output_tokens):
-                logits[i * n_vocab + output_tokens[output_idx]] = 100.0
-            else:
-                logits[i * n_vocab + llama.token_eos()] = 100.0
-        n = 0
-        last_n_tokens = 0
-
-        def mock_decode(ctx: llama_cpp.llama_context_p, batch: llama_cpp.llama_batch):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            assert batch.n_tokens > 0, "no tokens in batch"
-            assert all(
-                batch.n_seq_id[i] == 1 for i in range(batch.n_tokens)
-            ), "n_seq >1 not supported by mock_llama"
-            assert all(
-                batch.seq_id[i][0] == 0 for i in range(batch.n_tokens)
-            ), "n_seq >1 not supported by mock_llama"
-            assert batch.logits[
-                batch.n_tokens - 1
-            ], "logits not allocated for last token"
-            # Update the mock context state
-            nonlocal n
-            nonlocal last_n_tokens
-            n = max(batch.pos[i] for i in range(batch.n_tokens)) + 1
-            last_n_tokens = batch.n_tokens
-            return 0
-
-        def mock_get_logits(ctx: llama_cpp.llama_context_p):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            assert n > 0, "mock_llama_decode not called"
-            assert last_n_tokens > 0, "mock_llama_decode not called"
-            # Return view of logits for last_n_tokens
-            return (ctypes.c_float * (last_n_tokens * n_vocab)).from_address(
-                ctypes.addressof(logits)
-                + (n - last_n_tokens) * n_vocab * ctypes.sizeof(ctypes.c_float)
-            )
-
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
-
-        def mock_kv_cache_clear(ctx: llama_cpp.llama_context_p):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            return
-
-        def mock_kv_cache_seq_rm(
-            ctx: llama_cpp.llama_context_p,
-            seq_id: llama_cpp.llama_seq_id,
-            pos0: llama_cpp.llama_pos,
-            pos1: llama_cpp.llama_pos,
-        ):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            return
-
-        def mock_kv_cache_seq_cp(
-            ctx: llama_cpp.llama_context_p,
-            seq_id_src: llama_cpp.llama_seq_id,
-            seq_id_dst: llama_cpp.llama_seq_id,
-            pos0: llama_cpp.llama_pos,
-            pos1: llama_cpp.llama_pos,
-        ):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            return
-
-        def mock_kv_cache_seq_keep(
-            ctx: llama_cpp.llama_context_p,
-            seq_id: llama_cpp.llama_seq_id,
-        ):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            return
-
-        def mock_kv_cache_seq_add(
-            ctx: llama_cpp.llama_context_p,
-            seq_id: llama_cpp.llama_seq_id,
-            pos0: llama_cpp.llama_pos,
-            pos1: llama_cpp.llama_pos,
-        ):
-            # Test some basic invariants of this mocking technique
-            assert ctx == llama._ctx.ctx, "context does not match mock_llama"
-            return
-
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_clear", mock_kv_cache_clear)
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_rm", mock_kv_cache_seq_rm)
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_cp", mock_kv_cache_seq_cp)
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_keep", mock_kv_cache_seq_keep)
-        monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_add", mock_kv_cache_seq_add)
-
-    return setup_mock
-
-
-# def test_llama_patch(mock_llama):
-#     n_ctx = 128
-#     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
-#     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
-#     assert n_vocab == 32000
-#
-#     text = "The quick brown fox"
-#     output_text = " jumps over the lazy dog."
-#     all_text = text + output_text
-#
-#     ## Test basic completion from bos until eos
-#     mock_llama(llama, all_text)
-#     completion = llama.create_completion("", max_tokens=36)
-#     assert completion["choices"][0]["text"] == all_text
-#     assert completion["choices"][0]["finish_reason"] == "stop"
-#
-#     ## Test basic completion until eos
-#     mock_llama(llama, all_text)
-#     completion = llama.create_completion(text, max_tokens=20)
-#     assert completion["choices"][0]["text"] == output_text
-#     assert completion["choices"][0]["finish_reason"] == "stop"
-#
-#     ## Test streaming completion until eos
-#     mock_llama(llama, all_text)
-#     chunks = list(llama.create_completion(text, max_tokens=20, stream=True))
-#     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
-#     assert chunks[-1]["choices"][0]["finish_reason"] == "stop"
-#
-#     ## Test basic completion until stop sequence
-#     mock_llama(llama, all_text)
-#     completion = llama.create_completion(text, max_tokens=20, stop=["lazy"])
-#     assert completion["choices"][0]["text"] == " jumps over the "
-#     assert completion["choices"][0]["finish_reason"] == "stop"
-#
-#     ## Test streaming completion until stop sequence
-#     mock_llama(llama, all_text)
-#     chunks = list(
-#         llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"])
-#     )
-#     assert (
-#         "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
-#     )
-#     assert chunks[-1]["choices"][0]["finish_reason"] == "stop"
-#
-#     ## Test basic completion until length
-#     mock_llama(llama, all_text)
-#     completion = llama.create_completion(text, max_tokens=2)
-#     assert completion["choices"][0]["text"] == " jumps"
-#     assert completion["choices"][0]["finish_reason"] == "length"
-#
-#     ## Test streaming completion until length
-#     mock_llama(llama, all_text)
-#     chunks = list(llama.create_completion(text, max_tokens=2, stream=True))
-#     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
-#     assert chunks[-1]["choices"][0]["finish_reason"] == "length"
-
-
-def test_llama_pickle():
-    import pickle
-    import tempfile
-
-    fp = tempfile.TemporaryFile()
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    pickle.dump(llama, fp)
-    fp.seek(0)
-    llama = pickle.load(fp)
-
-    assert llama
-    assert llama.ctx is not None
-
-    text = b"Hello World"
-
-    assert llama.detokenize(llama.tokenize(text)) == text
+def test_llama_cpp_version():
+    assert llama_cpp.__version__
 
 
-# def test_utf8(mock_llama):
-#     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True)
-#
-#     output_text = "😀"
-#
-#     ## Test basic completion with utf8 multibyte
-#     mock_llama(llama, output_text)
-#     completion = llama.create_completion("", max_tokens=4)
-#     assert completion["choices"][0]["text"] == output_text
-#
-#     ## Test basic completion with incomplete utf8 multibyte
-#     mock_llama(llama, output_text)
-#     completion = llama.create_completion("", max_tokens=1)
-#     assert completion["choices"][0]["text"] == ""
+@pytest.fixture
+def llama_cpp_model_path():
+    repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF"
+    filename = "qwen2-0_5b-instruct-q8_0.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
+def test_real_model(llama_cpp_model_path):
+    import os
+    assert os.path.exists(llama_cpp_model_path)
+
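+    # Load the model through the low-level internals wrappers so the C-side
+    # model/context parameter structs are exercised directly.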
+    params = llama_cpp.llama_model_default_params()
+    params.use_mmap = llama_cpp.llama_supports_mmap()
+    params.use_mlock = llama_cpp.llama_supports_mlock()
+    params.check_tensors = False
+
+    model = internals.LlamaModel(path_model=llama_cpp_model_path, params=params)
+
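+    # A deliberately tiny context (16 tokens) keeps the test cheap; the prompt
+    # plus the four sampled tokens still fit within it.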
+    cparams = llama_cpp.llama_context_default_params()
+    cparams.n_ctx = 16
+    cparams.n_batch = 16
+    cparams.n_ubatch = 16
+    cparams.n_threads = multiprocessing.cpu_count()
+    cparams.n_threads_batch = multiprocessing.cpu_count()
+    cparams.logits_all = False
+    cparams.flash_attn = True
+
+    context = internals.LlamaContext(model=model, params=cparams)
+    tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
+
+    assert tokens == [9707, 11, 1879, 0]
+
+    tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True)
+
+    batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1)
+
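+    # Sampler chain: top-k, then top-p, then temperature, then a seeded
+    # distribution sampler so the continuation is reproducible.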
+    seed = 1337
+    sampler = internals.LlamaSampler()
+    sampler.add_top_k(50)
+    sampler.add_top_p(0.9, 1)
+    sampler.add_temp(0.8)
+    sampler.add_dist(seed)
+
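+    # Manual generation loop: decode the pending tokens, advance n_past by the
+    # number evaluated, sample one token from the last position, and feed it back.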
+    result = tokens
+    n_eval = 0
+    for _ in range(4):
+        batch.set_batch(tokens, n_past=n_eval, logits_all=False)
+        context.decode(batch)
+        n_eval += len(tokens)
+        token_id = sampler.sample(context, -1)
+        tokens = [token_id]
+        result += tokens
+
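+    # Drop the prompt (which tokenizes to five tokens here) and detokenize only
+    # the four sampled tokens.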
+    output = result[5:]
+    output_text = model.detokenize(output, special=True)
+    assert output_text == b" over the lazy dog"
+
+def test_real_llama(llama_cpp_model_path):
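+    # Same checkpoint driven through the high-level Llama API, which handles
+    # tokenization, sampling, and detokenization internally.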
+    model = llama_cpp.Llama(
+        llama_cpp_model_path,
+        n_ctx=32,
+        n_batch=32,
+        n_ubatch=32,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        flash_attn=True,
+    )
 
+    output = model.create_completion(
+        "The quick brown fox jumps",
+        max_tokens=4,
+        top_k=50,
+        top_p=0.9,
+        temperature=0.8,
+        seed=1337
+    )
+    assert output["choices"][0]["text"] == " over the lazy dog"
+
+
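+    # The GBNF grammar constrains decoding so the only possible completions are
+    # the literal strings "true" and "false".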
+    output = model.create_completion(
+        "The capital of france is paris, 'true' or 'false'?:\n",
+        max_tokens=4,
+        top_k=50,
+        top_p=0.9,
+        temperature=0.8,
+        seed=1337,
+        grammar=llama_cpp.LlamaGrammar.from_string("""
+root ::= "true" | "false"
+""")
+    )
+    assert output["choices"][0]["text"] == "true"
 
-def test_llama_server():
-    from fastapi.testclient import TestClient
-    from llama_cpp.server.app import create_app, Settings
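+    # Identity logits processor: multiplying the logits by 1 leaves sampling
+    # unchanged while still exercising the logits_processor code path.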
+    def logit_processor_func(input_ids, logits):
+        return logits * 1
 
-    settings = Settings(
-        model=MODEL,
-        vocab_only=True,
+    logit_processors = llama_cpp.LogitsProcessorList(
+        [logit_processor_func]
     )
-    app = create_app(settings)
-    client = TestClient(app)
-    response = client.get("/v1/models")
-    assert response.json() == {
-        "object": "list",
-        "data": [
-            {
-                "id": MODEL,
-                "object": "model",
-                "owned_by": "me",
-                "permissions": [],
-            }
-        ],
-    }
-
-
-@pytest.mark.parametrize(
-    "size_and_axis",
-    [
-        ((32_000,), -1),  # last token's next-token logits
-        ((10, 32_000), -1),  # many tokens' next-token logits, or batch of last tokens
-        ((4, 10, 32_000), -1),  # batch of texts
-    ],
-)
-@pytest.mark.parametrize("convert_to_list", [True, False])
-def test_logits_to_logprobs(size_and_axis, convert_to_list: bool, atol: float = 1e-7):
-    size, axis = size_and_axis
-    logits: np.ndarray = -np.random.uniform(low=0, high=60, size=size)
-    logits = logits.astype(np.single)
-    if convert_to_list:
-        # Currently, logits are converted from arrays to lists. This may change soon
-        logits = logits.tolist()
-    log_probs = llama_cpp.Llama.logits_to_logprobs(logits, axis=axis)
-    log_probs_correct = log_softmax(logits, axis=axis)
-    assert log_probs.dtype == np.single
-    assert log_probs.shape == size
-    assert np.allclose(log_probs, log_probs_correct, atol=atol)
-
 
-def test_llama_cpp_version():
-    assert llama_cpp.__version__
+    output = model.create_completion(
+        "The capital of france is par",
+        max_tokens=4,
+        top_k=50,
+        top_p=0.9,
+        temperature=0.8,
+        seed=1337,
+        logits_processor=logit_processors
+    )
+    assert output["choices"][0]["text"].lower().startswith("is")