Retrieve attention score for all input tokens per generated token #1141
Hey @parallaxe, the approach mentioned in that repo requires computing per-token attention from the outputs of specific transformer attention heads. This isn't currently supported by the llama.cpp API because the entire model is computed in a single forward pass and the intermediate values are discarded along the way. At best we can currently return the per-token logits, which will only tell you the relative confidence of the model in predicting that specific token given the previous sequence. Hope that helps.
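For context, a minimal sketch (not part of the original comment) of reading those per-token log-probabilities through llama-cpp-python; the model path and prompt are placeholders:

from llama_cpp import Llama

# logits_all=True is needed so logprobs can be returned for every generated token
llm = Llama(model_path="./models/model.gguf", logits_all=True)
out = llm.create_completion(
    "The capital of France is",
    max_tokens=8,
    logprobs=5,  # also return the top-5 alternatives per token
)
logprobs = out["choices"][0]["logprobs"]
print(logprobs["tokens"])          # generated tokens
print(logprobs["token_logprobs"])  # log-probability of each generated token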
Thanks for clarifying!
As I looked around in the llama.cpp project, I found this callback. A sample usage can be found here (but the
Hey @parallaxe, yes, you're correct that that should work; right now I'm not exposing the
Hope this can be a good starting point! First you just need to update the CMake build to also build and install the ggml shared library:
cmake_minimum_required(VERSION 3.21)
project(llama_cpp)
option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
if (LLAMA_BUILD)
    set(BUILD_SHARED_LIBS "On")

    # Building llama
    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
        # Need to disable these llama.cpp flags on Apple x86_64,
        # otherwise users may encounter invalid instruction errors
        set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
        set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
        set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
    endif()

    add_subdirectory(vendor/llama.cpp)
    install(
        TARGETS llama
        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
    )
    # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
    install(
        TARGETS llama
        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
    )
    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
    install(
        FILES $<TARGET_RUNTIME_DLLS:llama>
        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
    )
    install(
        FILES $<TARGET_RUNTIME_DLLS:llama>
        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
    )

    if (LLAVA_BUILD)
        # Building llava
        add_subdirectory(vendor/llama.cpp/examples/llava)
        set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
        # Set CUDA_ARCHITECTURES to OFF on windows
        if (WIN32)
            set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
        endif()
        install(
            TARGETS llava_shared
            LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
            RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
            ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
            FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
            RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        )
        # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
        install(
            TARGETS llava_shared
            LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
            RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
            ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
            FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
            RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        )
    endif()

    set_target_properties(ggml_shared PROPERTIES OUTPUT_NAME "ggml")
    install(
        TARGETS ggml_shared
        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
    )
    install(
        TARGETS ggml_shared
        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
    )
endif()

Then write the minimal bindings to convert the ggml tensors to numpy arrays:
import os
import sys
import ctypes
import pathlib
from typing import List, TypeAlias, TYPE_CHECKING
# Load the library
def _load_shared_library(lib_base_name: str):
    # Construct the paths to the possible shared library names
    _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
    # Searching for the library in the current directory under the name "libllama" (default name
    # for llamacpp) and "llama" (default name for this repo)
    _lib_paths: List[pathlib.Path] = []
    # Determine the file extension based on the platform
    if sys.platform.startswith("linux"):
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
        ]
    elif sys.platform == "darwin":
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
            _base_path / f"lib{lib_base_name}.dylib",
        ]
    elif sys.platform == "win32":
        _lib_paths += [
            _base_path / f"{lib_base_name}.dll",
            _base_path / f"lib{lib_base_name}.dll",
        ]
    else:
        raise RuntimeError("Unsupported platform")
    if "LLAVA_CPP_LIB" in os.environ:
        lib_base_name = os.environ["LLAVA_CPP_LIB"]
        _lib = pathlib.Path(lib_base_name)
        _base_path = _lib.parent.resolve()
        _lib_paths = [_lib.resolve()]
    cdll_args = dict()  # type: ignore
    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(_base_path))
        if "CUDA_PATH" in os.environ:
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
    # Try to load the shared library, handling potential errors
    for _lib_path in _lib_paths:
        if _lib_path.exists():
            try:
                return ctypes.CDLL(str(_lib_path), **cdll_args)
            except Exception as e:
                raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )
# Load the library
libname = "ggml"
lib = _load_shared_library(libname)
GGML_MAX_DIMS = 4
GGML_MAX_SRC = 10
GGML_MAX_NAME = 64
GGML_MAX_OP_PARAMS = 64
# // n-dimensional tensor
# struct ggml_tensor {
# enum ggml_type type;
# enum ggml_backend_type backend;
# struct ggml_backend_buffer * buffer;
# int64_t ne[GGML_MAX_DIMS]; // number of elements
# size_t nb[GGML_MAX_DIMS]; // stride in bytes:
# // nb[0] = ggml_type_size(type)
# // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
# // nb[i] = nb[i-1] * ne[i-1]
# // compute data
# enum ggml_op op;
# // op params - allocated as int32_t for alignment
# int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
# bool is_param;
# struct ggml_tensor * grad;
# struct ggml_tensor * src[GGML_MAX_SRC];
# // performance
# int perf_runs;
# int64_t perf_cycles;
# int64_t perf_time_us;
# struct ggml_tensor * view_src;
# size_t view_offs;
# void * data;
# char name[GGML_MAX_NAME];
# void * extra; // extra things e.g. for ggml-cuda.cu
# char padding[8];
# };
class ggml_tensor(ctypes.Structure):
    """n-dimensional tensor

    Attributes:
        type (int): ggml_type
        backend (int): ggml_backend
        buffer (ctypes.pointer[ggml_backend_buffer]): pointer to backend buffer
        ne (ctypes.Array[ctypes.c_int64]): number of elements in each dimension
        nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension
        op (int): ggml operation
        op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters
        is_param (bool): is this a parameter tensor
        grad (ggml_tensor_p): reference to gradient tensor
        src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors
        perf_runs (int): number of performance runs
        perf_cycles (int): number of cycles
        perf_time_us (int): time in microseconds
        view_src (ggml_tensor_p): pointer to tensor if this tensor is a view, None if the tensor is not a view
        view_offs (ctypes.c_size_t): offset into the data pointer of the view tensor
        data (ctypes.c_void_p): reference to raw tensor data
        name (bytes): name of tensor
        extra (ctypes.c_void_p): extra data (e.g. for CUDA)
    """

ggml_tensor._fields_ = [
    ("type", ctypes.c_int),
    ("backend", ctypes.c_int),
    ("buffer", ctypes.c_void_p),
    ("ne", ctypes.c_int64 * GGML_MAX_DIMS),
    ("nb", ctypes.c_size_t * GGML_MAX_DIMS),
    ("op", ctypes.c_int),
    (
        "op_params",
        ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)),
    ),
    ("is_param", ctypes.c_bool),
    ("grad", ctypes.POINTER(ggml_tensor)),
    ("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC),
    ("perf_runs", ctypes.c_int),
    ("perf_cycles", ctypes.c_int64),
    ("perf_time_us", ctypes.c_int64),
    ("view_src", ctypes.POINTER(ggml_tensor)),
    ("view_offs", ctypes.c_size_t),
    ("data", ctypes.c_void_p),
    ("name", ctypes.c_char * GGML_MAX_NAME),
    ("extra", ctypes.c_void_p),
    ("padding", ctypes.c_char * 8),
]
GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor)
ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" if TYPE_CHECKING else ctypes.POINTER(ggml_tensor) # type: ignore
# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(
ctypes.c_bool, ggml_tensor_p, ctypes.c_bool, ctypes.c_void_p
)
# GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
def ggml_backend_tensor_get(
    tensor: ggml_tensor_p, data: ctypes.c_void_p, offset: int, size: int
):
    return lib.ggml_backend_tensor_get(tensor, data, offset, size)

lib.ggml_backend_tensor_get.argtypes = [
    ggml_tensor_p,
    ctypes.c_void_p,
    ctypes.c_size_t,
    ctypes.c_size_t,
]
lib.ggml_backend_tensor_get.restype = None

# GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
def ggml_nbytes(tensor: ggml_tensor_p) -> int:
    return lib.ggml_nbytes(tensor)

lib.ggml_nbytes.argtypes = [ggml_tensor_p]
lib.ggml_nbytes.restype = ctypes.c_size_t

# GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
def ggml_nelements(tensor: ggml_tensor_p) -> int:
    return lib.ggml_nelements(tensor)

lib.ggml_nelements.argtypes = [ggml_tensor_p]
lib.ggml_nelements.restype = ctypes.c_int64

# GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
def ggml_n_dims(tensor: ggml_tensor_p) -> int:
    return lib.ggml_n_dims(tensor)

lib.ggml_n_dims.argtypes = [ggml_tensor_p]
lib.ggml_n_dims.restype = ctypes.c_int

def ggml_tensor_to_numpy(tensor: ggml_tensor_p):
    import numpy as np

    nbytes = ggml_nbytes(tensor)
    nelements = ggml_nelements(tensor)
    data = np.empty(nelements, dtype=np.float32)
    ggml_backend_tensor_get(tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, nbytes)
    return data.reshape(tensor.contents.ne[: ggml_n_dims(tensor)])

You would also need to filter out the exact operators in the ggml graph for the attention head outputs, likely either by name or by operator type.
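Building on the bindings above, here is a hedged sketch (not from the original comment) of such a filtering callback. The assumption that the attention tensors have "kq" in their names depends on the llama.cpp revision and may need adjusting, and the helper above assumes F32 tensor data:

collected = []

@ggml_backend_sched_eval_callback
def collect_attention(tensor, ask, user_data):
    # tensor is a ctypes pointer to ggml_tensor
    name = tensor.contents.name.decode("utf-8", errors="replace")
    if ask:
        # First phase: tell the scheduler which nodes we want to observe
        return "kq" in name
    # Second phase: the node has been computed, copy its data out
    collected.append((name, ggml_tensor_to_numpy(tensor)))
    return True  # continue graph evaluation

The callback would then still have to be passed down to llama.cpp (for example via the cb_eval field of llama_context_params), which llama-cpp-python did not expose at the time of this discussion.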
Oh wow, this looks great! Thank you! I will see how far I can get with it 😄
Hey @parallaxe,
Hi @reuank, I switched from llama-cpp-python to llama.cpp for other reasons, and started implementing an attention-score-collecting callback for the server implementation. I'm not sure how or whether this will end up in a PR, but I will push it as a fork once the implementation is good enough.
+1 would love to see this feature. |
+1 I would love to see this in the future! |
I think I figured out how to do this. I believe it is in llama-graph.cpp in the function build_attn_mha on line 1122. The callback gives me 24 x {4, 4, 16, 1} tensors.
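For illustration only (not from the thread): if those per-layer tensors are the softmaxed attention weights, one simple reduction to per-input-token scores is to average over layers and heads. The (n_layers, n_heads, n_tokens, n_kv) layout below is an assumption about how the collected data has been arranged:

import numpy as np

def per_input_token_scores(attn: np.ndarray) -> np.ndarray:
    # attn: (n_layers, n_heads, n_tokens, n_kv) softmaxed attention weights (assumed layout)
    # Returns (n_tokens, n_kv): row i is how strongly generated/query token i
    # attended to each cached input position, averaged over layers and heads.
    return attn.mean(axis=(0, 1))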
Is your feature request related to a problem? Please describe.
In RAG scenarios, it would be a great help to be able to tell whether an LLM is hallucinating or retrieving its information from the given context. Getting an attention score for all input tokens per generated token would make that possible.
Describe the solution you'd like
A callback mechanism, similar to the LogitsProcessor, that is invoked for every generated token and receives a list of attention scores. (See the sketch below.)
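Purely as an illustration of the requested interface (the names and signature here are invented; nothing like this exists in llama-cpp-python), such a hook could mirror the LogitsProcessor shape:

from typing import List, Protocol

class AttentionProcessor(Protocol):
    # Hypothetical: invoked once per generated token with one score per input token
    def __call__(self, input_token_ids: List[int], attention_scores: List[float]) -> None: ...

def trace_attention(input_token_ids: List[int], attention_scores: List[float]) -> None:
    # e.g. log the input position the model attended to most for this step
    top = max(range(len(attention_scores)), key=attention_scores.__getitem__)
    print(f"most-attended input token: {input_token_ids[top]} (score {attention_scores[top]:.3f})")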
Describe alternatives you've considered
Calculating the scores myself, but my knowledge of transformers is not sufficient.
Additional context
I would like to build something like the "Attention tracing" in this repository, but with llama.cpp as the backend.