diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 7ce77041e..5d5a91efe 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -72,13 +72,13 @@ jobs: $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu124-linux-basic.yml b/.github/workflows/build-wheels-cu124-linux-basic.yml new file mode 100644 index 000000000..98f50fe47 --- /dev/null +++ b/.github/workflows/build-wheels-cu124-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU124) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.4.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} 
-DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 124) from the full version (e.g., 12.4.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 1dbe7fa5c..3feeeecfd 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu126-linux-basic.yml b/.github/workflows/build-wheels-cu126-linux-basic.yml new file mode 100644 index 000000000..78d1471c7 --- /dev/null +++ b/.github/workflows/build-wheels-cu126-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU126) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.6.3"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # 
from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 126) from the full version (e.g., 12.6.3) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index bff7a778a..f9b566fab 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-linux-basic.yml b/.github/workflows/build-wheels-cu128-linux-basic.yml new file mode 100644 index 000000000..8c527c718 --- /dev/null +++ b/.github/workflows/build-wheels-cu128-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU128) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.8.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # 
from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 128) from the full version (e.g., 12.8.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index e3fc186df..b25128d6e 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential ccache cmake curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-win-basic.yml b/.github/workflows/build-wheels-cu128-win-basic.yml new file mode 100644 index 000000000..2d78084db --- /dev/null +++ b/.github/workflows/build-wheels-cu128-win-basic.yml @@ -0,0 +1,107 @@ +name: Build Wheels (CU128) for Windows(Basic) + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["12.8.1"] + releasetag: ["Basic"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. 
"all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.28 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index adf78e1e7..40578c8b4 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -75,10 +75,10 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu130-linux-basic.yml b/.github/workflows/build-wheels-cu130-linux-basic.yml new file mode 100644 index 
000000000..0f03787a6 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU130) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full 
version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml new file mode 100644 index 000000000..6451c7ee4 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -0,0 +1,128 @@ +name: Build Wheels(CU130) for Linux + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} 
-DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + if [ "${AVXVER}" = "AVX" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + if [ "${AVXVER}" = "AVX2" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" + fi + if [ "${AVXVER}" = "AVXVNNI" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" + fi + # if [ "${AVXVER}" = "AVX512" ]; then + # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" + # fi + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-win-basic.yml b/.github/workflows/build-wheels-cu130-win-basic.yml new file mode 100644 index 000000000..17b0fb6c7 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win-basic.yml @@ -0,0 +1,107 @@ +name: Build Wheels (CU130) for Windows(Basic) + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["Basic"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. "all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER 
}}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 000000000..3c7d07caa --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,119 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["AVX2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. "all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + if ($env:AVXVER -eq 'AVX2') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' + } + if ($env:AVXVER -eq 'AVXVNNI') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + } + # if ($env:AVXVER -eq 'AVX512') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' + # } + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" 
>> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml deleted file mode 100644 index 4620cdd76..000000000 --- a/.github/workflows/build-wheels-cuda.yaml +++ /dev/null @@ -1,136 +0,0 @@ -name: Build Wheels (CUDA) - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - define_matrix: - name: Define Build Matrix - runs-on: ubuntu-22.04 - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - defaults: - run: - shell: pwsh - - steps: - - name: Define Job Output - id: set-matrix - run: | - $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022') - 'pyver' = @("3.9", "3.10", "3.11", "3.12") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") - 'releasetag' = @("basic") - } - - $matrixOut = ConvertTo-Json $matrix -Compress - Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT - - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - needs: define_matrix - runs-on: ${{ matrix.os }} - strategy: - matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - vs-version: '[16.11,16.12)' - - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.pyver }} - cache: 'pip' - - - name: Setup Mamba - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - activate-environment: "llamacpp" - python-version: ${{ matrix.pyver }} - miniforge-version: latest - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: VS Integration Cache - id: vs-integration-cache - if: runner.os == 'Windows' - uses: actions/cache@v4 - with: - path: ./MSBuildExtensions - key: cuda-${{ matrix.cuda }}-vs-integration - - - name: Get Visual Studio Integration - if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' - run: | - if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} - $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) - for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} - Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' - & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null - Remove-Item 'cudainstaller.zip' - - - name: Install Visual Studio Integration - if: runner.os == 'Windows' - run: | - $y = (gi '.\MSBuildExtensions').fullname + '\*' - (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) - $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') - 
echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV - - - name: Install Dependencies - env: - MAMBA_DOWNLOAD_FAILFAST: "0" - MAMBA_NO_LOW_SPEED_LIMIT: "1" - run: | - $cudaVersion = $env:CUDAVER - mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion - python -m pip install build wheel - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX - if ($IsLinux) { - $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH - } - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" - # if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # if ($env:AVXVER -eq 'basic') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - python -m build --wheel - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu - tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 76d899335..2ad6f1679 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787](https://github.com/ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787) +- feat: Sync llama.cpp llama/mtmd API Binding 20251215 +- feat: **implement `GLM46VChatHandler` for GLM-4.6V Series Model** +- feat: **implement `LFM2VLChatHandler` for LFM2-VL series models** +- feat: **implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Model** +- workflow: Added workflows for compiling with CUDA 13.0.2 on Windows and Linux. +- feat: Added the scan path for CUDA 13.0+ dynamic link libraries under Windows system ($env:CUDA_PATH\bin\x64) +- Optimization: Improved batch token processing logic in Llava15ChatHandler. +- [perf: optimize LlamaModel.metadata reading performance](https://github.com/JamePeng/llama-cpp-python/commit/8213c19b0e164780ffffa3e64b5fc033cdbe4974) + - Increase initial buffer size to 16KB to eliminate re-allocations for large chat templates. + - Cache ctypes function references to reduce loop overhead. + - Repeated model loading can result in a cumulative speed improvement of 1-3%. 
+- build: Improve CMakeLists target logic +- refactor: optimize LlamaGrammar class code + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/67421d546ddcaa07678ac7921a9f124e7e3de10e...d5131e2ff41e05f83fd847052b06938c7a551a6a + +## [0.3.17] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35](https://github.com/ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35) +- feat: Sync llama.cpp llama/mtmd API Binding 20251121 +- feat: **Support clip flash-attn** +- feat: **0day support Qwen3VLChatHandler into llama_chat_format.py** +- Update README.md for Qwen3VL example(Thinking/No Thinking) +- feat: **Better Qwen3VL chat template. (by @alcoftTAO)** +- feat: [Implement LlamaTrieCache into llama_cache.py](https://github.com/JamePeng/llama-cpp-python/commit/2419dc2d9bb0a6be0cd381038ce00fcaea124c76): Optimize LlamaCache lookup from **O(N)** to **O(K)** using a Trie, **improves retrieval speed at least 40x compared to the original linear scan method of finding the longest prefix , thereby enhancing service responsiveness.** +- feat: Update Llava15ChatHandler to accept use_gpu, image_min_tokens, and image_max_tokens.Now can pass the`image_min_tokens`parameter in Qwen3VLChatHandler to support bbox grounding tasks. +- feat: [Add Pillow process code in _load_image for VLM](https://github.com/JamePeng/llama-cpp-python/commit/3b0133365e25840c023aef6b6c8578073cd081e8): that can reliably handle common formats, Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background(white for dark content, black for bright content) +- feat: Optimize CUDA Wheel Build Workflow, now workflow action support python3.10-3.13 cu124-cu126-cu128 Basic(Non AVX)-AVX2 win-linux + + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/e5392b52036bd2770ece5269352f5600a8db5639...fbb0ed2f089c663a5eb75aadcad08f768041ed72 + ## [0.3.16] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/5e6229a8409ac786e62cb133d09f1679a9aec13e](https://github.com/ggml-org/llama.cpp/commit/5e6229a8409ac786e62cb133d09f1679a9aec13e) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6429fac8..ca53b2ff6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,43 +5,49 @@ project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) option(MTMD_BUILD "Build mtmd shared library and install alongside python package" ON) +# Helper function to install targets to Python package directories function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - install( - TARGETS ${target} - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - TARGETS ${target} - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE + # Define install destinations 
to avoid code duplication + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" ) + + foreach(DIR ${INSTALL_DIRS}) + install( + TARGETS ${target} + LIBRARY DESTINATION ${DIR} + RUNTIME DESTINATION ${DIR} + ARCHIVE DESTINATION ${DIR} + FRAMEWORK DESTINATION ${DIR} + RESOURCE DESTINATION ${DIR} + ) + + # Automatically handle Windows DLL installation for each target + if (WIN32) + install( + FILES $<TARGET_RUNTIME_DLLS:${target}> + DESTINATION ${DIR} + OPTIONAL # Prevent errors if the target has no DLLs + ) + endif() + endforeach() + + # Configure RPATH if(UNIX) + set(INSTALL_RPATH_VAL "$ORIGIN") if(APPLE) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "@loader_path" - BUILD_WITH_INSTALL_RPATH TRUE - ) - else() - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE - ) + set(INSTALL_RPATH_VAL "@loader_path") endif() + + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "${INSTALL_RPATH_VAL}" + BUILD_WITH_INSTALL_RPATH TRUE + ) endif() endfunction() @@ -65,21 +71,17 @@ if (LLAMA_BUILD) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + # Enable build and link OpenSSL + set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + # Architecture detection and settings for Apple platforms if (APPLE) - # Get the target architecture - execute_process( - COMMAND uname -m - OUTPUT_VARIABLE HOST_ARCH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture if(NOT CMAKE_OSX_ARCHITECTURES) - set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE) + set(CMAKE_OSX_ARCHITECTURES ${CMAKE_HOST_SYSTEM_PROCESSOR} CACHE STRING "Build architecture for macOS" FORCE) endif() - message(STATUS "Host architecture: ${HOST_ARCH}") + message(STATUS "Host architecture: ${CMAKE_HOST_SYSTEM_PROCESSOR}") message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}") # Configure based on target architecture @@ -104,45 +106,32 @@ if (LLAMA_BUILD) endif() endif() - llama_cpp_python_install_target(llama) - llama_cpp_python_install_target(ggml) - - llama_cpp_python_install_target(ggml-base) - - llama_cpp_python_install_target(ggml-blas) - llama_cpp_python_install_target(ggml-cann) - llama_cpp_python_install_target(ggml-cpu) - llama_cpp_python_install_target(ggml-cuda) - llama_cpp_python_install_target(ggml-hexagon) - llama_cpp_python_install_target(ggml-hip) - llama_cpp_python_install_target(ggml-metal) - llama_cpp_python_install_target(ggml-musa) - llama_cpp_python_install_target(ggml-opencl) - llama_cpp_python_install_target(ggml-rpc) - llama_cpp_python_install_target(ggml-sycl) - llama_cpp_python_install_target(ggml-vulkan) - llama_cpp_python_install_target(ggml-webgpu) - llama_cpp_python_install_target(ggml-zdnn) - - # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 - if (WIN32) - install( - FILES $<TARGET_RUNTIME_DLLS:llama> - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:llama> - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:ggml> - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:ggml> - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - endif() + # Define list of GGML targets to install + set(GGML_TARGETS + llama + ggml + ggml-base + ggml-blas + ggml-cann + ggml-cpu + ggml-cuda + ggml-hexagon + ggml-hip + ggml-metal + ggml-musa + ggml-opencl + ggml-rpc + ggml-sycl + ggml-vulkan + ggml-webgpu + 
ggml-zdnn + ggml-zendnn + ) + + # Loop through targets to avoid repetitive function calls + foreach(TARGET_NAME ${GGML_TARGETS}) + llama_cpp_python_install_target(${TARGET_NAME}) + endforeach() if (MTMD_BUILD) if (NOT DEFINED LLAMA_BUILD_NUMBER) @@ -167,10 +156,5 @@ if (LLAMA_BUILD) endif() llama_cpp_python_install_target(mtmd) - - if (WIN32) - install(FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib) - install(FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib) - endif() endif() endif() diff --git a/README.md b/README.md index 44354f25b..6c18a4a24 100644 --- a/README.md +++ b/README.md @@ -125,8 +125,10 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: -- CUDA Version is 12.4, 12.6 or 12.8 +- CUDA Version is 12.4, 12.6, 12.8 or 13.0 - Python Version is 3.10, 3.11, 3.12 or 3.13 +- Basic version: A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). +- AVX2 version: A version compiled using AVX2 instructions. Check the releases page: https://github.com/JamePeng/llama-cpp-python/releases @@ -170,7 +172,11 @@ CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python
 Vulkan -To install with Vulkan support, set the `GGML_VULKAN=on` environment variable before installing: +- For Windows users: Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. + +- For Linux users: Follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. + +Then install with Vulkan support by setting the `GGML_VULKAN=on` environment variable before installing: ```bash CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python @@ -490,6 +496,9 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | +| [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | +| [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | +| [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | @@ -585,8 +594,7 @@ messages = [
-
-Loading a Local Image With Qwen3VL(Thinking/Instruct) +## Loading a Local Image With Qwen3VL(Thinking/Instruct) This script demonstrates how to load a local image, encode it as a base64 Data URI, and pass it to a local Qwen3-VL model (with the 'force_reasoning' parameter enabled for thinking model, disabled for instruct model) for processing using the llama-cpp-python library. @@ -607,47 +615,92 @@ MMPROJ_PATH = r"./mmproj-Qwen3-VL-8b-Thinking-F16.gguf" llm = Llama( model_path=MODEL_PATH, # Set up the chat handler for Qwen3-VL, specifying the projector path - chat_handler=Qwen3VLChatHandler(clip_model_path=MMPROJ_PATH, force_reasoning=True), + chat_handler=Qwen3VLChatHandler( + clip_model_path=MMPROJ_PATH, + force_reasoning=True, + image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks + ), n_gpu_layers=-1, # Offload all layers to the GPU n_ctx=10240, # Set the context window size swa_full=True, ) -# --- Helper Function to Convert Image to Base64 Data URI --- -def image_to_base64_data_uri(file_path): +# Comprehensive MIME type mapping (updated as of 2025) +# Reference: IANA official media types + common real-world usage +_IMAGE_MIME_TYPES = { + # Most common formats + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp', + '.svg': 'image/svg+xml', + '.svgz': 'image/svg+xml', + + # Next-generation formats + '.avif': 'image/avif', + '.heic': 'image/heic', + '.heif': 'image/heif', + '.heics': 'image/heic-sequence', + '.heifs': 'image/heif-sequence', + + # Legacy / Windows formats + '.bmp': 'image/bmp', + '.dib': 'image/bmp', + '.ico': 'image/x-icon', + '.cur': 'image/x-icon', + + # Professional imaging + '.tif': 'image/tiff', + '.tiff': 'image/tiff', +} + +def image_to_base64_data_uri( + file_path: str, + *, + fallback_mime: str = "application/octet-stream" +) -> str: """ - Reads an image file, determines its MIME type, and converts it - to a base64 encoded Data URI. + Convert a local image file to a base64-encoded data URI with the correct MIME type. + + Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). + + Args: + file_path: Path to the image file on disk. + fallback_mime: MIME type used when the file extension is unknown. + + Returns: + A valid data URI string (e.g., data:image/webp;base64,...). + + Raises: + FileNotFoundError: If the file does not exist. + OSError: If reading the file fails. """ - # Get the file extension to determine MIME type + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Image file not found: {file_path}") + extension = os.path.splitext(file_path)[1].lower() + mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime) - # Determine the MIME type based on the file extension - if extension == '.png': - mime_type = 'image/png' - elif extension in ('.jpg', '.jpeg'): - mime_type = 'image/jpeg' - elif extension == '.gif': - mime_type = 'image/gif' - elif extension == '.svg': - mime_type = 'image/svg+xml' - else: - # Use a generic stream type for unsupported formats - mime_type = 'application/octet-stream' - print(f"Warning: Unsupported image type for file: {file_path}. 
Using a generic MIME type.") - - # Read the image file in binary mode - with open(file_path, "rb") as img_file: - # Encode the binary data to base64 and decode to UTF-8 - base64_data = base64.b64encode(img_file.read()).decode('utf-8') - # Format as a Data URI string - return f"data:{mime_type};base64,{base64_data}" + if mime_type == fallback_mime: + print(f"Warning: Unknown extension '{extension}' for '{file_path}'. " + f"Using fallback MIME type: {fallback_mime}") + + try: + with open(file_path, "rb") as img_file: + encoded_data = base64.b64encode(img_file.read()).decode("utf-8") + except OSError as e: + raise OSError(f"Failed to read image file '{file_path}': {e}") from e + + return f"data:{mime_type};base64,{encoded_data}" # --- Main Logic for Image Processing --- # 1. Create a list containing all image paths image_paths = [ r'./scene.jpeg', + r'./cat.png', + r'./network.webp', # Add more image paths here if needed ] @@ -666,7 +719,7 @@ images_messages.append({"type": "text", "text": "Describes the images."}) # 5. Use this list to build the chat_completion request res = llm.create_chat_completion( messages=[ - {"role": "system", "content": "You are a AI assistant who perfectly describes images."}, + {"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."}, # The user's content is the list containing both images and text {"role": "user", "content": images_messages} ] @@ -677,7 +730,6 @@ print(res["choices"][0]["message"]["content"]) ``` -
### Speculative Decoding diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..bdaefb9e0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.18" diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index e88ed387d..0ba7f416d 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -53,11 +53,26 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(base_path)) if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cuda_path = os.environ["CUDA_PATH"] + sub_dirs_to_add = [ + "bin", + os.path.join("bin", "x64"), # CUDA 13.0+ + "lib", + os.path.join("lib", "x64") + ] + for sub_dir in sub_dirs_to_add: + full_path = os.path.join(cuda_path, sub_dir) + if os.path.exists(full_path): + os.add_dll_directory(full_path) + if "HIP_PATH" in os.environ: os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) + + if "VULKAN_SDK" in os.environ: + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Bin")) + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL # Try to load the shared library, handling potential errors diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 91eb801f6..0c95a6132 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -225,32 +225,38 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: # Extra def metadata(self) -> Dict[str, str]: metadata: Dict[str, str] = {} - buffer_size = 1024 + # Pre-allocate a 16KB buffer. This is large enough to handle almost all + # metadata values (including gpt-oss large chat templates ~15KB) in a single pass, + # eliminating the need for resize-and-retry in most cases. + buffer_size = 16384 buffer = ctypes.create_string_buffer(buffer_size) - # zero the buffer - buffer.value = b"\0" * buffer_size + + # Caching function references reduces the overhead of property lookups within loops. + get_key_by_index = llama_cpp.llama_model_meta_key_by_index + get_val_by_index = llama_cpp.llama_model_meta_val_str_by_index + metadata_count = llama_cpp.llama_model_meta_count(self.model) # iterate over model keys - for i in range(llama_cpp.llama_model_meta_count(self.model)): - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + for i in range(metadata_count): + # 1. Get Key + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the key exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) key = buffer.value.decode("utf-8") - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + + # 2. 
Get Value + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the value exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) value = buffer.value.decode("utf-8") + metadata[key] = value return metadata @@ -301,8 +307,8 @@ def __del__(self): def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) - def n_batch(self) -> int: - return llama_cpp.llama_n_batch(self.ctx) + def n_ctx_seq(self) -> int: + return llama_cpp.llama_n_ctx_seq(self.ctx) def n_batch(self) -> int: return llama_cpp.llama_n_batch(self.ctx) @@ -436,12 +442,19 @@ def encode(self, batch: LlamaBatch): raise RuntimeError(f"llama_encode returned {return_code}") def decode(self, batch: LlamaBatch): - return_code = llama_cpp.llama_decode( - self.ctx, - batch.batch, - ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") + return_code = llama_cpp.llama_decode(self.ctx, batch.batch) + + if return_code == 0: + return + + error_map = { + 1: "No KV slot available: try reducing batch size or increasing context window", + 2: "Decoding aborted", + -1: "Invalid input batch", + } + + msg = error_map.get(return_code, "Fatal internal error") + raise RuntimeError(f"llama_decode failed (code {return_code}): {msg}") def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) @@ -649,7 +662,7 @@ def n_tokens(self) -> int: def reset(self): self.batch.n_tokens = 0 - def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool): n_tokens = len(batch) self.batch.n_tokens = n_tokens for i in range(n_tokens): @@ -1003,27 +1016,6 @@ def convert_list_str_to_char_array_ptr(self, str_list: List[str]): # Return the char** pointer and the number of strings return char_array_ptr, num_byte_list - def add_grammar_lazy( - self, - model: LlamaModel, - grammar: LlamaGrammar, - trigger_tokens:list[llama_cpp.llama_token], - num_trigger_tokens: int, - trigger_words: list[str]=[] - ): - trigger_words_char_array_ptr, num_trigger_words = self.convert_list_str_to_char_array_ptr(trigger_words) - - sampler = llama_cpp.llama_sampler_init_grammar_lazy( - model.vocab, - grammar._grammar.encode("utf-8"), - grammar._root.encode("utf-8"), - trigger_words_char_array_ptr, - num_trigger_words, - trigger_tokens, - num_trigger_tokens - ) - self._add_sampler(sampler) - def add_grammar_lazy_patterns( self, model: LlamaModel, diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2bb1c8769..c2628555b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -273,17 +273,17 @@ def __init__( if isinstance(v, bool): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_BOOL.value self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_INT.value self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT + ].tag = 
llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_FLOAT.value self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") @@ -292,7 +292,7 @@ def __init__( v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_STR.value # copy min(v_bytes, 128) to str_value address = typing.cast( int, @@ -1031,6 +1031,11 @@ def create_embedding( Returns: An embedding object. """ + warnings.warn( + "The `create_embedding` method in `Llama` class is deprecated. " + "Please migrate to `LlamaEmbedding.create_embedding` for better efficiency.", + DeprecationWarning, + ) model_name: str = model if model is not None else self.model_path input = input if isinstance(input, list) else [input] @@ -1075,6 +1080,12 @@ def embed( Returns: A list of embeddings """ + warnings.warn( + "The `embed` method in `Llama` class is deprecated and will be removed in future versions. " + "Please use the `LlamaEmbedding` class from `llama_embedding` module for optimized performance and reranking support.", + DeprecationWarning, + ) + n_embd = self.n_embd() n_batch = self.n_batch diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index efb17bdd8..166459edb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2792,10 +2792,13 @@ class Llava15ChatHandler: "{% endif %}" ) - def __init__(self, clip_model_path: str, verbose: bool = True): + def __init__(self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1): import llama_cpp.mtmd_cpp as mtmd_cpp self.clip_model_path = clip_model_path + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.use_gpu = use_gpu self.verbose = verbose self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() @@ -2805,6 +2808,12 @@ def __init__(self, clip_model_path: str, verbose: bool = True): if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") + # Pre-compile Jinja template + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True, + ).from_string(self.CHAT_FORMAT) + def _init_mtmd_context(self, llama_model: llama.Llama): """Initialize mtmd context with the llama model.""" if self.mtmd_ctx is not None: @@ -2815,10 +2824,17 @@ def _init_mtmd_context(self, llama_model: llama.Llama): # Get default parameters mctx_params = self._mtmd_cpp.mtmd_context_params_default() - mctx_params.use_gpu = True # TODO: Make this configurable + mctx_params.use_gpu = self.use_gpu mctx_params.print_timings = self.verbose mctx_params.n_threads = llama_model.n_threads mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + mctx_params.warmup = True + if self.image_min_tokens > 0: + mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"image_max_pixels {self.image_max_tokens} is less than image_min_pixels {self.image_min_tokens}") # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( @@ -2922,16 +2938,12 @@ def __call__( ] + messages image_urls = self.get_image_urls(messages) - template = ImmutableSandboxedEnvironment( - trim_blocks=True, - 
lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) # Get the default media marker media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') # Replace image URLs with media markers in the template - text = template.render( + text = self.chat_template.render( messages=messages, tools=tools, add_generation_prompt=True, @@ -2987,22 +2999,19 @@ def __call__( llama._ctx.memory_clear(True) # Process each chunk - n_past = llama_cpp.llama_pos(0) + n_past = 0 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: - continue + if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: # Handle text chunk n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( - chunk, ctypes.byref(n_tokens_out) - ) + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) if tokens_ptr and n_tokens_out.value > 0: # Convert ctypes array to Python list @@ -3012,15 +3021,17 @@ def __call__( raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" ) + llama.n_tokens = n_past llama.eval(tokens) + n_past = llama.n_tokens elif chunk_type in [self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO]: # Handle image/audio chunk using helper chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + if n_past + chunk_n_tokens > llama.n_ctx(): raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + f"Prompt exceeds n_ctx: {n_past + chunk_n_tokens} > {llama.n_ctx()}" ) new_n_past = llama_cpp.llama_pos(0) @@ -3028,7 +3039,7 @@ def __call__( self.mtmd_ctx, llama._ctx.ctx, chunk, - llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_pos(n_past), llama_cpp.llama_seq_id(0), llama.n_batch, False, # logits_last @@ -3039,8 +3050,15 @@ def __call__( raise ValueError(f"Failed to evaluate chunk: error code {result}") # Update llama's token count - llama.n_tokens = new_n_past.value - + n_past = new_n_past.value + llama.n_tokens = n_past + + n_past = llama.n_tokens + if n_past > 0: + llama._ctx.memory_seq_rm(0, n_past - 1, -1) + if llama._ctx.memory_seq_pos_min(0) == llama._ctx.memory_seq_pos_max(0): + n_past += 1 + llama.n_tokens = n_past # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -3144,16 +3162,83 @@ def __call__( @staticmethod def _load_image(image_url: str) -> bytes: - # TODO: Add Pillow support for other image formats beyond (jpg, png) - if image_url.startswith("data:"): + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + image_bytes = b"" + + # 1. 
Handle data URI (base64) + if image_url.strip().startswith("data:"): import base64 - image_bytes = base64.b64decode(image_url.split(",")[1]) - return image_bytes + # Split only once from the right to correctly handle mime types containing commas + comma_pos = image_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + base64_data = image_url[comma_pos + 1 :] + image_bytes = base64.b64decode(base64_data) + + # 2. Handle local/remote URL else: import urllib.request - with urllib.request.urlopen(image_url) as f: - image_bytes = f.read() - return image_bytes + from urllib.error import URLError, HTTPError + + headers = {"User-Agent": "Mozilla/5.0"} + req = urllib.request.Request(image_url, headers=headers) + + try: + with urllib.request.urlopen(req, timeout=15) as f: + image_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download image from {image_url}: {e}") + + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) + if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): + # Use alpha channel as mask + if image.mode == "P": + image = image.convert("RGBA") + + alpha = image.split()[-1] # Last channel is alpha + # Compute average brightness of visible (non-transparent) pixels + stat = ImageStat.Stat(image.convert("L"), mask=alpha) + + # Choose background: white for dark content, black for bright content + bg_color = (255, 255, 255) # white + if stat.count[0] > 0 and stat.mean[0] > 127: + bg_color = (0, 0, 0) # black + + background = Image.new("RGB", image.size, bg_color) + background.paste(image, mask=alpha) + image = background + + # 5. Ensure RGB mode for formats like CMYK, palette, etc. + elif image.mode != "RGB": + image = image.convert("RGB") + + # 6. Save as high-quality JPEG, suitable for most vision models. + output = io.BytesIO() + image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) + return output.getvalue() @staticmethod def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @@ -3638,6 +3723,249 @@ class Gemma3ChatHandler(Llava15ChatHandler): ) +class GLM41VChatHandler(Llava15ChatHandler): + # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. 
+ + GLM41V_EOS_TOKEN = "<|endoftext|>" + GLM41V_PAD_TOKEN = "<|endoftext|>" + GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]\n" + "{%- for msg in messages -%}" + "{%- if msg.role == 'system' -%}" + "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'user' -%}" + "<|user|>\n" + "{%- if msg.content is string -%}" + "{{ msg.content }}" + "{%- else -%}" + "{%- for item in msg.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'assistant' -%}" + "{%- if msg.metadata -%}" + "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- else -%}" + "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{%- endif -%}" + ) + + def __call__(self, **kwargs): + self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM4VChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM4VChatHandler - Cleared state", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + +class GLM46VChatHandler(Llava15ChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" 
+ # If enable_thinking is disabled, insert `/nothink` according to the source code logic. + "{{ '/nothink' if not enable_thinking else '' }}" + "{%- elif m.role == 'assistant' -%}" + "<|assistant|>" + "{%- if enable_thinking -%}" + "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" + "\n{{ reasoning.strip() }}" + "{%- else -%}" + "\n" + "{%- endif -%}" + "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" + "{%- endif -%}" + "{{ GLM46V_EOS_TOKEN }}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{{ '' if enable_thinking else '\n' }}" + "{%- endif -%}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + GLM-4.6V Handler + Parameters: + - enable_thinking (bool): Whether to enable the model's think process. The default is True. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN + + # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json + kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + +class LFM2VLChatHandler(Llava15ChatHandler): + LFM2VL_BOS_TOKEN = "<|startoftext|>" + LFM2VL_EOS_TOKEN = "<|im_end|>" + LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{ '<|im_start|>' + message['role'] + '\n' }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] }}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if 'image_url' in content -%}" + "{%- if content.image_url is string -%}" + "<|image_start|>{{ content.image_url }}<|image_end|>" + "{%- else -%}" + "<|image_start|>{{ content.image_url.url }}<|image_end|>" + "{%- endif -%}" + "{%- elif content['type'] == 'text' -%}" + "{{ content['text'] }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{ '<|im_end|>\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ '<|im_start|>assistant\n' }}" + "{%- endif -%}" + ) + + def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): + """ + LFM2-VL Handler + LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 + """ + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) + + def __call__(self, **kwargs): + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + 
+ if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"LFM2VLChatHandler - Cleared state, Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"LFM2VLChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." @@ -3718,7 +4046,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{{- '\n' -}}" "{{- tool | tojson -}}" "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\n\n\n{\"name\": , \"arguments\": }\n' -}}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" "{%- endif -%}" "{{- '<|im_end|>\n' -}}" "{%- set image_count = namespace(value=0) -%}" @@ -3780,7 +4108,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{%- endif -%}" "{%- endfor -%}" "{%- if add_generation_prompt -%}" - "{{- 'assistant\n' -}}" + "{{- '<|im_start|>assistant\n' -}}" "{%- if force_reasoning -%}" "{{- '\n' -}}" "{%- endif -%}" @@ -3791,6 +4119,7 @@ def __init__( self, force_reasoning: bool = False, add_vision_id: bool = True, + image_min_tokens: int = -1, **kwargs, ): """ @@ -3801,11 +4130,15 @@ def __init__( - add_vision_id (bool): - True (default): Count all the images. Recommended for multi-image. - False: Doesn't count the images. Can save tokens with single-image. + - image_min_tokens (int): + It only takes effect when the value is greater than zero. the default value is -1 (i.e., using the default parameters in the model's preprocessor_config.json). + Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks """ self.force_reasoning = force_reasoning self.add_vision_id = add_vision_id + self.image_min_tokens = image_min_tokens - super().__init__(**kwargs) + super().__init__(image_min_tokens=self.image_min_tokens, **kwargs) def __call__(self, **kwargs): self.extra_template_arguments["force_reasoning"] = self.force_reasoning diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d3890c0c1..aa4e40e05 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -550,7 +550,7 @@ class llama_batch(ctypes.Structure): The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens Attributes: - n_tokens (int): number of tokens + n_tokens (ctypes.c_int32): number of tokens token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.ctypes.c_float]): token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence @@ -559,7 +559,7 @@ class llama_batch(ctypes.Structure): """ if TYPE_CHECKING: - n_tokens: int + n_tokens: ctypes.c_int32 token: CtypesArray[llama_token] embd: CtypesArray[ctypes.c_float] pos: CtypesArray[CtypesArray[llama_pos]] @@ -584,10 +584,40 @@ class llama_batch(ctypes.Structure): # LLAMA_KV_OVERRIDE_TYPE_BOOL, # LLAMA_KV_OVERRIDE_TYPE_STR, # }; -LLAMA_KV_OVERRIDE_TYPE_INT = 0 -LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 -LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 -LLAMA_KV_OVERRIDE_TYPE_STR = 3 +class LlamaModelKVOverrideType(enum.IntEnum): + LLAMA_KV_OVERRIDE_TYPE_INT = 0 + LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 + LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 + LLAMA_KV_OVERRIDE_TYPE_STR = 3 + + +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_K, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_P, +# LLAMA_MODEL_META_KEY_SAMPLING_MIN_P, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD, +# LLAMA_MODEL_META_KEY_SAMPLING_TEMP, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA, +# }; +class LlamaModelMetaKey(enum.IntEnum): + LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 + LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 + LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { @@ -682,6 +712,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) # bool no_host; // bypass host buffer allowing extra buffers to be used +# bool no_alloc; // only load metadata and simulate memory allocations # }; class llama_model_params(ctypes.Structure): """Parameters for llama_model @@ -701,7 +732,8 @@ class llama_model_params(ctypes.Structure): use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data use_extra_bufts (bool): use extra buffer types (used for weight repacking) - no_host (bool): bypass host buffer allowing extra buffers to be used""" + no_host (bool): bypass host buffer allowing extra buffers to be used + no_alloc (bool): only load metadata and simulate memory allocations""" if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused @@ -719,6 +751,7 @@ class llama_model_params(ctypes.Structure): check_tensors: bool use_extra_bufts: bool no_host: bool + no_alloc: bool _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused @@ -736,8 +769,10 @@ class llama_model_params(ctypes.Structure): ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] +llama_model_params_p = ctypes.POINTER(llama_model_params) # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect 
results in certain configurations # // https://github.com/ggml-org/llama.cpp/pull/7544 @@ -888,6 +923,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ] +llama_context_params_p = ctypes.POINTER(llama_context_params) # // Signature for logging events # // Note that text includes the new line character at the end for most events. @@ -1276,6 +1312,51 @@ def llama_free(ctx: llama_context_p, /): ... +# // fits mparams and cparams to free device memory (assumes system memory is unlimited) +# // returns true if the parameters could be successfully modified to fit device memory +# // this function is NOT thread safe because it modifies the global llama logger state +# LLAMA_API bool llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements +# struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements +# size_t margin, // margin of memory to leave per device in bytes +# uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use +# enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + llama_model_params_p, + llama_context_params_p, + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(llama_model_tensor_buft_override), + ctypes.c_size_t, + ctypes.c_uint32, + ctypes.c_int, + ], + ctypes.c_bool, +) +def llama_params_fit( + path_model: ctypes.c_char_p, + mparams: llama_model_params_p, + cparams: llama_context_params_p, + tensor_split: ctypes.pointer(ctypes.c_float), + tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override), + margin: ctypes.c_size_t, + n_ctx_min: ctypes.c_uint32, + log_level: int, + /, +) -> bool: + """ + fits mparams and cparams to free device memory (assumes system memory is unlimited) + returns true if the parameters could be successfully modified to fit device memory + this function is NOT thread safe because it modifies the global llama logger state + """ + ... + + # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -1298,6 +1379,12 @@ def llama_max_parallel_sequences() -> int: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -1442,22 +1529,47 @@ def llama_model_n_head(model: llama_model_p, /) -> int: ... - # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); -@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_swa(model: llama_model_p, /) -> int: - ... - - # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_head_kv(model: llama_model_p, /) -> int: ... + # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... 
+ + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + ... + + +# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """ + Returns the number of classifier outputs (only valid for classifier models) + Undefined behavior for non-classifier models + """ + ... + + +# // Returns label of classifier output by index ( ctypes.c_char_p: + """ + Returns label of classifier output by index ( int: ... +# // Get sampling metadata key name. Returns nullptr if the key is invalid +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> ctypes.c_char_p: + """Get sampling metadata key name. Returns nullptr if the key is invalid""" + ... + + # // Get metadata key name by index # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); @ctypes_function( @@ -4179,6 +4299,23 @@ def llama_print_system_info() -> bytes: # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. +# // The logger state is global so these functions are NOT thread safe. +# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); +@ctypes_function( + "llama_log_get", + [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)], + None, +) +def llama_log_get( + log_callback: Optional[ctypes.pointer(ggml_log_callback)], + user_data: ctypes.pointer(ctypes.c_void_p), + /, +): + """Get callback for all future logging events. + If this is not called, or NULL is supplied, everything is output on stderr.""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", @@ -4191,7 +4328,6 @@ def llama_log_set( /, ): """Set callback for all future logging events. - If this is not called, or NULL is supplied, everything is output on stderr.""" ... 
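The `llama_params_fit` binding added above mirrors the C signature one-to-one, so the caller has to supply ctypes buffers for the tensor split and the buffer-type overrides. The following is a minimal, illustrative sketch of such a call, not part of this patch: the model path, memory margin, minimum context size, and log level are placeholder values, and the default-params helpers are assumed to behave as in the existing bindings.

```python
import ctypes
import llama_cpp

# Start from the library defaults and let llama_params_fit shrink them
# until the model plus context fits into free device memory.
mparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()

# Writable buffers required by the C API: one float per device for the
# tensor split, and one entry per possible buffer-type override.
tensor_split = (ctypes.c_float * llama_cpp.llama_max_devices())()
buft_overrides = (
    llama_cpp.llama_model_tensor_buft_override
    * llama_cpp.llama_max_tensor_buft_overrides()
)()

ok = llama_cpp.llama_params_fit(
    b"./model.gguf",            # placeholder model path
    ctypes.byref(mparams),
    ctypes.byref(cparams),
    tensor_split,
    buft_overrides,
    512 * 1024 * 1024,          # placeholder: leave ~512 MiB free per device
    2048,                       # placeholder: smallest n_ctx to fall back to
    0,                          # minimum ggml log level to print while fitting
)
if not ok:
    raise RuntimeError("model + context cannot be fitted into device memory")
```

Note that, as the comment in the binding states, the call modifies the global llama logger state and is therefore not thread safe.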
diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py new file mode 100644 index 000000000..44d918e16 --- /dev/null +++ b/llama_cpp/llama_embedding.py @@ -0,0 +1,345 @@ +import numpy as np +from typing import Union, List, Optional, Dict, Any, Tuple +import llama_cpp.llama_cpp as llama_cpp +from .llama_types import Embedding +from .llama import Llama +# Pooling types from .llama_cpp +from .llama_cpp import ( + LLAMA_POOLING_TYPE_UNSPECIFIED, + LLAMA_POOLING_TYPE_NONE, + LLAMA_POOLING_TYPE_MEAN, + LLAMA_POOLING_TYPE_CLS, + LLAMA_POOLING_TYPE_LAST, + LLAMA_POOLING_TYPE_RANK, # Specifically for Reranking models +) + +# Normalization modes for embedding vectors +# See: https://github.com/ggml-org/llama.cpp/tree/master/examples/embedding#--embd-normalize-integer +NORM_MODE_NONE = -1 +NORM_MODE_MAX_INT16 = 0 +NORM_MODE_TAXICAB = 1 +NORM_MODE_EUCLIDEAN = 2 + +# TODO(JamePeng): Needs more extensive testing with various embedding and reranking models. +class LlamaEmbedding(Llama): + """ + A specialized class for high-performance Text Embedding and Reranking. + Inherits from the base Llama class but is optimized for vector operations. + + Key Features: + 1. Auto-configuration: Automatically sets embedding=True. + 2. Streaming Batch: Handles massive datasets without OOM (Out Of Memory). + 3. Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker). / + It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. + 4. Advanced Normalization: Implements MaxInt16, Taxicab (L1), and Euclidean (L2) normalization strategies / + using NumPy for optimal performance and compatibility with various vector databases. + """ + + def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, **kwargs): + """ + Initialize the embedding model with enforced configuration. + + Args: + model_path: Path to the GGUF model file. + pooling_type: The pooling strategy used by the model. + - Use `LLAMA_POOLING_TYPE_RANK` (4) for Reranker models. + - Use `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) to let the model metadata decide (for standard embeddings). + **kwargs: Additional arguments passed to the Llama base class (e.g., n_gpu_layers, n_batch, n_ctx). + """ + kwargs["embedding"] = True + + # Enable Unified KV Cache (Crucial for Batching) + # This allows us to assign arbitrary seq_ids in a batch, enabling the parallel / + # encoding of multiple unrelated documents without "invalid seq_id" errors. + kwargs["kv_unified"] = True + + # Set pooling type + kwargs["pooling_type"] = pooling_type + + super().__init__(model_path=model_path, **kwargs) + + if self.verbose: + print(f"LlamaEmbedding initialized with pooling_type: {self.pooling_type()}") + + def _normalize_vector(self, vector: List[float], mode: int) -> List[float]: + """ + Apply mathematical normalization to a vector. + Uses numpy for performance. 
+ """ + if mode == NORM_MODE_NONE: return vector + arr = np.array(vector, dtype=np.float32) + + # Mode 0: Max Absolute Int16 -> 32760 * x_i / max|x_i| + if mode == NORM_MODE_MAX_INT16: + max_abs = np.max(np.abs(arr)) + if max_abs == 0: return vector + return ((arr / max_abs) * 32760.0).tolist() + + # Mode 1: Taxicab (L1 Norm) -> x_i / sum|x_i| + elif mode == NORM_MODE_TAXICAB: + norm = np.sum(np.abs(arr)) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode 2: Euclidean (L2 Norm) -> x_i / sqrt(sum x_i^2) + elif mode == NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(arr) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode > 2: p-norm + elif mode > 2: + norm = np.sum(np.abs(arr) ** mode) ** (1.0 / mode) + if norm == 0: return vector + return (arr / norm).tolist() + + return vector + + def embed( + self, + input: Union[str, List[str], List[List[int]]], + normalize: int = NORM_MODE_EUCLIDEAN, + truncate: bool = True, + separator: Optional[str] = None, + return_count: bool = False, + ) -> Union[List[float], List[List[float]], Tuple[Any, int]]: + + ctx = self._ctx.ctx + n_batch = self.n_batch + n_ctx = self._n_ctx + n_ubatch = self.context_params.n_ubatch + + # Determine if it is in Rerank mode + try: + current_pooling = self.pooling_type() + except AttributeError: + current_pooling = LLAMA_POOLING_TYPE_UNSPECIFIED + is_rank = (current_pooling == LLAMA_POOLING_TYPE_RANK) + logits_all = current_pooling == llama_cpp.LLAMA_POOLING_TYPE_NONE + + # Determine the output dimension + if is_rank: + out_dim = llama_cpp.llama_model_n_cls_out(self._model.model) + else: + out_dim = self.n_embd() + + if self.verbose: + mode_str = "RANK (Score)" if is_rank else "EMBED (Vector)" + print(f"LlamaEmbedding Debug: Mode={mode_str} | Output Dimension={out_dim}") + + # Preprocess Input + inputs: List[Union[str, List[int]]] = [] + is_single = False + + if isinstance(input, str): + if separator: + inputs = input.split(separator) + is_single = False + else: + inputs = [input] + is_single = True + else: + inputs = input + is_single = False + + # Reset Context and Batch + if self.verbose: + llama_cpp.llama_perf_context_reset(ctx) + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + + # Initialize State Variables + results: List[Any] = [] + batch_seq_lens: List[int] = [] + total_tokens_processed = 0 + + # --- Decode Current Batch --- + def _decode_batch(): + nonlocal batch_seq_lens + if not batch_seq_lens: return + + self._ctx.decode(self._batch) + + for i in range(len(batch_seq_lens)): + ptr = llama_cpp.llama_get_embeddings_seq(ctx, i) + data = ptr[:out_dim] + + if not is_rank: + data = self._normalize_vector(data, normalize) + + if is_rank and len(data) == 1: + results.append(data[0]) + else: + results.append(data) + + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + batch_seq_lens = [] + + # Main Streaming Loop + idx_in_batch = 0 + + for item in inputs: + # Tokenize + tokens: List[int] = [] + if isinstance(item, list) and (not item or isinstance(item[0], int)): + tokens = item + elif isinstance(item, str): + tokens = self.tokenize(item.encode("utf-8")) + else: + raise ValueError("Input item must be str or List[int]") + + # Truncate + if truncate and len(tokens) > n_ctx: + tokens = tokens[:n_ctx] + + n_tokens = len(tokens) + total_tokens_processed += n_tokens + + if n_tokens == 0: + results.append(0.0 if is_rank else []) + continue + + # Check Batch Capacity + if (self._batch.n_tokens() + n_tokens > 
n_batch) or (idx_in_batch >= n_ubatch): + _decode_batch() + idx_in_batch = 0 + + # Add to Batch + self._batch.add_sequence(tokens, idx_in_batch, logits_all=logits_all) + batch_seq_lens.append(n_tokens) + idx_in_batch += 1 + + # Process Remaining Items + _decode_batch() + + if self.verbose: + llama_cpp.llama_perf_context_print(ctx) + + final_result = results[0] if is_single else results + + if return_count: + return final_result, total_tokens_processed + + return final_result + + def rank(self, query: str, documents: List[str]) -> List[float]: + """ + Calculate relevance scores for a list of documents against a query using a Reranking model. + + This method constructs a specific prompt structure ([BOS] Query [SEP] Doc [EOS]) + typically used by Cross-Encoders to estimate similarity. + + Args: + query: The search query string. + documents: A list of candidate document strings to be scored. + + Returns: + A list of float scores, where higher values indicate greater relevance. + """ + if self.pooling_type() != LLAMA_POOLING_TYPE_RANK: + raise ValueError(f"Model pooling_type is {self.pooling_type()}, but LLAMA_POOLING_TYPE_RANK is required.") + + # Prepare Special Tokens + sep_id = self.token_sep() + if sep_id == -1: sep_id = self.token_eos() + eos_id = self.token_eos() + + # Pre-process Query + q_tokens = self.tokenize(query.encode("utf-8"), add_bos=True, special=True) + # Remove the automatically added EOS token from the query + # because we need to append the separator and document tokens after it. + if q_tokens and q_tokens[-1] == eos_id: + q_tokens.pop() + + # Construct Batch Inputs + batch_inputs: List[List[int]] = [] + for doc in documents: + d_tokens = self.tokenize(doc.encode("utf-8"), add_bos=False, special=True) + full_seq = q_tokens + [sep_id] + d_tokens + # Ensure the sequence ends with an EOS token to mark the end of inference. + if not full_seq or full_seq[-1] != eos_id: + full_seq.append(eos_id) + batch_inputs.append(full_seq) + + # We use NORM_MODE_NONE because rerankers output raw logits/scores, not vectors that need normalization. + return self.embed(batch_inputs, normalize=NORM_MODE_NONE) + + def create_embedding( + self, + input: Union[str, List[str]], + model: Optional[str] = None, + normalize: int = NORM_MODE_EUCLIDEAN, + output_format: str = "json" + ) -> Union[Dict[str, Any], List[float], List[List[float]]]: + """ + High-level API compatible with OpenAI format. 
+ + Args: + output_format: + - 'json': OpenAI style dict (Default) + - 'json+': OpenAI style dict + cosineSimilarity matrix + - 'array': Raw python list (List[float] or List[List[float]]) + """ + model_name = model if model is not None else self.model_path + + # Normalize input to list + inputs_list = [input] if isinstance(input, str) else input + + # Generate Embeddings(and get token count) + embeddings, token_count = self.embed( + inputs_list, + normalize=normalize, + return_count=True + ) + + if output_format == "array": + return embeddings + + # Structure the OpenAI-style response ('json' or 'json+') + # Ensure embeddings is a list for iteration + # (If input was single string, embeddings is List[float], wrap it for the loop) + iter_embeddings = [embeddings] if isinstance(embeddings[0], float) else embeddings + + data: List[Embedding] = [ + { + "object": "embedding", + "embedding": emb, + "index": idx, + } + for idx, emb in enumerate(iter_embeddings) + ] + + response = { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": token_count, # Input consumption + "completion_tokens": 0, # The Embedding task does not generate text, so the value is 0. + "total_tokens": token_count, # Total consumption = Input consumption + Output + } + } + + # Calculate Cosine Similarity Matrix (Optimized via Numpy) + # Only if output_format is 'json+' and we have vectors + if output_format == "json+" and len(embeddings) > 1 and isinstance(embeddings[0], list): + try: + # Assuming embeddings are already L2 normalized if normalize=2 + mat = np.array(embeddings) + + # Safety check: Force normalize if not already done, to ensure Cosine (not Dot Product) + if normalize != NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(mat, axis=1, keepdims=True) + # Avoid division by zero + norm[norm == 0] = 1e-10 + mat = mat / norm + + # Matrix multiplication: A @ A.T + sim_matrix = np.dot(mat, mat.T) + response["cosineSimilarity"] = sim_matrix.tolist() + except Exception as e: + if self.verbose: + print(f"Warning: Failed to calculate similarity matrix: {e}") + + return response diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 1079c1d2e..46ae4ba1c 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -2,13 +2,15 @@ # flake8: noqa from pathlib import Path - -from itertools import groupby +import itertools +import json +import re +import sys from typing import ( Any, - Set, List, Optional, + Set, Tuple, Union, ) @@ -21,30 +23,50 @@ def __init__(self, *args, _grammar: str, **kwargs): self._grammar = _grammar self._root = LLAMA_GRAMMAR_DEFAULT_ROOT + @property + def grammar(self) -> str: + return self._grammar + @classmethod def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": return cls(_grammar=grammar) @classmethod def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": + file_path = Path(file) + + if not file_path.exists(): + raise FileNotFoundError(f"{cls.__name__}.from_file: file not found: {file_path}") + try: - with open(file) as f: - grammar = f.read() + grammar_content = file_path.read_text(encoding='utf-8') except Exception as err: - raise Exception( - f"{cls.from_file.__name__}: error reading grammar file: {err}" - ) + raise IOError(f"{cls.__name__}.from_file: error reading grammar file: {err}") - if grammar: - return cls.from_string(grammar, verbose=verbose) + if not grammar_content.strip(): + raise ValueError(f"{cls.__name__}.from_file: grammar file is empty") - raise ValueError( - 
f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty" - ) + return cls.from_string(grammar_content, verbose=verbose) @classmethod - def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar": - return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose) + def from_json_schema( + cls, + json_schema: str, + prop_order: Optional[List[str]] = None, + verbose: bool = True + ) -> "LlamaGrammar": + """ + Create a syntax object from a JSON Schema. + + json_schema: A JSON Schema string or dictionary. + prop_order: Specifies the order in which fields are generated (helps improve the stability of small models). + verbose: Whether to log. + """ + try: + gbnf_grammar_str = json_schema_to_gbnf(json_schema, prop_order=prop_order) + return cls.from_string(gbnf_grammar_str, verbose=verbose) + except Exception as e: + raise ValueError(f"{cls.__name__}.from_json_schema: conversion failed: {e}") """llama.cpp gbnf rules from vendor/llama.cpp/grammars""" @@ -242,78 +264,190 @@ def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGramm """ """llama.cpp json-schema to grammar converter from vendor/llama.cpp/examples/json-schema-to-grammar.py""" -import json -import re -from typing import List, Optional -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' +def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + if max_items == 0: + return "" -INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") -GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') -GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"'} - -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' - + if min_items == 0 and max_items == 1: + return f'{item_rule}?' -def _build_repetition( - item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False -): if not separator_rule: - if min_items == 0 and max_items == 1: - return f"{item_rule}?" - elif min_items == 1 and max_items is None: - return f"{item_rule}+" - - result = "" - - if min_items > 0: - if item_rule_is_literal and separator_rule is None: - result = '"' + (item_rule[1:-1] * min_items) + '"' + if min_items == 1 and max_items is None: + return f'{item_rule}+' + elif min_items == 0 and max_items is None: + return f'{item_rule}*' else: - result = (f" {separator_rule} " if separator_rule else " ").join( - [item_rule] * min_items - ) - - def opt_repetitions(up_to_n, prefix_with_sep=False): - """ - - n=4, no sep: '(a (a (a (a)?)?)?)?' - - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' - - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' - """ + return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}' - content = ( - f"{separator_rule} {item_rule}" - if prefix_with_sep and separator_rule - else item_rule - ) - if up_to_n == 0: - return "" - elif up_to_n == 1: - return f"({content})?" - elif separator_rule and not prefix_with_sep: - return f"({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?" - else: - return (f"({content} " * up_to_n).rstrip() + (")?" * up_to_n) + result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) + return f'({result})?' 
if min_items == 0 else result - if min_items > 0 and max_items != min_items: - result += " " +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None - if max_items is not None: - result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) - else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' - - if min_items == 0 and separator_rule: - result = f"({item_rule} {item_operator}*)?" + def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) else: - result += f"{item_operator}*" + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + range_start = '1' if top_level else '0' + if c > range_start: + 
digit_range(range_start, chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + out.append(")") + return - return result + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: @@ -321,70 +455,46 @@ def __init__(self, content: str, deps: list = None): self.content = content self.deps = deps or [] - -_up_to_15_digits = _build_repetition("[0-9]", 0, 15) +# Constraining spaces to prevent model "running away". +SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - "boolean": BuiltinRule('("true" | "false") space', []), - "decimal-part": BuiltinRule("[0-9] " + _up_to_15_digits, []), - "integral-part": BuiltinRule("[0-9] | [1-9] " + _up_to_15_digits, []), - "number": BuiltinRule( - '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', - ["integral-part", "decimal-part"], - ), - "integer": BuiltinRule('("-"? integral-part) space', ["integral-part"]), - "value": BuiltinRule( - "object | array | string | number | boolean | null", - ["object", "array", "string", "number", "boolean", "null"], - ), - "object": BuiltinRule( - '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', - ["string", "value"], - ), - "array": BuiltinRule( - '"[" space ( value ("," space value)* )? "]" space', ["value"] - ), - "uuid": BuiltinRule( - r'"\"" ' - + ' "-" '.join("[0-9a-fA-F]" * n for n in [8, 4, 4, 4, 12]) - + r' "\"" space', - [], - ), - "char": BuiltinRule( - r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', - [], - ), - "string": BuiltinRule(r'"\"" char* "\"" space', ["char"]), - "null": BuiltinRule('"null" space', []), + 'boolean' : BuiltinRule('("true" | "false") space', []), + 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), + 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? 
"]" space', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), + 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), + 'null' : BuiltinRule('"null" space', []), } # TODO: support "uri", "email" string formats STRING_FORMAT_RULES = { - "date": BuiltinRule( - '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )', - [], - ), - "time": BuiltinRule( - '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', - [], - ), - "date-time": BuiltinRule('date "T" time', ["date", "time"]), - "date-string": BuiltinRule('"\\"" date "\\"" space', ["date"]), - "time-string": BuiltinRule('"\\"" time "\\"" space', ["time"]), - "date-time-string": BuiltinRule('"\\"" date-time "\\"" space', ["date-time"]), + 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), + 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), } -DOTALL = "[\\U00000000-\\U0010FFFF]" -DOT = "[^\\x0A\\x0D]" +DOTALL = '[\\U00000000-\\U0010FFFF]' +DOT = '[^\\x0A\\x0D]' -RESERVED_NAMES = set( - ["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()] -) +RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]) +INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") +GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\\\]') +GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\]\\-\\\\]') +GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"', "-": "\\-", "]": "\\]", "\\": "\\\\"} NON_LITERAL_SET = set("|.()[]{}*+?") -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("[]()|{}*+?") - +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("^$.[]()|{}*+?") class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): @@ -393,101 +503,142 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): self._dotall = dotall self._raw_pattern = raw_pattern self._rules = { - "space": SPACE_RULE, + 'space': SPACE_RULE, } self._refs = {} self._refs_being_resolved = set() def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal ) return f'"{escaped}"' - def not_literal( - self, literal: str, dotall: bool = True, maybe_escaped_underscores=False - ) -> str: - """ - not_literal('a') -> '[^a]' - not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' - """ - assert len(literal) > 0, "Empty literal not supported" - + def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str: + ''' + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' 
+ ''' + assert len(literal) > 0, 'Empty literal not supported' def recurse(i: int): c = literal[i] - if maybe_escaped_underscores and c == "_": - yield f"[^{c}\\\\]" - yield " | " + if maybe_escaped_underscores and c == '_': + yield f'[^{c}\\\\]' + yield ' | ' yield f'"\\\\"? "{c}"' else: - yield f"[^{c}]" + yield f'[^{c}]' if i < len(literal) - 1: - yield " | " + yield ' | ' yield self._format_literal(c) - yield " (" + yield ' (' yield from recurse(i + 1) - yield ")?" - - return "".join(("(", *recurse(0), ")")) + yield ')?' + + return ''.join(('(', *recurse(0), ')')) + + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if child.children: + out.append(f' (') + visit(child) + out.append(')') + elif child.is_end_of_string: + out.append(f' {char_rule}+') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) def _add_rule(self, name, rule): - esc_name = INVALID_RULE_CHARS_RE.sub("-", name) + esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: key = esc_name else: i = 0 - while ( - f"{esc_name}{i}" in self._rules - and self._rules[f"{esc_name}{i}"] != rule - ): + while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule: i += 1 - key = f"{esc_name}{i}" + key = f'{esc_name}{i}' self._rules[key] = rule return key def resolve_refs(self, schema: dict, url: str): - """ - Resolves all $ref fields in the given schema, fetching any remote schemas, - replacing $ref with absolute reference URL and populating self._refs with the - respective referenced (sub)schema dictionaries. - """ - + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. 
+ ''' def visit(n: dict): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): - ref = n.get("$ref") + ref = n.get('$ref') if ref is not None and ref not in self._refs: - if ref.startswith("https://"): - assert ( - self._allow_fetch - ), "Fetching remote schemas is not allowed (use --allow-fetch for force)" + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' import requests - frag_split = ref.split("#") + frag_split = ref.split('#') base_url = frag_split[0] target = self._refs.get(base_url) if target is None: - target = self.resolve_refs( - requests.get(ref).json(), base_url - ) + target = self.resolve_refs(requests.get(ref).json(), base_url) self._refs[base_url] = target - if len(frag_split) == 1 or frag_split[-1] == "": + if len(frag_split) == 1 or frag_split[-1] == '': return target - elif ref.startswith("#/"): + elif ref.startswith('#/'): target = schema - ref = f"{url}{ref}" - n["$ref"] = ref + ref = f'{url}{ref}' + n['$ref'] = ref else: - raise ValueError(f"Unsupported ref {ref}") - - for sel in ref.split("#")[-1].split("/")[1:]: - assert ( - target is not None and sel in target - ), f"Error resolving ref {ref}: {sel} not in {target}" - target = target[sel] + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}' + if isinstance(target, list): + try: + sel_index = int(sel) + except ValueError: + raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}') + assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel_index] + else: + assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] self._refs[ref] = target else: @@ -495,47 +646,42 @@ def visit(n: dict): visit(v) return n - return visit(schema) def _generate_union_rule(self, name, alt_schemas): - return " | ".join( - ( - self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') - for i, alt_schema in enumerate(alt_schemas) - ) - ) + return ' | '.join(( + self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') + for i, alt_schema in enumerate(alt_schemas) + )) def _visit_pattern(self, pattern, name): - """ - Transforms a regular expression pattern into a GBNF rule. + ''' + Transforms a regular expression pattern into a GBNF rule. - Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions - Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions + Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md - Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. + Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. - Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which - we define sub-rules to keep the output lean. - """ + Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which + we define sub-rules to keep the output lean. 
+ ''' - assert pattern.startswith("^") and pattern.endswith( - "$" - ), 'Pattern must start with "^" and end with "$"' + assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' pattern = pattern[1:-1] sub_rule_ids = {} i = 0 length = len(pattern) - def to_rule(s: Tuple[str, bool]) -> str: + def to_rule(s: tuple[str, bool]) -> str: (txt, is_literal) = s - return '"' + txt + '"' if is_literal else txt + return "\"" + txt + "\"" if is_literal else txt - def transform() -> Tuple[str, bool]: - """ - Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. - """ + def transform() -> tuple[str, bool]: + ''' + Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. + ''' nonlocal i nonlocal pattern nonlocal sub_rule_ids @@ -545,7 +691,7 @@ def transform() -> Tuple[str, bool]: # We only need a flat structure here to apply repetition operators to the last item, and # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # (GBNF's syntax is luckily very close to regular expressions!) - seq: list[Tuple[str, bool]] = [] + seq: list[tuple[str, bool]] = [] def get_dot(): if self._dotall: @@ -553,72 +699,64 @@ def get_dot(): else: # Accept any character... except \n and \r line break chars (\x0A and \xOD) rule = DOT - return self._add_rule(f"dot", rule) + return self._add_rule(f'dot', rule) def join_seq(): nonlocal seq ret = [] - for is_literal, g in groupby(seq, lambda x: x[1]): + for is_literal, g in itertools.groupby(seq, lambda x: x[1]): if is_literal: - ret.append(("".join(x[0] for x in g), True)) + ret.append((''.join(x[0] for x in g), True)) else: ret.extend(g) if len(ret) == 1: return ret[0] - return (" ".join(to_rule(x) for x in seq), False) + return (' '.join(to_rule(x) for x in seq), False) while i < length: c = pattern[i] - if c == ".": + if c == '.': seq.append((get_dot(), False)) i += 1 - elif c == "(": + elif c == '(': i += 1 if i < length: - assert ( - pattern[i] != "?" 
- ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' - seq.append((f"({to_rule(transform())})", False)) - elif c == ")": + assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + seq.append((f'({to_rule(transform())})', False)) + elif c == ')': i += 1 - assert ( - start > 0 and pattern[start - 1] == "(" - ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}" + assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}' return join_seq() - elif c == "[": + elif c == '[': square_brackets = c i += 1 - while i < length and pattern[i] != "]": - if pattern[i] == "\\": - square_brackets += pattern[i : i + 2] + while i < length and pattern[i] != ']': + if pattern[i] == '\\': + square_brackets += pattern[i:i+2] i += 2 else: square_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}" - square_brackets += "]" + assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}' + square_brackets += ']' i += 1 seq.append((square_brackets, False)) - elif c == "|": - seq.append(("|", False)) + elif c == '|': + seq.append(('|', False)) i += 1 - elif c in ("*", "+", "?"): + elif c in ('*', '+', '?'): seq[-1] = (to_rule(seq[-1]) + c, False) i += 1 - elif c == "{": + elif c == '{': curly_brackets = c i += 1 - while i < length and pattern[i] != "}": + while i < length and pattern[i] != '}': curly_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}" - curly_brackets += "}" + assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}' + curly_brackets += '}' i += 1 - nums = [s.strip() for s in curly_brackets[1:-1].split(",")] + nums = [s.strip() for s in curly_brackets[1:-1].split(',')] min_times = 0 max_times = None try: @@ -630,49 +768,35 @@ def join_seq(): min_times = int(nums[0]) if nums[0] else 0 max_times = int(nums[1]) if nums[1] else None except ValueError: - raise ValueError( - f"Invalid quantifier {curly_brackets} in /{pattern}/" - ) + raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/') (sub, sub_is_literal) = seq[-1] if not sub_is_literal: id = sub_rule_ids.get(sub) if id is None: - id = self._add_rule(f"{name}-{len(sub_rule_ids) + 1}", sub) + id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) sub_rule_ids[sub] = id sub = id - seq[-1] = ( - _build_repetition( - f'"{sub}"' if sub_is_literal else sub, - min_times, - max_times, - item_rule_is_literal=sub_is_literal, - ), - False, - ) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False) else: - literal = "" + literal = '' while i < length: - if pattern[i] == "\\" and i < length - 1: + if pattern[i] == '\\' and i < length - 1: next = pattern[i + 1] if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS: i += 1 literal += pattern[i] i += 1 else: - literal += pattern[i : i + 2] + literal += pattern[i:i+2] i += 2 elif pattern[i] == '"' and not self._raw_pattern: literal += '\\"' i += 1 - elif pattern[i] not in NON_LITERAL_SET and ( - i == length - 1 - or literal == "" - or pattern[i + 1] == "." - or pattern[i + 1] not in NON_LITERAL_SET - ): + elif pattern[i] not in NON_LITERAL_SET and \ + (i == length - 1 or literal == '' or pattern[i+1] == '.' 
or pattern[i+1] not in NON_LITERAL_SET): literal += pattern[i] i += 1 else: @@ -684,15 +808,13 @@ def join_seq(): return self._add_rule( name, - ( - to_rule(transform()) - if self._raw_pattern - else '"\\"" ' + to_rule(transform()) + ' "\\"" space' - ), - ) + to_rule(transform()) if self._raw_pattern \ + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + def _resolve_ref(self, ref): - ref_name = ref.split("/")[-1] + ref_fragment = ref.split('#')[-1] + ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment) if ref_name not in self._rules and ref not in self._refs_being_resolved: self._refs_being_resolved.add(ref) resolved = self._refs[ref] @@ -704,203 +826,166 @@ def _generate_constant_rule(self, value): return self._format_literal(json.dumps(value)) def visit(self, schema, name): - schema_type = schema.get("type") - schema_format = schema.get("format") - rule_name = name + "-" if name in RESERVED_NAMES else name or "root" + schema_type = schema.get('type') + schema_format = schema.get('format') + rule_name = name + '-' if name in RESERVED_NAMES else name or 'root' - if (ref := schema.get("$ref")) is not None: + if (ref := schema.get('$ref')) is not None: return self._add_rule(rule_name, self._resolve_ref(ref)) - elif "oneOf" in schema or "anyOf" in schema: - return self._add_rule( - rule_name, - self._generate_union_rule(name, schema.get("oneOf") or schema["anyOf"]), - ) + elif 'oneOf' in schema or 'anyOf' in schema: + return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) elif isinstance(schema_type, list): - return self._add_rule( - rule_name, - self._generate_union_rule(name, [{"type": t} for t in schema_type]), - ) + return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) - elif "const" in schema: - return self._add_rule( - rule_name, self._generate_constant_rule(schema["const"]) - ) + elif 'const' in schema: + return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') - elif "enum" in schema: - rule = " | ".join((self._generate_constant_rule(v) for v in schema["enum"])) + elif 'enum' in schema: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' return self._add_rule(rule_name, rule) - elif schema_type in (None, "object") and ( - "properties" in schema - or ( - "additionalProperties" in schema - and schema["additionalProperties"] is not True - ) - ): - required = set(schema.get("required", [])) - properties = list(schema.get("properties", {}).items()) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, name, schema.get("additionalProperties") - ), - ) + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) - elif schema_type in (None, "object") and "allOf" in schema: + elif schema_type in (None, 'object', 'string') and 'allOf' in schema: required = set() properties = [] + enum_sets = [] hybrid_name = name - def add_component(comp_schema, is_required): - if (ref := comp_schema.get("$ref")) is not None: + if (ref := comp_schema.get('$ref')) is not None: comp_schema = self._refs[ref] - if "properties" in comp_schema: - for 
prop_name, prop_schema in comp_schema["properties"].items(): + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): properties.append((prop_name, prop_schema)) if is_required: required.add(prop_name) - for t in schema["allOf"]: - if "anyOf" in t: - for tt in t["anyOf"]: + if 'enum' in comp_schema: + enum_sets.append(set(comp_schema['enum'])) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: add_component(tt, is_required=False) else: add_component(t, is_required=True) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, hybrid_name, additional_properties=[] - ), - ) + if enum_sets: + enum_intersection = enum_sets[0] + for s in enum_sets[1:]: + enum_intersection &= s + + if enum_intersection: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + return self._add_rule(rule_name, rule) - elif schema_type in (None, "array") and ( - "items" in schema or "prefixItems" in schema - ): - items = schema.get("items") or schema["prefixItems"] + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) + + elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] if isinstance(items, list): return self._add_rule( rule_name, - '"[" space ' - + ' "," space '.join( + '"[" space ' + + ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') - for i, item in enumerate(items) - ) - + ' "]" space', - ) + for i, item in enumerate(items)) + + ' "]" space') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule( - rule_name, - '"[" space ' - + _build_repetition( - item_rule_name, min_items, max_items, separator_rule='"," space' - ) - + ' "]" space', - ) + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') - elif schema_type in (None, "string") and "pattern" in schema: - return self._visit_pattern(schema["pattern"], rule_name) + elif schema_type in (None, 'string') and 'pattern' in schema: + return self._visit_pattern(schema['pattern'], rule_name) - elif schema_type in (None, "string") and re.match( - r"^uuid[1-5]?$", schema_format or "" - ): + elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): return self._add_primitive( - "root" if rule_name == "root" else schema_format, - PRIMITIVE_RULES["uuid"], - ) - - elif ( - schema_type in (None, "string") - and f"{schema_format}-string" in STRING_FORMAT_RULES - ): - prim_name = f"{schema_format}-string" - return self._add_rule( - rule_name, - self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]), + 'root' if rule_name == 'root' else schema_format, + PRIMITIVE_RULES['uuid'] ) - elif schema_type == "string" and ( - "minLength" in schema or "maxLength" in schema - ): - char_rule = self._add_primitive("char", PRIMITIVE_RULES["char"]) - min_len = schema.get("minLength", 0) - max_len = schema.get("maxLength") - - return self._add_rule( - rule_name, - r'"\"" ' - + _build_repetition(char_rule, min_len, max_len) - + r' "\"" space', - ) - - elif (schema_type == "object") or (len(schema) == 0): - return self._add_rule( - rule_name, self._add_primitive("object", PRIMITIVE_RULES["object"]) - ) + elif 
schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES: + prim_name = f'{schema_format}-string' + return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name])) + + elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema): + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + min_len = schema.get('minLength', 0) + max_len = schema.get('maxLength') + + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + + elif schema_type in (None, 'integer') and \ + ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + + elif (schema_type == 'object') or (len(schema) == 0): + return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) else: - assert schema_type in PRIMITIVE_RULES, f"Unrecognized schema: {schema}" + assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return self._add_primitive( - "root" if rule_name == "root" else schema_type, - PRIMITIVE_RULES[schema_type], - ) + return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type]) def _add_primitive(self, name: str, rule: BuiltinRule): n = self._add_rule(name, rule.content) for dep in rule.deps: dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep) - assert dep_rule, f"Rule {dep} not known" + assert dep_rule, f'Rule {dep} not known' if dep not in self._rules: self._add_primitive(dep, dep_rule) return n - def _build_object_rule( - self, - properties: List[Tuple[str, Any]], - required: Set[str], - name: str, - additional_properties: Union[bool, Any], - ): + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]): prop_order = self._prop_order # sort by position in prop_order (if specified) then by original order - sorted_props = [ - kv[0] - for _, kv in sorted( - enumerate(properties), - key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]), - ) - ] + sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] prop_kv_rule_names = {} for prop_name, prop_schema in properties: - prop_rule_name = self.visit( - prop_schema, f'{name}{"-" if name else ""}{prop_name}' - ) + prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') prop_kv_rule_names[prop_name] = self._add_rule( f'{name}{"-" if name else ""}{prop_name}-kv', - rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}', + fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}' ) required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties == True or isinstance(additional_properties, dict): + 
if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' - value_rule = self.visit( - {} if additional_properties == True else additional_properties, - f"{sub_name}-value", - ) + value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ + self._add_primitive('value', PRIMITIVE_RULES['value']) + key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( - f"{sub_name}-kv", - self._add_primitive("string", PRIMITIVE_RULES["string"]) - + f' ":" space {value_rule}', + f'{sub_name}-kv', + f'{key_rule} ":" space {value_rule}' ) optional_props.append("*") @@ -908,44 +993,40 @@ def _build_object_rule( rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props) if optional_props: - rule += " (" + rule += ' (' if required_props: rule += ' "," space ( ' def get_recursive_refs(ks, first_is_optional): [k, *rest] = ks kv_rule_name = prop_kv_rule_names[k] - if k == "*": - res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', - f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*", - ) - elif first_is_optional: - res = f'( "," space {kv_rule_name} )?' + comma_ref = f'( "," space {kv_rule_name} )' + if first_is_optional: + res = comma_ref + ('*' if k == '*' else '?') else: - res = kv_rule_name + res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '') if len(rest) > 0: - res += " " + self._add_rule( + res += ' ' + self._add_rule( f'{name}{"-" if name else ""}{k}-rest', - get_recursive_refs(rest, first_is_optional=True), + get_recursive_refs(rest, first_is_optional=True) ) return res - rule += " | ".join( + rule += ' | '.join( get_recursive_refs(optional_props[i:], first_is_optional=False) for i in range(len(optional_props)) ) if required_props: - rule += " )" - rule += " )?" + rule += ' )' + rule += ' )?' 
rule += ' "}" space' return rule def format_grammar(self): - return "\n".join( - f"{name} ::= {rule}" + return '\n'.join( + f'{name} ::= {rule}' for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0]) ) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 89008043c..e2ee004de 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -127,6 +127,7 @@ class clip_flash_attn_type (enum.IntEnum): # enum clip_flash_attn_type flash_attn_type; # int image_min_tokens; # int image_max_tokens; +# bool warmup; # }; class clip_context_params(Structure): _fields_ = [ @@ -134,6 +135,7 @@ class clip_context_params(Structure): ("flash_attn_type", c_int), ("image_min_tokens", c_int), ("image_max_tokens", c_int), + ("warmup", c_bool), ] # struct mtmd_context_params { @@ -143,6 +145,7 @@ class clip_context_params(Structure): # const char * image_marker; // deprecated, use media_marker instead # const char * media_marker; # enum llama_flash_attn_type flash_attn_type; +# bool warmup; // whether to run a warmup encode pass after initialization # // limit number of image tokens, only for vision models with dynamic resolution # int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) @@ -156,6 +159,7 @@ class mtmd_context_params(Structure): ("image_marker", c_char_p), ("media_marker", c_char_p), ("flash_attn_type", c_int), + ("warmup", c_bool), ("image_min_tokens", c_int), ("image_max_tokens", c_int), ] diff --git a/pyproject.toml b/pyproject.toml index f5ae7b59c..00dbab1ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,15 +12,15 @@ authors = [ { name = "Andrei Betlen", email = "abetlen@gmail.com" }, ] dependencies = [ - "typing-extensions>=4.5.0", - "numpy>=1.20.0", - "diskcache>=5.6.1", + "typing-extensions>=4.8.0", + "numpy>=1.21.6", + "diskcache>=5.6.2", "jinja2>=2.11.3", + "Pillow>=9.5.0", ] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c7b7db044..7ac890213 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c7b7db04456da9839ebd81ecde6f488c5fe41cbc +Subproject commit 7ac8902133da6eb390c4d8368a7d252279123942