diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 7ce77041e..5d5a91efe 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -72,13 +72,13 @@ jobs: $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu124-linux-basic.yml b/.github/workflows/build-wheels-cu124-linux-basic.yml new file mode 100644 index 000000000..98f50fe47 --- /dev/null +++ b/.github/workflows/build-wheels-cu124-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU124) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.4.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} 
-DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 124) from the full version (e.g., 12.4.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 1dbe7fa5c..3feeeecfd 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu126-linux-basic.yml b/.github/workflows/build-wheels-cu126-linux-basic.yml new file mode 100644 index 000000000..78d1471c7 --- /dev/null +++ b/.github/workflows/build-wheels-cu126-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU126) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.6.3"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # 
from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 126) from the full version (e.g., 12.6.3) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index bff7a778a..f9b566fab 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-linux-basic.yml b/.github/workflows/build-wheels-cu128-linux-basic.yml new file mode 100644 index 000000000..8c527c718 --- /dev/null +++ b/.github/workflows/build-wheels-cu128-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU128) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.8.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # 
from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 128) from the full version (e.g., 12.8.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index e3fc186df..b25128d6e 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential ccache cmake curl git libgomp1 libcurl4-openssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: @@ -63,23 +63,23 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-win-basic.yml b/.github/workflows/build-wheels-cu128-win-basic.yml new file mode 100644 index 000000000..2d78084db --- /dev/null +++ b/.github/workflows/build-wheels-cu128-win-basic.yml @@ -0,0 +1,107 @@ +name: Build Wheels (CU128) for Windows(Basic) + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["12.8.1"] + releasetag: ["Basic"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. 
"all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.28 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index adf78e1e7..40578c8b4 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -75,10 +75,10 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu130-linux-basic.yml b/.github/workflows/build-wheels-cu130-linux-basic.yml new file mode 100644 index 
000000000..0f03787a6 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU130) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full 
version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml new file mode 100644 index 000000000..6451c7ee4 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -0,0 +1,128 @@ +name: Build Wheels(CU130) for Linux + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} 
-DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + if [ "${AVXVER}" = "AVX" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + if [ "${AVXVER}" = "AVX2" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" + fi + if [ "${AVXVER}" = "AVXVNNI" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" + fi + # if [ "${AVXVER}" = "AVX512" ]; then + # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" + # fi + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-win-basic.yml b/.github/workflows/build-wheels-cu130-win-basic.yml new file mode 100644 index 000000000..17b0fb6c7 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win-basic.yml @@ -0,0 +1,107 @@ +name: Build Wheels (CU130) for Windows(Basic) + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["Basic"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. "all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER 
}}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 000000000..3c7d07caa --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,119 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["AVX2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. "all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + if ($env:AVXVER -eq 'AVX2') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' + } + if ($env:AVXVER -eq 'AVXVNNI') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + } + # if ($env:AVXVER -eq 'AVX512') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' + # } + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" 
>> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml deleted file mode 100644 index 4620cdd76..000000000 --- a/.github/workflows/build-wheels-cuda.yaml +++ /dev/null @@ -1,136 +0,0 @@ -name: Build Wheels (CUDA) - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - define_matrix: - name: Define Build Matrix - runs-on: ubuntu-22.04 - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - defaults: - run: - shell: pwsh - - steps: - - name: Define Job Output - id: set-matrix - run: | - $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022') - 'pyver' = @("3.9", "3.10", "3.11", "3.12") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") - 'releasetag' = @("basic") - } - - $matrixOut = ConvertTo-Json $matrix -Compress - Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT - - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - needs: define_matrix - runs-on: ${{ matrix.os }} - strategy: - matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - vs-version: '[16.11,16.12)' - - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.pyver }} - cache: 'pip' - - - name: Setup Mamba - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - activate-environment: "llamacpp" - python-version: ${{ matrix.pyver }} - miniforge-version: latest - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: VS Integration Cache - id: vs-integration-cache - if: runner.os == 'Windows' - uses: actions/cache@v4 - with: - path: ./MSBuildExtensions - key: cuda-${{ matrix.cuda }}-vs-integration - - - name: Get Visual Studio Integration - if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' - run: | - if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} - $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) - for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} - Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' - & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null - Remove-Item 'cudainstaller.zip' - - - name: Install Visual Studio Integration - if: runner.os == 'Windows' - run: | - $y = (gi '.\MSBuildExtensions').fullname + '\*' - (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) - $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') - 
echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV - - - name: Install Dependencies - env: - MAMBA_DOWNLOAD_FAILFAST: "0" - MAMBA_NO_LOW_SPEED_LIMIT: "1" - run: | - $cudaVersion = $env:CUDAVER - mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion - python -m pip install build wheel - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX - if ($IsLinux) { - $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH - } - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" - # if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # if ($env:AVXVER -eq 'basic') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - python -m build --wheel - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu - tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 76d899335..2ad6f1679 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787](https://github.com/ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787) +- feat: Sync llama.cpp llama/mtmd API Binding 20251215 +- feat: **implement `GLM46VChatHandler` for GLM-4.6V Series Model** +- feat: **implement `LFM2VLChatHandler` for LFM2-VL series models** +- feat: **implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Model** +- workflow: Added workflows for compiling with CUDA 13.0.2 on Windows and Linux. +- feat: Added the scan path for CUDA 13.0+ dynamic link libraries under Windows system ($env:CUDA_PATH\bin\x64) +- Optimization: Improved batch token processing logic in Llava15ChatHandler. +- [perf: optimize LlamaModel.metadata reading performance](https://github.com/JamePeng/llama-cpp-python/commit/8213c19b0e164780ffffa3e64b5fc033cdbe4974) + - Increase initial buffer size to 16KB to eliminate re-allocations for large chat templates. + - Cache ctypes function references to reduce loop overhead. + - Repeated model loading can result in a cumulative speed improvement of 1-3%. 
+- build: Improve CMakeLists target logic +- refactor: optimize LlamaGrammar class code + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/67421d546ddcaa07678ac7921a9f124e7e3de10e...d5131e2ff41e05f83fd847052b06938c7a551a6a + +## [0.3.17] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35](https://github.com/ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35) +- feat: Sync llama.cpp llama/mtmd API Binding 20251121 +- feat: **Support clip flash-attn** +- feat: **0day support Qwen3VLChatHandler into llama_chat_format.py** +- Update README.md for Qwen3VL example(Thinking/No Thinking) +- feat: **Better Qwen3VL chat template. (by @alcoftTAO)** +- feat: [Implement LlamaTrieCache into llama_cache.py](https://github.com/JamePeng/llama-cpp-python/commit/2419dc2d9bb0a6be0cd381038ce00fcaea124c76): Optimize LlamaCache lookup from **O(N)** to **O(K)** using a Trie, **improves retrieval speed at least 40x compared to the original linear scan method of finding the longest prefix , thereby enhancing service responsiveness.** +- feat: Update Llava15ChatHandler to accept use_gpu, image_min_tokens, and image_max_tokens.Now can pass the`image_min_tokens`parameter in Qwen3VLChatHandler to support bbox grounding tasks. +- feat: [Add Pillow process code in _load_image for VLM](https://github.com/JamePeng/llama-cpp-python/commit/3b0133365e25840c023aef6b6c8578073cd081e8): that can reliably handle common formats, Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background(white for dark content, black for bright content) +- feat: Optimize CUDA Wheel Build Workflow, now workflow action support python3.10-3.13 cu124-cu126-cu128 Basic(Non AVX)-AVX2 win-linux + + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/e5392b52036bd2770ece5269352f5600a8db5639...fbb0ed2f089c663a5eb75aadcad08f768041ed72 + ## [0.3.16] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/5e6229a8409ac786e62cb133d09f1679a9aec13e](https://github.com/ggml-org/llama.cpp/commit/5e6229a8409ac786e62cb133d09f1679a9aec13e) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6429fac8..ca53b2ff6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,43 +5,49 @@ project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) option(MTMD_BUILD "Build mtmd shared library and install alongside python package" ON) +# Helper function to install targets to Python package directories function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - install( - TARGETS ${target} - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - TARGETS ${target} - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE + # Define install destinations 
to avoid code duplication + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" ) + + foreach(DIR ${INSTALL_DIRS}) + install( + TARGETS ${target} + LIBRARY DESTINATION ${DIR} + RUNTIME DESTINATION ${DIR} + ARCHIVE DESTINATION ${DIR} + FRAMEWORK DESTINATION ${DIR} + RESOURCE DESTINATION ${DIR} + ) + + # Automatically handle Windows DLL installation for each target + if (WIN32) + install( + FILES $<TARGET_RUNTIME_DLLS:${target}> + DESTINATION ${DIR} + OPTIONAL # Prevent errors if the target has no DLLs + ) + endif() + endforeach() + + # Configure RPATH if(UNIX) + set(INSTALL_RPATH_VAL "$ORIGIN") if(APPLE) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "@loader_path" - BUILD_WITH_INSTALL_RPATH TRUE - ) - else() - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE - ) + set(INSTALL_RPATH_VAL "@loader_path") endif() + + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "${INSTALL_RPATH_VAL}" + BUILD_WITH_INSTALL_RPATH TRUE + ) endif() endfunction() @@ -65,21 +71,17 @@ if (LLAMA_BUILD) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + # Enable build and link OpenSSL + set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + # Architecture detection and settings for Apple platforms if (APPLE) - # Get the target architecture - execute_process( - COMMAND uname -m - OUTPUT_VARIABLE HOST_ARCH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture if(NOT CMAKE_OSX_ARCHITECTURES) - set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE) + set(CMAKE_OSX_ARCHITECTURES ${CMAKE_HOST_SYSTEM_PROCESSOR} CACHE STRING "Build architecture for macOS" FORCE) endif() - message(STATUS "Host architecture: ${HOST_ARCH}") + message(STATUS "Host architecture: ${CMAKE_HOST_SYSTEM_PROCESSOR}") message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}") # Configure based on target architecture @@ -104,45 +106,32 @@ if (LLAMA_BUILD) endif() endif() - llama_cpp_python_install_target(llama) - llama_cpp_python_install_target(ggml) - - llama_cpp_python_install_target(ggml-base) - - llama_cpp_python_install_target(ggml-blas) - llama_cpp_python_install_target(ggml-cann) - llama_cpp_python_install_target(ggml-cpu) - llama_cpp_python_install_target(ggml-cuda) - llama_cpp_python_install_target(ggml-hexagon) - llama_cpp_python_install_target(ggml-hip) - llama_cpp_python_install_target(ggml-metal) - llama_cpp_python_install_target(ggml-musa) - llama_cpp_python_install_target(ggml-opencl) - llama_cpp_python_install_target(ggml-rpc) - llama_cpp_python_install_target(ggml-sycl) - llama_cpp_python_install_target(ggml-vulkan) - llama_cpp_python_install_target(ggml-webgpu) - llama_cpp_python_install_target(ggml-zdnn) - - # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 - if (WIN32) - install( - FILES $<TARGET_RUNTIME_DLLS:llama> - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:llama> - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:ggml> - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $<TARGET_RUNTIME_DLLS:ggml> - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - endif() + # Define list of GGML targets to install + set(GGML_TARGETS + llama + ggml + ggml-base + ggml-blas + ggml-cann + ggml-cpu + ggml-cuda + ggml-hexagon + ggml-hip + ggml-metal + ggml-musa + ggml-opencl + ggml-rpc + ggml-sycl + ggml-vulkan + ggml-webgpu + 
ggml-zdnn + ggml-zendnn + ) + + # Loop through targets to avoid repetitive function calls + foreach(TARGET_NAME ${GGML_TARGETS}) + llama_cpp_python_install_target(${TARGET_NAME}) + endforeach() if (MTMD_BUILD) if (NOT DEFINED LLAMA_BUILD_NUMBER) @@ -167,10 +156,5 @@ if (LLAMA_BUILD) endif() llama_cpp_python_install_target(mtmd) - - if (WIN32) - install(FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib) - install(FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib) - endif() endif() endif() diff --git a/README.md b/README.md index 44354f25b..6c18a4a24 100644 --- a/README.md +++ b/README.md @@ -125,8 +125,10 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: -- CUDA Version is 12.4, 12.6 or 12.8 +- CUDA Version is 12.4, 12.6, 12.8 or 13.0 - Python Version is 3.10, 3.11, 3.12 or 3.13 +- Basic version: A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). +- AVX2 version: A version compiled using AVX2 instructions. Check the releases page: https://github.com/JamePeng/llama-cpp-python/releases @@ -170,7 +172,11 @@ CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python
 Vulkan -To install with Vulkan support, set the `GGML_VULKAN=on` environment variable before installing: +- For Windows users: Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. + +- For Linux users: Follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. + +Then install with Vulkan support by setting the `GGML_VULKAN=on` environment variable before installing: ```bash CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python @@ -490,6 +496,9 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | +| [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | +| [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | +| [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | @@ -585,8 +594,7 @@ messages = [
-
-Loading a Local Image With Qwen3VL(Thinking/Instruct) +## Loading a Local Image With Qwen3VL(Thinking/Instruct) This script demonstrates how to load a local image, encode it as a base64 Data URI, and pass it to a local Qwen3-VL model (with the 'force_reasoning' parameter enabled for thinking model, disabled for instruct model) for processing using the llama-cpp-python library. @@ -607,47 +615,92 @@ MMPROJ_PATH = r"./mmproj-Qwen3-VL-8b-Thinking-F16.gguf" llm = Llama( model_path=MODEL_PATH, # Set up the chat handler for Qwen3-VL, specifying the projector path - chat_handler=Qwen3VLChatHandler(clip_model_path=MMPROJ_PATH, force_reasoning=True), + chat_handler=Qwen3VLChatHandler( + clip_model_path=MMPROJ_PATH, + force_reasoning=True, + image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks + ), n_gpu_layers=-1, # Offload all layers to the GPU n_ctx=10240, # Set the context window size swa_full=True, ) -# --- Helper Function to Convert Image to Base64 Data URI --- -def image_to_base64_data_uri(file_path): +# Comprehensive MIME type mapping (updated as of 2025) +# Reference: IANA official media types + common real-world usage +_IMAGE_MIME_TYPES = { + # Most common formats + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp', + '.svg': 'image/svg+xml', + '.svgz': 'image/svg+xml', + + # Next-generation formats + '.avif': 'image/avif', + '.heic': 'image/heic', + '.heif': 'image/heif', + '.heics': 'image/heic-sequence', + '.heifs': 'image/heif-sequence', + + # Legacy / Windows formats + '.bmp': 'image/bmp', + '.dib': 'image/bmp', + '.ico': 'image/x-icon', + '.cur': 'image/x-icon', + + # Professional imaging + '.tif': 'image/tiff', + '.tiff': 'image/tiff', +} + +def image_to_base64_data_uri( + file_path: str, + *, + fallback_mime: str = "application/octet-stream" +) -> str: """ - Reads an image file, determines its MIME type, and converts it - to a base64 encoded Data URI. + Convert a local image file to a base64-encoded data URI with the correct MIME type. + + Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). + + Args: + file_path: Path to the image file on disk. + fallback_mime: MIME type used when the file extension is unknown. + + Returns: + A valid data URI string (e.g., data:image/webp;base64,...). + + Raises: + FileNotFoundError: If the file does not exist. + OSError: If reading the file fails. """ - # Get the file extension to determine MIME type + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Image file not found: {file_path}") + extension = os.path.splitext(file_path)[1].lower() + mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime) - # Determine the MIME type based on the file extension - if extension == '.png': - mime_type = 'image/png' - elif extension in ('.jpg', '.jpeg'): - mime_type = 'image/jpeg' - elif extension == '.gif': - mime_type = 'image/gif' - elif extension == '.svg': - mime_type = 'image/svg+xml' - else: - # Use a generic stream type for unsupported formats - mime_type = 'application/octet-stream' - print(f"Warning: Unsupported image type for file: {file_path}. 
Using a generic MIME type.") - - # Read the image file in binary mode - with open(file_path, "rb") as img_file: - # Encode the binary data to base64 and decode to UTF-8 - base64_data = base64.b64encode(img_file.read()).decode('utf-8') - # Format as a Data URI string - return f"data:{mime_type};base64,{base64_data}" + if mime_type == fallback_mime: + print(f"Warning: Unknown extension '{extension}' for '{file_path}'. " + f"Using fallback MIME type: {fallback_mime}") + + try: + with open(file_path, "rb") as img_file: + encoded_data = base64.b64encode(img_file.read()).decode("utf-8") + except OSError as e: + raise OSError(f"Failed to read image file '{file_path}': {e}") from e + + return f"data:{mime_type};base64,{encoded_data}" # --- Main Logic for Image Processing --- # 1. Create a list containing all image paths image_paths = [ r'./scene.jpeg', + r'./cat.png', + r'./network.webp', # Add more image paths here if needed ] @@ -666,7 +719,7 @@ images_messages.append({"type": "text", "text": "Describes the images."}) # 5. Use this list to build the chat_completion request res = llm.create_chat_completion( messages=[ - {"role": "system", "content": "You are a AI assistant who perfectly describes images."}, + {"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."}, # The user's content is the list containing both images and text {"role": "user", "content": images_messages} ] @@ -677,7 +730,6 @@ print(res["choices"][0]["message"]["content"]) ``` -
### Speculative Decoding diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..bdaefb9e0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.18" diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index e88ed387d..0ba7f416d 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -53,11 +53,26 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(base_path)) if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cuda_path = os.environ["CUDA_PATH"] + sub_dirs_to_add = [ + "bin", + os.path.join("bin", "x64"), # CUDA 13.0+ + "lib", + os.path.join("lib", "x64") + ] + for sub_dir in sub_dirs_to_add: + full_path = os.path.join(cuda_path, sub_dir) + if os.path.exists(full_path): + os.add_dll_directory(full_path) + if "HIP_PATH" in os.environ: os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) + + if "VULKAN_SDK" in os.environ: + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Bin")) + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL # Try to load the shared library, handling potential errors diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 91eb801f6..0c95a6132 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -225,32 +225,38 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: # Extra def metadata(self) -> Dict[str, str]: metadata: Dict[str, str] = {} - buffer_size = 1024 + # Pre-allocate a 16KB buffer. This is large enough to handle almost all + # metadata values (including gpt-oss large chat templates ~15KB) in a single pass, + # eliminating the need for resize-and-retry in most cases. + buffer_size = 16384 buffer = ctypes.create_string_buffer(buffer_size) - # zero the buffer - buffer.value = b"\0" * buffer_size + + # Caching function references reduces the overhead of property lookups within loops. + get_key_by_index = llama_cpp.llama_model_meta_key_by_index + get_val_by_index = llama_cpp.llama_model_meta_val_str_by_index + metadata_count = llama_cpp.llama_model_meta_count(self.model) # iterate over model keys - for i in range(llama_cpp.llama_model_meta_count(self.model)): - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + for i in range(metadata_count): + # 1. Get Key + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the key exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) key = buffer.value.decode("utf-8") - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + + # 2. 
Get Value + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the value exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) value = buffer.value.decode("utf-8") + metadata[key] = value return metadata @@ -301,8 +307,8 @@ def __del__(self): def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) - def n_batch(self) -> int: - return llama_cpp.llama_n_batch(self.ctx) + def n_ctx_seq(self) -> int: + return llama_cpp.llama_n_ctx_seq(self.ctx) def n_batch(self) -> int: return llama_cpp.llama_n_batch(self.ctx) @@ -436,12 +442,19 @@ def encode(self, batch: LlamaBatch): raise RuntimeError(f"llama_encode returned {return_code}") def decode(self, batch: LlamaBatch): - return_code = llama_cpp.llama_decode( - self.ctx, - batch.batch, - ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") + return_code = llama_cpp.llama_decode(self.ctx, batch.batch) + + if return_code == 0: + return + + error_map = { + 1: "No KV slot available: try reducing batch size or increasing context window", + 2: "Decoding aborted", + -1: "Invalid input batch", + } + + msg = error_map.get(return_code, "Fatal internal error") + raise RuntimeError(f"llama_decode failed (code {return_code}): {msg}") def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) @@ -649,7 +662,7 @@ def n_tokens(self) -> int: def reset(self): self.batch.n_tokens = 0 - def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool): n_tokens = len(batch) self.batch.n_tokens = n_tokens for i in range(n_tokens): @@ -1003,27 +1016,6 @@ def convert_list_str_to_char_array_ptr(self, str_list: List[str]): # Return the char** pointer and the number of strings return char_array_ptr, num_byte_list - def add_grammar_lazy( - self, - model: LlamaModel, - grammar: LlamaGrammar, - trigger_tokens:list[llama_cpp.llama_token], - num_trigger_tokens: int, - trigger_words: list[str]=[] - ): - trigger_words_char_array_ptr, num_trigger_words = self.convert_list_str_to_char_array_ptr(trigger_words) - - sampler = llama_cpp.llama_sampler_init_grammar_lazy( - model.vocab, - grammar._grammar.encode("utf-8"), - grammar._root.encode("utf-8"), - trigger_words_char_array_ptr, - num_trigger_words, - trigger_tokens, - num_trigger_tokens - ) - self._add_sampler(sampler) - def add_grammar_lazy_patterns( self, model: LlamaModel, diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2bb1c8769..c2628555b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -273,17 +273,17 @@ def __init__( if isinstance(v, bool): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_BOOL.value self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_INT.value self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT + ].tag = 
llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_FLOAT.value self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") @@ -292,7 +292,7 @@ def __init__( v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_STR.value # copy min(v_bytes, 128) to str_value address = typing.cast( int, @@ -1031,6 +1031,11 @@ def create_embedding( Returns: An embedding object. """ + warnings.warn( + "The `create_embedding` method in `Llama` class is deprecated. " + "Please migrate to `LlamaEmbedding.create_embedding` for better efficiency.", + DeprecationWarning, + ) model_name: str = model if model is not None else self.model_path input = input if isinstance(input, list) else [input] @@ -1075,6 +1080,12 @@ def embed( Returns: A list of embeddings """ + warnings.warn( + "The `embed` method in `Llama` class is deprecated and will be removed in future versions. " + "Please use the `LlamaEmbedding` class from `llama_embedding` module for optimized performance and reranking support.", + DeprecationWarning, + ) + n_embd = self.n_embd() n_batch = self.n_batch diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index efb17bdd8..166459edb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2792,10 +2792,13 @@ class Llava15ChatHandler: "{% endif %}" ) - def __init__(self, clip_model_path: str, verbose: bool = True): + def __init__(self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1): import llama_cpp.mtmd_cpp as mtmd_cpp self.clip_model_path = clip_model_path + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.use_gpu = use_gpu self.verbose = verbose self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() @@ -2805,6 +2808,12 @@ def __init__(self, clip_model_path: str, verbose: bool = True): if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") + # Pre-compile Jinja template + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True, + ).from_string(self.CHAT_FORMAT) + def _init_mtmd_context(self, llama_model: llama.Llama): """Initialize mtmd context with the llama model.""" if self.mtmd_ctx is not None: @@ -2815,10 +2824,17 @@ def _init_mtmd_context(self, llama_model: llama.Llama): # Get default parameters mctx_params = self._mtmd_cpp.mtmd_context_params_default() - mctx_params.use_gpu = True # TODO: Make this configurable + mctx_params.use_gpu = self.use_gpu mctx_params.print_timings = self.verbose mctx_params.n_threads = llama_model.n_threads mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + mctx_params.warmup = True + if self.image_min_tokens > 0: + mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"image_max_pixels {self.image_max_tokens} is less than image_min_pixels {self.image_min_tokens}") # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( @@ -2922,16 +2938,12 @@ def __call__( ] + messages image_urls = self.get_image_urls(messages) - template = ImmutableSandboxedEnvironment( - trim_blocks=True, - 
lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) # Get the default media marker media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') # Replace image URLs with media markers in the template - text = template.render( + text = self.chat_template.render( messages=messages, tools=tools, add_generation_prompt=True, @@ -2987,22 +2999,19 @@ def __call__( llama._ctx.memory_clear(True) # Process each chunk - n_past = llama_cpp.llama_pos(0) + n_past = 0 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: - continue + if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: # Handle text chunk n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( - chunk, ctypes.byref(n_tokens_out) - ) + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) if tokens_ptr and n_tokens_out.value > 0: # Convert ctypes array to Python list @@ -3012,15 +3021,17 @@ def __call__( raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" ) + llama.n_tokens = n_past llama.eval(tokens) + n_past = llama.n_tokens elif chunk_type in [self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO]: # Handle image/audio chunk using helper chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + if n_past + chunk_n_tokens > llama.n_ctx(): raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + f"Prompt exceeds n_ctx: {n_past + chunk_n_tokens} > {llama.n_ctx()}" ) new_n_past = llama_cpp.llama_pos(0) @@ -3028,7 +3039,7 @@ def __call__( self.mtmd_ctx, llama._ctx.ctx, chunk, - llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_pos(n_past), llama_cpp.llama_seq_id(0), llama.n_batch, False, # logits_last @@ -3039,8 +3050,15 @@ def __call__( raise ValueError(f"Failed to evaluate chunk: error code {result}") # Update llama's token count - llama.n_tokens = new_n_past.value - + n_past = new_n_past.value + llama.n_tokens = n_past + + n_past = llama.n_tokens + if n_past > 0: + llama._ctx.memory_seq_rm(0, n_past - 1, -1) + if llama._ctx.memory_seq_pos_min(0) == llama._ctx.memory_seq_pos_max(0): + n_past += 1 + llama.n_tokens = n_past # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -3144,16 +3162,83 @@ def __call__( @staticmethod def _load_image(image_url: str) -> bytes: - # TODO: Add Pillow support for other image formats beyond (jpg, png) - if image_url.startswith("data:"): + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + image_bytes = b"" + + # 1. 
Handle data URI (base64) + if image_url.strip().startswith("data:"): import base64 - image_bytes = base64.b64decode(image_url.split(",")[1]) - return image_bytes + # Split only once from the right to correctly handle mime types containing commas + comma_pos = image_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + base64_data = image_url[comma_pos + 1 :] + image_bytes = base64.b64decode(base64_data) + + # 2. Handle local/remote URL else: import urllib.request - with urllib.request.urlopen(image_url) as f: - image_bytes = f.read() - return image_bytes + from urllib.error import URLError, HTTPError + + headers = {"User-Agent": "Mozilla/5.0"} + req = urllib.request.Request(image_url, headers=headers) + + try: + with urllib.request.urlopen(req, timeout=15) as f: + image_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download image from {image_url}: {e}") + + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) + if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): + # Use alpha channel as mask + if image.mode == "P": + image = image.convert("RGBA") + + alpha = image.split()[-1] # Last channel is alpha + # Compute average brightness of visible (non-transparent) pixels + stat = ImageStat.Stat(image.convert("L"), mask=alpha) + + # Choose background: white for dark content, black for bright content + bg_color = (255, 255, 255) # white + if stat.count[0] > 0 and stat.mean[0] > 127: + bg_color = (0, 0, 0) # black + + background = Image.new("RGB", image.size, bg_color) + background.paste(image, mask=alpha) + image = background + + # 5. Ensure RGB mode for formats like CMYK, palette, etc. + elif image.mode != "RGB": + image = image.convert("RGB") + + # 6. Save as high-quality JPEG, suitable for most vision models. + output = io.BytesIO() + image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) + return output.getvalue() @staticmethod def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @@ -3638,6 +3723,249 @@ class Gemma3ChatHandler(Llava15ChatHandler): ) +class GLM41VChatHandler(Llava15ChatHandler): + # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. 
+ + GLM41V_EOS_TOKEN = "<|endoftext|>" + GLM41V_PAD_TOKEN = "<|endoftext|>" + GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]\n" + "{%- for msg in messages -%}" + "{%- if msg.role == 'system' -%}" + "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'user' -%}" + "<|user|>\n" + "{%- if msg.content is string -%}" + "{{ msg.content }}" + "{%- else -%}" + "{%- for item in msg.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'assistant' -%}" + "{%- if msg.metadata -%}" + "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- else -%}" + "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{%- endif -%}" + ) + + def __call__(self, **kwargs): + self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM4VChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM4VChatHandler - Cleared state", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + +class GLM46VChatHandler(Llava15ChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" 
+ # If enable_thinking is disabled, insert `/nothink` according to the source code logic. + "{{ '/nothink' if not enable_thinking else '' }}" + "{%- elif m.role == 'assistant' -%}" + "<|assistant|>" + "{%- if enable_thinking -%}" + "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" + "\n{{ reasoning.strip() }}" + "{%- else -%}" + "\n" + "{%- endif -%}" + "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" + "{%- endif -%}" + "{{ GLM46V_EOS_TOKEN }}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{{ '' if enable_thinking else '\n' }}" + "{%- endif -%}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + GLM-4.6V Handler + Parameters: + - enable_thinking (bool): Whether to enable the model's think process. The default is True. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN + + # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json + kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + +class LFM2VLChatHandler(Llava15ChatHandler): + LFM2VL_BOS_TOKEN = "<|startoftext|>" + LFM2VL_EOS_TOKEN = "<|im_end|>" + LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{ '<|im_start|>' + message['role'] + '\n' }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] }}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if 'image_url' in content -%}" + "{%- if content.image_url is string -%}" + "<|image_start|>{{ content.image_url }}<|image_end|>" + "{%- else -%}" + "<|image_start|>{{ content.image_url.url }}<|image_end|>" + "{%- endif -%}" + "{%- elif content['type'] == 'text' -%}" + "{{ content['text'] }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{ '<|im_end|>\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ '<|im_start|>assistant\n' }}" + "{%- endif -%}" + ) + + def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): + """ + LFM2-VL Handler + LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 + """ + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) + + def __call__(self, **kwargs): + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + 
+ if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"LFM2VLChatHandler - Cleared state, Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"LFM2VLChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." @@ -3718,7 +4046,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{{- '\n' -}}" "{{- tool | tojson -}}" "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\n\n\n{\"name\": , \"arguments\": }\n' -}}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" "{%- endif -%}" "{{- '<|im_end|>\n' -}}" "{%- set image_count = namespace(value=0) -%}" @@ -3780,7 +4108,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{%- endif -%}" "{%- endfor -%}" "{%- if add_generation_prompt -%}" - "{{- 'assistant\n' -}}" + "{{- '<|im_start|>assistant\n' -}}" "{%- if force_reasoning -%}" "{{- '\n' -}}" "{%- endif -%}" @@ -3791,6 +4119,7 @@ def __init__( self, force_reasoning: bool = False, add_vision_id: bool = True, + image_min_tokens: int = -1, **kwargs, ): """ @@ -3801,11 +4130,15 @@ def __init__( - add_vision_id (bool): - True (default): Count all the images. Recommended for multi-image. - False: Doesn't count the images. Can save tokens with single-image. + - image_min_tokens (int): + It only takes effect when the value is greater than zero. the default value is -1 (i.e., using the default parameters in the model's preprocessor_config.json). + Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks """ self.force_reasoning = force_reasoning self.add_vision_id = add_vision_id + self.image_min_tokens = image_min_tokens - super().__init__(**kwargs) + super().__init__(image_min_tokens=self.image_min_tokens, **kwargs) def __call__(self, **kwargs): self.extra_template_arguments["force_reasoning"] = self.force_reasoning diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d3890c0c1..aa4e40e05 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -550,7 +550,7 @@ class llama_batch(ctypes.Structure): The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens Attributes: - n_tokens (int): number of tokens + n_tokens (ctypes.c_int32): number of tokens token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.ctypes.c_float]): token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence @@ -559,7 +559,7 @@ class llama_batch(ctypes.Structure): """ if TYPE_CHECKING: - n_tokens: int + n_tokens: ctypes.c_int32 token: CtypesArray[llama_token] embd: CtypesArray[ctypes.c_float] pos: CtypesArray[CtypesArray[llama_pos]] @@ -584,10 +584,40 @@ class llama_batch(ctypes.Structure): # LLAMA_KV_OVERRIDE_TYPE_BOOL, # LLAMA_KV_OVERRIDE_TYPE_STR, # }; -LLAMA_KV_OVERRIDE_TYPE_INT = 0 -LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 -LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 -LLAMA_KV_OVERRIDE_TYPE_STR = 3 +class LlamaModelKVOverrideType(enum.IntEnum): + LLAMA_KV_OVERRIDE_TYPE_INT = 0 + LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 + LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 + LLAMA_KV_OVERRIDE_TYPE_STR = 3 + + +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_K, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_P, +# LLAMA_MODEL_META_KEY_SAMPLING_MIN_P, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD, +# LLAMA_MODEL_META_KEY_SAMPLING_TEMP, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA, +# }; +class LlamaModelMetaKey(enum.IntEnum): + LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 + LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 + LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { @@ -682,6 +712,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) # bool no_host; // bypass host buffer allowing extra buffers to be used +# bool no_alloc; // only load metadata and simulate memory allocations # }; class llama_model_params(ctypes.Structure): """Parameters for llama_model @@ -701,7 +732,8 @@ class llama_model_params(ctypes.Structure): use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data use_extra_bufts (bool): use extra buffer types (used for weight repacking) - no_host (bool): bypass host buffer allowing extra buffers to be used""" + no_host (bool): bypass host buffer allowing extra buffers to be used + no_alloc (bool): only load metadata and simulate memory allocations""" if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused @@ -719,6 +751,7 @@ class llama_model_params(ctypes.Structure): check_tensors: bool use_extra_bufts: bool no_host: bool + no_alloc: bool _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused @@ -736,8 +769,10 @@ class llama_model_params(ctypes.Structure): ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] +llama_model_params_p = ctypes.POINTER(llama_model_params) # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect 
results in certain configurations # // https://github.com/ggml-org/llama.cpp/pull/7544 @@ -888,6 +923,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ] +llama_context_params_p = ctypes.POINTER(llama_context_params) # // Signature for logging events # // Note that text includes the new line character at the end for most events. @@ -1276,6 +1312,51 @@ def llama_free(ctx: llama_context_p, /): ... +# // fits mparams and cparams to free device memory (assumes system memory is unlimited) +# // returns true if the parameters could be successfully modified to fit device memory +# // this function is NOT thread safe because it modifies the global llama logger state +# LLAMA_API bool llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements +# struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements +# size_t margin, // margin of memory to leave per device in bytes +# uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use +# enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + llama_model_params_p, + llama_context_params_p, + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(llama_model_tensor_buft_override), + ctypes.c_size_t, + ctypes.c_uint32, + ctypes.c_int, + ], + ctypes.c_bool, +) +def llama_params_fit( + path_model: ctypes.c_char_p, + mparams: llama_model_params_p, + cparams: llama_context_params_p, + tensor_split: ctypes.pointer(ctypes.c_float), + tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override), + margin: ctypes.c_size_t, + n_ctx_min: ctypes.c_uint32, + log_level: int, + /, +) -> bool: + """ + fits mparams and cparams to free device memory (assumes system memory is unlimited) + returns true if the parameters could be successfully modified to fit device memory + this function is NOT thread safe because it modifies the global llama logger state + """ + ... + + # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -1298,6 +1379,12 @@ def llama_max_parallel_sequences() -> int: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -1442,22 +1529,47 @@ def llama_model_n_head(model: llama_model_p, /) -> int: ... - # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); -@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_swa(model: llama_model_p, /) -> int: - ... - - # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_head_kv(model: llama_model_p, /) -> int: ... + # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... 
+ + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + ... + + +# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """ + Returns the number of classifier outputs (only valid for classifier models) + Undefined behavior for non-classifier models + """ + ... + + +# // Returns label of classifier output by index ( ctypes.c_char_p: + """ + Returns label of classifier output by index ( int: ... +# // Get sampling metadata key name. Returns nullptr if the key is invalid +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> ctypes.c_char_p: + """Get sampling metadata key name. Returns nullptr if the key is invalid""" + ... + + # // Get metadata key name by index # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); @ctypes_function( @@ -4179,6 +4299,23 @@ def llama_print_system_info() -> bytes: # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. +# // The logger state is global so these functions are NOT thread safe. +# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); +@ctypes_function( + "llama_log_get", + [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)], + None, +) +def llama_log_get( + log_callback: Optional[ctypes.pointer(ggml_log_callback)], + user_data: ctypes.pointer(ctypes.c_void_p), + /, +): + """Get callback for all future logging events. + If this is not called, or NULL is supplied, everything is output on stderr.""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", @@ -4191,7 +4328,6 @@ def llama_log_set( /, ): """Set callback for all future logging events. - If this is not called, or NULL is supplied, everything is output on stderr.""" ... 
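The `llama_params_fit` binding added above mirrors the C signature one-to-one, so the caller has to supply ctypes buffers for the tensor split and the buffer-type overrides. The following is a minimal, illustrative sketch of such a call, not part of this patch: the model path, memory margin, minimum context size, and log level are placeholder values, and the default-params helpers are assumed to behave as in the existing bindings.

```python
import ctypes
import llama_cpp

# Start from the library defaults and let llama_params_fit shrink them
# until the model plus context fits into free device memory.
mparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()

# Writable buffers required by the C API: one float per device for the
# tensor split, and one entry per possible buffer-type override.
tensor_split = (ctypes.c_float * llama_cpp.llama_max_devices())()
buft_overrides = (
    llama_cpp.llama_model_tensor_buft_override
    * llama_cpp.llama_max_tensor_buft_overrides()
)()

ok = llama_cpp.llama_params_fit(
    b"./model.gguf",            # placeholder model path
    ctypes.byref(mparams),
    ctypes.byref(cparams),
    tensor_split,
    buft_overrides,
    512 * 1024 * 1024,          # placeholder: leave ~512 MiB free per device
    2048,                       # placeholder: smallest n_ctx to fall back to
    0,                          # minimum ggml log level to print while fitting
)
if not ok:
    raise RuntimeError("model + context cannot be fitted into device memory")
```

Note that, as the comment in the binding states, the call modifies the global llama logger state and is therefore not thread safe.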
diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py new file mode 100644 index 000000000..44d918e16 --- /dev/null +++ b/llama_cpp/llama_embedding.py @@ -0,0 +1,345 @@ +import numpy as np +from typing import Union, List, Optional, Dict, Any, Tuple +import llama_cpp.llama_cpp as llama_cpp +from .llama_types import Embedding +from .llama import Llama +# Pooling types from .llama_cpp +from .llama_cpp import ( + LLAMA_POOLING_TYPE_UNSPECIFIED, + LLAMA_POOLING_TYPE_NONE, + LLAMA_POOLING_TYPE_MEAN, + LLAMA_POOLING_TYPE_CLS, + LLAMA_POOLING_TYPE_LAST, + LLAMA_POOLING_TYPE_RANK, # Specifically for Reranking models +) + +# Normalization modes for embedding vectors +# See: https://github.com/ggml-org/llama.cpp/tree/master/examples/embedding#--embd-normalize-integer +NORM_MODE_NONE = -1 +NORM_MODE_MAX_INT16 = 0 +NORM_MODE_TAXICAB = 1 +NORM_MODE_EUCLIDEAN = 2 + +# TODO(JamePeng): Needs more extensive testing with various embedding and reranking models. +class LlamaEmbedding(Llama): + """ + A specialized class for high-performance Text Embedding and Reranking. + Inherits from the base Llama class but is optimized for vector operations. + + Key Features: + 1. Auto-configuration: Automatically sets embedding=True. + 2. Streaming Batch: Handles massive datasets without OOM (Out Of Memory). + 3. Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker). / + It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. + 4. Advanced Normalization: Implements MaxInt16, Taxicab (L1), and Euclidean (L2) normalization strategies / + using NumPy for optimal performance and compatibility with various vector databases. + """ + + def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, **kwargs): + """ + Initialize the embedding model with enforced configuration. + + Args: + model_path: Path to the GGUF model file. + pooling_type: The pooling strategy used by the model. + - Use `LLAMA_POOLING_TYPE_RANK` (4) for Reranker models. + - Use `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) to let the model metadata decide (for standard embeddings). + **kwargs: Additional arguments passed to the Llama base class (e.g., n_gpu_layers, n_batch, n_ctx). + """ + kwargs["embedding"] = True + + # Enable Unified KV Cache (Crucial for Batching) + # This allows us to assign arbitrary seq_ids in a batch, enabling the parallel / + # encoding of multiple unrelated documents without "invalid seq_id" errors. + kwargs["kv_unified"] = True + + # Set pooling type + kwargs["pooling_type"] = pooling_type + + super().__init__(model_path=model_path, **kwargs) + + if self.verbose: + print(f"LlamaEmbedding initialized with pooling_type: {self.pooling_type()}") + + def _normalize_vector(self, vector: List[float], mode: int) -> List[float]: + """ + Apply mathematical normalization to a vector. + Uses numpy for performance. 
+ """ + if mode == NORM_MODE_NONE: return vector + arr = np.array(vector, dtype=np.float32) + + # Mode 0: Max Absolute Int16 -> 32760 * x_i / max|x_i| + if mode == NORM_MODE_MAX_INT16: + max_abs = np.max(np.abs(arr)) + if max_abs == 0: return vector + return ((arr / max_abs) * 32760.0).tolist() + + # Mode 1: Taxicab (L1 Norm) -> x_i / sum|x_i| + elif mode == NORM_MODE_TAXICAB: + norm = np.sum(np.abs(arr)) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode 2: Euclidean (L2 Norm) -> x_i / sqrt(sum x_i^2) + elif mode == NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(arr) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode > 2: p-norm + elif mode > 2: + norm = np.sum(np.abs(arr) ** mode) ** (1.0 / mode) + if norm == 0: return vector + return (arr / norm).tolist() + + return vector + + def embed( + self, + input: Union[str, List[str], List[List[int]]], + normalize: int = NORM_MODE_EUCLIDEAN, + truncate: bool = True, + separator: Optional[str] = None, + return_count: bool = False, + ) -> Union[List[float], List[List[float]], Tuple[Any, int]]: + + ctx = self._ctx.ctx + n_batch = self.n_batch + n_ctx = self._n_ctx + n_ubatch = self.context_params.n_ubatch + + # Determine if it is in Rerank mode + try: + current_pooling = self.pooling_type() + except AttributeError: + current_pooling = LLAMA_POOLING_TYPE_UNSPECIFIED + is_rank = (current_pooling == LLAMA_POOLING_TYPE_RANK) + logits_all = current_pooling == llama_cpp.LLAMA_POOLING_TYPE_NONE + + # Determine the output dimension + if is_rank: + out_dim = llama_cpp.llama_model_n_cls_out(self._model.model) + else: + out_dim = self.n_embd() + + if self.verbose: + mode_str = "RANK (Score)" if is_rank else "EMBED (Vector)" + print(f"LlamaEmbedding Debug: Mode={mode_str} | Output Dimension={out_dim}") + + # Preprocess Input + inputs: List[Union[str, List[int]]] = [] + is_single = False + + if isinstance(input, str): + if separator: + inputs = input.split(separator) + is_single = False + else: + inputs = [input] + is_single = True + else: + inputs = input + is_single = False + + # Reset Context and Batch + if self.verbose: + llama_cpp.llama_perf_context_reset(ctx) + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + + # Initialize State Variables + results: List[Any] = [] + batch_seq_lens: List[int] = [] + total_tokens_processed = 0 + + # --- Decode Current Batch --- + def _decode_batch(): + nonlocal batch_seq_lens + if not batch_seq_lens: return + + self._ctx.decode(self._batch) + + for i in range(len(batch_seq_lens)): + ptr = llama_cpp.llama_get_embeddings_seq(ctx, i) + data = ptr[:out_dim] + + if not is_rank: + data = self._normalize_vector(data, normalize) + + if is_rank and len(data) == 1: + results.append(data[0]) + else: + results.append(data) + + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + batch_seq_lens = [] + + # Main Streaming Loop + idx_in_batch = 0 + + for item in inputs: + # Tokenize + tokens: List[int] = [] + if isinstance(item, list) and (not item or isinstance(item[0], int)): + tokens = item + elif isinstance(item, str): + tokens = self.tokenize(item.encode("utf-8")) + else: + raise ValueError("Input item must be str or List[int]") + + # Truncate + if truncate and len(tokens) > n_ctx: + tokens = tokens[:n_ctx] + + n_tokens = len(tokens) + total_tokens_processed += n_tokens + + if n_tokens == 0: + results.append(0.0 if is_rank else []) + continue + + # Check Batch Capacity + if (self._batch.n_tokens() + n_tokens > 
n_batch) or (idx_in_batch >= n_ubatch): + _decode_batch() + idx_in_batch = 0 + + # Add to Batch + self._batch.add_sequence(tokens, idx_in_batch, logits_all=logits_all) + batch_seq_lens.append(n_tokens) + idx_in_batch += 1 + + # Process Remaining Items + _decode_batch() + + if self.verbose: + llama_cpp.llama_perf_context_print(ctx) + + final_result = results[0] if is_single else results + + if return_count: + return final_result, total_tokens_processed + + return final_result + + def rank(self, query: str, documents: List[str]) -> List[float]: + """ + Calculate relevance scores for a list of documents against a query using a Reranking model. + + This method constructs a specific prompt structure ([BOS] Query [SEP] Doc [EOS]) + typically used by Cross-Encoders to estimate similarity. + + Args: + query: The search query string. + documents: A list of candidate document strings to be scored. + + Returns: + A list of float scores, where higher values indicate greater relevance. + """ + if self.pooling_type() != LLAMA_POOLING_TYPE_RANK: + raise ValueError(f"Model pooling_type is {self.pooling_type()}, but LLAMA_POOLING_TYPE_RANK is required.") + + # Prepare Special Tokens + sep_id = self.token_sep() + if sep_id == -1: sep_id = self.token_eos() + eos_id = self.token_eos() + + # Pre-process Query + q_tokens = self.tokenize(query.encode("utf-8"), add_bos=True, special=True) + # Remove the automatically added EOS token from the query + # because we need to append the separator and document tokens after it. + if q_tokens and q_tokens[-1] == eos_id: + q_tokens.pop() + + # Construct Batch Inputs + batch_inputs: List[List[int]] = [] + for doc in documents: + d_tokens = self.tokenize(doc.encode("utf-8"), add_bos=False, special=True) + full_seq = q_tokens + [sep_id] + d_tokens + # Ensure the sequence ends with an EOS token to mark the end of inference. + if not full_seq or full_seq[-1] != eos_id: + full_seq.append(eos_id) + batch_inputs.append(full_seq) + + # We use NORM_MODE_NONE because rerankers output raw logits/scores, not vectors that need normalization. + return self.embed(batch_inputs, normalize=NORM_MODE_NONE) + + def create_embedding( + self, + input: Union[str, List[str]], + model: Optional[str] = None, + normalize: int = NORM_MODE_EUCLIDEAN, + output_format: str = "json" + ) -> Union[Dict[str, Any], List[float], List[List[float]]]: + """ + High-level API compatible with OpenAI format. 
+ + Args: + output_format: + - 'json': OpenAI style dict (Default) + - 'json+': OpenAI style dict + cosineSimilarity matrix + - 'array': Raw python list (List[float] or List[List[float]]) + """ + model_name = model if model is not None else self.model_path + + # Normalize input to list + inputs_list = [input] if isinstance(input, str) else input + + # Generate Embeddings(and get token count) + embeddings, token_count = self.embed( + inputs_list, + normalize=normalize, + return_count=True + ) + + if output_format == "array": + return embeddings + + # Structure the OpenAI-style response ('json' or 'json+') + # Ensure embeddings is a list for iteration + # (If input was single string, embeddings is List[float], wrap it for the loop) + iter_embeddings = [embeddings] if isinstance(embeddings[0], float) else embeddings + + data: List[Embedding] = [ + { + "object": "embedding", + "embedding": emb, + "index": idx, + } + for idx, emb in enumerate(iter_embeddings) + ] + + response = { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": token_count, # Input consumption + "completion_tokens": 0, # The Embedding task does not generate text, so the value is 0. + "total_tokens": token_count, # Total consumption = Input consumption + Output + } + } + + # Calculate Cosine Similarity Matrix (Optimized via Numpy) + # Only if output_format is 'json+' and we have vectors + if output_format == "json+" and len(embeddings) > 1 and isinstance(embeddings[0], list): + try: + # Assuming embeddings are already L2 normalized if normalize=2 + mat = np.array(embeddings) + + # Safety check: Force normalize if not already done, to ensure Cosine (not Dot Product) + if normalize != NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(mat, axis=1, keepdims=True) + # Avoid division by zero + norm[norm == 0] = 1e-10 + mat = mat / norm + + # Matrix multiplication: A @ A.T + sim_matrix = np.dot(mat, mat.T) + response["cosineSimilarity"] = sim_matrix.tolist() + except Exception as e: + if self.verbose: + print(f"Warning: Failed to calculate similarity matrix: {e}") + + return response diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 1079c1d2e..46ae4ba1c 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -2,13 +2,15 @@ # flake8: noqa from pathlib import Path - -from itertools import groupby +import itertools +import json +import re +import sys from typing import ( Any, - Set, List, Optional, + Set, Tuple, Union, ) @@ -21,30 +23,50 @@ def __init__(self, *args, _grammar: str, **kwargs): self._grammar = _grammar self._root = LLAMA_GRAMMAR_DEFAULT_ROOT + @property + def grammar(self) -> str: + return self._grammar + @classmethod def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": return cls(_grammar=grammar) @classmethod def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": + file_path = Path(file) + + if not file_path.exists(): + raise FileNotFoundError(f"{cls.__name__}.from_file: file not found: {file_path}") + try: - with open(file) as f: - grammar = f.read() + grammar_content = file_path.read_text(encoding='utf-8') except Exception as err: - raise Exception( - f"{cls.from_file.__name__}: error reading grammar file: {err}" - ) + raise IOError(f"{cls.__name__}.from_file: error reading grammar file: {err}") - if grammar: - return cls.from_string(grammar, verbose=verbose) + if not grammar_content.strip(): + raise ValueError(f"{cls.__name__}.from_file: grammar file is empty") - raise ValueError( - 
f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty" - ) + return cls.from_string(grammar_content, verbose=verbose) @classmethod - def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar": - return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose) + def from_json_schema( + cls, + json_schema: str, + prop_order: Optional[List[str]] = None, + verbose: bool = True + ) -> "LlamaGrammar": + """ + Create a syntax object from a JSON Schema. + + json_schema: A JSON Schema string or dictionary. + prop_order: Specifies the order in which fields are generated (helps improve the stability of small models). + verbose: Whether to log. + """ + try: + gbnf_grammar_str = json_schema_to_gbnf(json_schema, prop_order=prop_order) + return cls.from_string(gbnf_grammar_str, verbose=verbose) + except Exception as e: + raise ValueError(f"{cls.__name__}.from_json_schema: conversion failed: {e}") """llama.cpp gbnf rules from vendor/llama.cpp/grammars""" @@ -242,78 +264,190 @@ def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGramm """ """llama.cpp json-schema to grammar converter from vendor/llama.cpp/examples/json-schema-to-grammar.py""" -import json -import re -from typing import List, Optional -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' +def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + if max_items == 0: + return "" -INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") -GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') -GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"'} - -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' - + if min_items == 0 and max_items == 1: + return f'{item_rule}?' -def _build_repetition( - item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False -): if not separator_rule: - if min_items == 0 and max_items == 1: - return f"{item_rule}?" - elif min_items == 1 and max_items is None: - return f"{item_rule}+" - - result = "" - - if min_items > 0: - if item_rule_is_literal and separator_rule is None: - result = '"' + (item_rule[1:-1] * min_items) + '"' + if min_items == 1 and max_items is None: + return f'{item_rule}+' + elif min_items == 0 and max_items is None: + return f'{item_rule}*' else: - result = (f" {separator_rule} " if separator_rule else " ").join( - [item_rule] * min_items - ) - - def opt_repetitions(up_to_n, prefix_with_sep=False): - """ - - n=4, no sep: '(a (a (a (a)?)?)?)?' - - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' - - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' - """ + return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}' - content = ( - f"{separator_rule} {item_rule}" - if prefix_with_sep and separator_rule - else item_rule - ) - if up_to_n == 0: - return "" - elif up_to_n == 1: - return f"({content})?" - elif separator_rule and not prefix_with_sep: - return f"({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?" - else: - return (f"({content} " * up_to_n).rstrip() + (")?" * up_to_n) + result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) + return f'({result})?' 
if min_items == 0 else result - if min_items > 0 and max_items != min_items: - result += " " +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None - if max_items is not None: - result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) - else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' - - if min_items == 0 and separator_rule: - result = f"({item_rule} {item_operator}*)?" + def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) else: - result += f"{item_operator}*" + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + range_start = '1' if top_level else '0' + if c > range_start: + 
digit_range(range_start, chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + out.append(")") + return - return result + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: @@ -321,70 +455,46 @@ def __init__(self, content: str, deps: list = None): self.content = content self.deps = deps or [] - -_up_to_15_digits = _build_repetition("[0-9]", 0, 15) +# Constraining spaces to prevent model "running away". +SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - "boolean": BuiltinRule('("true" | "false") space', []), - "decimal-part": BuiltinRule("[0-9] " + _up_to_15_digits, []), - "integral-part": BuiltinRule("[0-9] | [1-9] " + _up_to_15_digits, []), - "number": BuiltinRule( - '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', - ["integral-part", "decimal-part"], - ), - "integer": BuiltinRule('("-"? integral-part) space', ["integral-part"]), - "value": BuiltinRule( - "object | array | string | number | boolean | null", - ["object", "array", "string", "number", "boolean", "null"], - ), - "object": BuiltinRule( - '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', - ["string", "value"], - ), - "array": BuiltinRule( - '"[" space ( value ("," space value)* )? "]" space', ["value"] - ), - "uuid": BuiltinRule( - r'"\"" ' - + ' "-" '.join("[0-9a-fA-F]" * n for n in [8, 4, 4, 4, 12]) - + r' "\"" space', - [], - ), - "char": BuiltinRule( - r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', - [], - ), - "string": BuiltinRule(r'"\"" char* "\"" space', ["char"]), - "null": BuiltinRule('"null" space', []), + 'boolean' : BuiltinRule('("true" | "false") space', []), + 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), + 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? 
"]" space', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), + 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), + 'null' : BuiltinRule('"null" space', []), } # TODO: support "uri", "email" string formats STRING_FORMAT_RULES = { - "date": BuiltinRule( - '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )', - [], - ), - "time": BuiltinRule( - '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', - [], - ), - "date-time": BuiltinRule('date "T" time', ["date", "time"]), - "date-string": BuiltinRule('"\\"" date "\\"" space', ["date"]), - "time-string": BuiltinRule('"\\"" time "\\"" space', ["time"]), - "date-time-string": BuiltinRule('"\\"" date-time "\\"" space', ["date-time"]), + 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), + 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), } -DOTALL = "[\\U00000000-\\U0010FFFF]" -DOT = "[^\\x0A\\x0D]" +DOTALL = '[\\U00000000-\\U0010FFFF]' +DOT = '[^\\x0A\\x0D]' -RESERVED_NAMES = set( - ["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()] -) +RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]) +INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") +GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\\\]') +GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\]\\-\\\\]') +GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"', "-": "\\-", "]": "\\]", "\\": "\\\\"} NON_LITERAL_SET = set("|.()[]{}*+?") -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("[]()|{}*+?") - +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("^$.[]()|{}*+?") class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): @@ -393,101 +503,142 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): self._dotall = dotall self._raw_pattern = raw_pattern self._rules = { - "space": SPACE_RULE, + 'space': SPACE_RULE, } self._refs = {} self._refs_being_resolved = set() def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal ) return f'"{escaped}"' - def not_literal( - self, literal: str, dotall: bool = True, maybe_escaped_underscores=False - ) -> str: - """ - not_literal('a') -> '[^a]' - not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' - """ - assert len(literal) > 0, "Empty literal not supported" - + def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str: + ''' + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' 
+ ''' + assert len(literal) > 0, 'Empty literal not supported' def recurse(i: int): c = literal[i] - if maybe_escaped_underscores and c == "_": - yield f"[^{c}\\\\]" - yield " | " + if maybe_escaped_underscores and c == '_': + yield f'[^{c}\\\\]' + yield ' | ' yield f'"\\\\"? "{c}"' else: - yield f"[^{c}]" + yield f'[^{c}]' if i < len(literal) - 1: - yield " | " + yield ' | ' yield self._format_literal(c) - yield " (" + yield ' (' yield from recurse(i + 1) - yield ")?" - - return "".join(("(", *recurse(0), ")")) + yield ')?' + + return ''.join(('(', *recurse(0), ')')) + + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if child.children: + out.append(f' (') + visit(child) + out.append(')') + elif child.is_end_of_string: + out.append(f' {char_rule}+') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) def _add_rule(self, name, rule): - esc_name = INVALID_RULE_CHARS_RE.sub("-", name) + esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: key = esc_name else: i = 0 - while ( - f"{esc_name}{i}" in self._rules - and self._rules[f"{esc_name}{i}"] != rule - ): + while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule: i += 1 - key = f"{esc_name}{i}" + key = f'{esc_name}{i}' self._rules[key] = rule return key def resolve_refs(self, schema: dict, url: str): - """ - Resolves all $ref fields in the given schema, fetching any remote schemas, - replacing $ref with absolute reference URL and populating self._refs with the - respective referenced (sub)schema dictionaries. - """ - + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. 
+ ''' def visit(n: dict): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): - ref = n.get("$ref") + ref = n.get('$ref') if ref is not None and ref not in self._refs: - if ref.startswith("https://"): - assert ( - self._allow_fetch - ), "Fetching remote schemas is not allowed (use --allow-fetch for force)" + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' import requests - frag_split = ref.split("#") + frag_split = ref.split('#') base_url = frag_split[0] target = self._refs.get(base_url) if target is None: - target = self.resolve_refs( - requests.get(ref).json(), base_url - ) + target = self.resolve_refs(requests.get(ref).json(), base_url) self._refs[base_url] = target - if len(frag_split) == 1 or frag_split[-1] == "": + if len(frag_split) == 1 or frag_split[-1] == '': return target - elif ref.startswith("#/"): + elif ref.startswith('#/'): target = schema - ref = f"{url}{ref}" - n["$ref"] = ref + ref = f'{url}{ref}' + n['$ref'] = ref else: - raise ValueError(f"Unsupported ref {ref}") - - for sel in ref.split("#")[-1].split("/")[1:]: - assert ( - target is not None and sel in target - ), f"Error resolving ref {ref}: {sel} not in {target}" - target = target[sel] + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}' + if isinstance(target, list): + try: + sel_index = int(sel) + except ValueError: + raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}') + assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel_index] + else: + assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] self._refs[ref] = target else: @@ -495,47 +646,42 @@ def visit(n: dict): visit(v) return n - return visit(schema) def _generate_union_rule(self, name, alt_schemas): - return " | ".join( - ( - self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') - for i, alt_schema in enumerate(alt_schemas) - ) - ) + return ' | '.join(( + self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') + for i, alt_schema in enumerate(alt_schemas) + )) def _visit_pattern(self, pattern, name): - """ - Transforms a regular expression pattern into a GBNF rule. + ''' + Transforms a regular expression pattern into a GBNF rule. - Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions - Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions + Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md - Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. + Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. - Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which - we define sub-rules to keep the output lean. - """ + Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which + we define sub-rules to keep the output lean. 
+ ''' - assert pattern.startswith("^") and pattern.endswith( - "$" - ), 'Pattern must start with "^" and end with "$"' + assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' pattern = pattern[1:-1] sub_rule_ids = {} i = 0 length = len(pattern) - def to_rule(s: Tuple[str, bool]) -> str: + def to_rule(s: tuple[str, bool]) -> str: (txt, is_literal) = s - return '"' + txt + '"' if is_literal else txt + return "\"" + txt + "\"" if is_literal else txt - def transform() -> Tuple[str, bool]: - """ - Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. - """ + def transform() -> tuple[str, bool]: + ''' + Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. + ''' nonlocal i nonlocal pattern nonlocal sub_rule_ids @@ -545,7 +691,7 @@ def transform() -> Tuple[str, bool]: # We only need a flat structure here to apply repetition operators to the last item, and # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # (GBNF's syntax is luckily very close to regular expressions!) - seq: list[Tuple[str, bool]] = [] + seq: list[tuple[str, bool]] = [] def get_dot(): if self._dotall: @@ -553,72 +699,64 @@ def get_dot(): else: # Accept any character... except \n and \r line break chars (\x0A and \xOD) rule = DOT - return self._add_rule(f"dot", rule) + return self._add_rule(f'dot', rule) def join_seq(): nonlocal seq ret = [] - for is_literal, g in groupby(seq, lambda x: x[1]): + for is_literal, g in itertools.groupby(seq, lambda x: x[1]): if is_literal: - ret.append(("".join(x[0] for x in g), True)) + ret.append((''.join(x[0] for x in g), True)) else: ret.extend(g) if len(ret) == 1: return ret[0] - return (" ".join(to_rule(x) for x in seq), False) + return (' '.join(to_rule(x) for x in seq), False) while i < length: c = pattern[i] - if c == ".": + if c == '.': seq.append((get_dot(), False)) i += 1 - elif c == "(": + elif c == '(': i += 1 if i < length: - assert ( - pattern[i] != "?" 
- ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' - seq.append((f"({to_rule(transform())})", False)) - elif c == ")": + assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + seq.append((f'({to_rule(transform())})', False)) + elif c == ')': i += 1 - assert ( - start > 0 and pattern[start - 1] == "(" - ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}" + assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}' return join_seq() - elif c == "[": + elif c == '[': square_brackets = c i += 1 - while i < length and pattern[i] != "]": - if pattern[i] == "\\": - square_brackets += pattern[i : i + 2] + while i < length and pattern[i] != ']': + if pattern[i] == '\\': + square_brackets += pattern[i:i+2] i += 2 else: square_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}" - square_brackets += "]" + assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}' + square_brackets += ']' i += 1 seq.append((square_brackets, False)) - elif c == "|": - seq.append(("|", False)) + elif c == '|': + seq.append(('|', False)) i += 1 - elif c in ("*", "+", "?"): + elif c in ('*', '+', '?'): seq[-1] = (to_rule(seq[-1]) + c, False) i += 1 - elif c == "{": + elif c == '{': curly_brackets = c i += 1 - while i < length and pattern[i] != "}": + while i < length and pattern[i] != '}': curly_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}" - curly_brackets += "}" + assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}' + curly_brackets += '}' i += 1 - nums = [s.strip() for s in curly_brackets[1:-1].split(",")] + nums = [s.strip() for s in curly_brackets[1:-1].split(',')] min_times = 0 max_times = None try: @@ -630,49 +768,35 @@ def join_seq(): min_times = int(nums[0]) if nums[0] else 0 max_times = int(nums[1]) if nums[1] else None except ValueError: - raise ValueError( - f"Invalid quantifier {curly_brackets} in /{pattern}/" - ) + raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/') (sub, sub_is_literal) = seq[-1] if not sub_is_literal: id = sub_rule_ids.get(sub) if id is None: - id = self._add_rule(f"{name}-{len(sub_rule_ids) + 1}", sub) + id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) sub_rule_ids[sub] = id sub = id - seq[-1] = ( - _build_repetition( - f'"{sub}"' if sub_is_literal else sub, - min_times, - max_times, - item_rule_is_literal=sub_is_literal, - ), - False, - ) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False) else: - literal = "" + literal = '' while i < length: - if pattern[i] == "\\" and i < length - 1: + if pattern[i] == '\\' and i < length - 1: next = pattern[i + 1] if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS: i += 1 literal += pattern[i] i += 1 else: - literal += pattern[i : i + 2] + literal += pattern[i:i+2] i += 2 elif pattern[i] == '"' and not self._raw_pattern: literal += '\\"' i += 1 - elif pattern[i] not in NON_LITERAL_SET and ( - i == length - 1 - or literal == "" - or pattern[i + 1] == "." - or pattern[i + 1] not in NON_LITERAL_SET - ): + elif pattern[i] not in NON_LITERAL_SET and \ + (i == length - 1 or literal == '' or pattern[i+1] == '.' 
or pattern[i+1] not in NON_LITERAL_SET): literal += pattern[i] i += 1 else: @@ -684,15 +808,13 @@ def join_seq(): return self._add_rule( name, - ( - to_rule(transform()) - if self._raw_pattern - else '"\\"" ' + to_rule(transform()) + ' "\\"" space' - ), - ) + to_rule(transform()) if self._raw_pattern \ + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + def _resolve_ref(self, ref): - ref_name = ref.split("/")[-1] + ref_fragment = ref.split('#')[-1] + ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment) if ref_name not in self._rules and ref not in self._refs_being_resolved: self._refs_being_resolved.add(ref) resolved = self._refs[ref] @@ -704,203 +826,166 @@ def _generate_constant_rule(self, value): return self._format_literal(json.dumps(value)) def visit(self, schema, name): - schema_type = schema.get("type") - schema_format = schema.get("format") - rule_name = name + "-" if name in RESERVED_NAMES else name or "root" + schema_type = schema.get('type') + schema_format = schema.get('format') + rule_name = name + '-' if name in RESERVED_NAMES else name or 'root' - if (ref := schema.get("$ref")) is not None: + if (ref := schema.get('$ref')) is not None: return self._add_rule(rule_name, self._resolve_ref(ref)) - elif "oneOf" in schema or "anyOf" in schema: - return self._add_rule( - rule_name, - self._generate_union_rule(name, schema.get("oneOf") or schema["anyOf"]), - ) + elif 'oneOf' in schema or 'anyOf' in schema: + return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) elif isinstance(schema_type, list): - return self._add_rule( - rule_name, - self._generate_union_rule(name, [{"type": t} for t in schema_type]), - ) + return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) - elif "const" in schema: - return self._add_rule( - rule_name, self._generate_constant_rule(schema["const"]) - ) + elif 'const' in schema: + return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') - elif "enum" in schema: - rule = " | ".join((self._generate_constant_rule(v) for v in schema["enum"])) + elif 'enum' in schema: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' return self._add_rule(rule_name, rule) - elif schema_type in (None, "object") and ( - "properties" in schema - or ( - "additionalProperties" in schema - and schema["additionalProperties"] is not True - ) - ): - required = set(schema.get("required", [])) - properties = list(schema.get("properties", {}).items()) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, name, schema.get("additionalProperties") - ), - ) + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) - elif schema_type in (None, "object") and "allOf" in schema: + elif schema_type in (None, 'object', 'string') and 'allOf' in schema: required = set() properties = [] + enum_sets = [] hybrid_name = name - def add_component(comp_schema, is_required): - if (ref := comp_schema.get("$ref")) is not None: + if (ref := comp_schema.get('$ref')) is not None: comp_schema = self._refs[ref] - if "properties" in comp_schema: - for 
prop_name, prop_schema in comp_schema["properties"].items(): + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): properties.append((prop_name, prop_schema)) if is_required: required.add(prop_name) - for t in schema["allOf"]: - if "anyOf" in t: - for tt in t["anyOf"]: + if 'enum' in comp_schema: + enum_sets.append(set(comp_schema['enum'])) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: add_component(tt, is_required=False) else: add_component(t, is_required=True) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, hybrid_name, additional_properties=[] - ), - ) + if enum_sets: + enum_intersection = enum_sets[0] + for s in enum_sets[1:]: + enum_intersection &= s + + if enum_intersection: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + return self._add_rule(rule_name, rule) - elif schema_type in (None, "array") and ( - "items" in schema or "prefixItems" in schema - ): - items = schema.get("items") or schema["prefixItems"] + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) + + elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] if isinstance(items, list): return self._add_rule( rule_name, - '"[" space ' - + ' "," space '.join( + '"[" space ' + + ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') - for i, item in enumerate(items) - ) - + ' "]" space', - ) + for i, item in enumerate(items)) + + ' "]" space') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule( - rule_name, - '"[" space ' - + _build_repetition( - item_rule_name, min_items, max_items, separator_rule='"," space' - ) - + ' "]" space', - ) + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') - elif schema_type in (None, "string") and "pattern" in schema: - return self._visit_pattern(schema["pattern"], rule_name) + elif schema_type in (None, 'string') and 'pattern' in schema: + return self._visit_pattern(schema['pattern'], rule_name) - elif schema_type in (None, "string") and re.match( - r"^uuid[1-5]?$", schema_format or "" - ): + elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): return self._add_primitive( - "root" if rule_name == "root" else schema_format, - PRIMITIVE_RULES["uuid"], - ) - - elif ( - schema_type in (None, "string") - and f"{schema_format}-string" in STRING_FORMAT_RULES - ): - prim_name = f"{schema_format}-string" - return self._add_rule( - rule_name, - self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]), + 'root' if rule_name == 'root' else schema_format, + PRIMITIVE_RULES['uuid'] ) - elif schema_type == "string" and ( - "minLength" in schema or "maxLength" in schema - ): - char_rule = self._add_primitive("char", PRIMITIVE_RULES["char"]) - min_len = schema.get("minLength", 0) - max_len = schema.get("maxLength") - - return self._add_rule( - rule_name, - r'"\"" ' - + _build_repetition(char_rule, min_len, max_len) - + r' "\"" space', - ) - - elif (schema_type == "object") or (len(schema) == 0): - return self._add_rule( - rule_name, self._add_primitive("object", PRIMITIVE_RULES["object"]) - ) + elif 
schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES: + prim_name = f'{schema_format}-string' + return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name])) + + elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema): + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + min_len = schema.get('minLength', 0) + max_len = schema.get('maxLength') + + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + + elif schema_type in (None, 'integer') and \ + ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + + elif (schema_type == 'object') or (len(schema) == 0): + return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) else: - assert schema_type in PRIMITIVE_RULES, f"Unrecognized schema: {schema}" + assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return self._add_primitive( - "root" if rule_name == "root" else schema_type, - PRIMITIVE_RULES[schema_type], - ) + return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type]) def _add_primitive(self, name: str, rule: BuiltinRule): n = self._add_rule(name, rule.content) for dep in rule.deps: dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep) - assert dep_rule, f"Rule {dep} not known" + assert dep_rule, f'Rule {dep} not known' if dep not in self._rules: self._add_primitive(dep, dep_rule) return n - def _build_object_rule( - self, - properties: List[Tuple[str, Any]], - required: Set[str], - name: str, - additional_properties: Union[bool, Any], - ): + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]): prop_order = self._prop_order # sort by position in prop_order (if specified) then by original order - sorted_props = [ - kv[0] - for _, kv in sorted( - enumerate(properties), - key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]), - ) - ] + sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] prop_kv_rule_names = {} for prop_name, prop_schema in properties: - prop_rule_name = self.visit( - prop_schema, f'{name}{"-" if name else ""}{prop_name}' - ) + prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') prop_kv_rule_names[prop_name] = self._add_rule( f'{name}{"-" if name else ""}{prop_name}-kv', - rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}', + fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}' ) required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties == True or isinstance(additional_properties, dict): + 
if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' - value_rule = self.visit( - {} if additional_properties == True else additional_properties, - f"{sub_name}-value", - ) + value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ + self._add_primitive('value', PRIMITIVE_RULES['value']) + key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( - f"{sub_name}-kv", - self._add_primitive("string", PRIMITIVE_RULES["string"]) - + f' ":" space {value_rule}', + f'{sub_name}-kv', + f'{key_rule} ":" space {value_rule}' ) optional_props.append("*") @@ -908,44 +993,40 @@ def _build_object_rule( rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props) if optional_props: - rule += " (" + rule += ' (' if required_props: rule += ' "," space ( ' def get_recursive_refs(ks, first_is_optional): [k, *rest] = ks kv_rule_name = prop_kv_rule_names[k] - if k == "*": - res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', - f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*", - ) - elif first_is_optional: - res = f'( "," space {kv_rule_name} )?' + comma_ref = f'( "," space {kv_rule_name} )' + if first_is_optional: + res = comma_ref + ('*' if k == '*' else '?') else: - res = kv_rule_name + res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '') if len(rest) > 0: - res += " " + self._add_rule( + res += ' ' + self._add_rule( f'{name}{"-" if name else ""}{k}-rest', - get_recursive_refs(rest, first_is_optional=True), + get_recursive_refs(rest, first_is_optional=True) ) return res - rule += " | ".join( + rule += ' | '.join( get_recursive_refs(optional_props[i:], first_is_optional=False) for i in range(len(optional_props)) ) if required_props: - rule += " )" - rule += " )?" + rule += ' )' + rule += ' )?' 
rule += ' "}" space' return rule def format_grammar(self): - return "\n".join( - f"{name} ::= {rule}" + return '\n'.join( + f'{name} ::= {rule}' for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0]) ) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 89008043c..e2ee004de 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -127,6 +127,7 @@ class clip_flash_attn_type (enum.IntEnum): # enum clip_flash_attn_type flash_attn_type; # int image_min_tokens; # int image_max_tokens; +# bool warmup; # }; class clip_context_params(Structure): _fields_ = [ @@ -134,6 +135,7 @@ class clip_context_params(Structure): ("flash_attn_type", c_int), ("image_min_tokens", c_int), ("image_max_tokens", c_int), + ("warmup", c_bool), ] # struct mtmd_context_params { @@ -143,6 +145,7 @@ class clip_context_params(Structure): # const char * image_marker; // deprecated, use media_marker instead # const char * media_marker; # enum llama_flash_attn_type flash_attn_type; +# bool warmup; // whether to run a warmup encode pass after initialization # // limit number of image tokens, only for vision models with dynamic resolution # int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) @@ -156,6 +159,7 @@ class mtmd_context_params(Structure): ("image_marker", c_char_p), ("media_marker", c_char_p), ("flash_attn_type", c_int), + ("warmup", c_bool), ("image_min_tokens", c_int), ("image_max_tokens", c_int), ] diff --git a/pyproject.toml b/pyproject.toml index f5ae7b59c..00dbab1ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,15 +12,15 @@ authors = [ { name = "Andrei Betlen", email = "abetlen@gmail.com" }, ] dependencies = [ - "typing-extensions>=4.5.0", - "numpy>=1.20.0", - "diskcache>=5.6.1", + "typing-extensions>=4.8.0", + "numpy>=1.21.6", + "diskcache>=5.6.2", "jinja2>=2.11.3", + "Pillow>=9.5.0", ] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c7b7db044..7ac890213 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c7b7db04456da9839ebd81ecde6f488c5fe41cbc +Subproject commit 7ac8902133da6eb390c4d8368a7d252279123942