diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index 81872c0dd1..89ad20a533 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -174,7 +174,6 @@ max_out_len=MAX_NEW_TOKENS, max_seq_len=MAX_SESSION_LEN, batch_size=128, - concurrency=128, meta_template=llama2_meta_template, run_cfg=dict(num_gpus=1), end_str='[INST]') diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 84fcaf5034..f03bbf4a50 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -199,9 +199,9 @@ jobs: chmod -R 777 $workdir test_tools: - needs: test_quantization if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, linux-a100] + needs: test_quantization timeout-minutes: 150 strategy: fail-fast: false diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml new file mode 100644 index 0000000000..8b32bab1f7 --- /dev/null +++ b/.github/workflows/daily_ete_test_v100.yml @@ -0,0 +1,667 @@ +name: daily_ete_test_v100 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch', 'turbomind_vl']" + model: + required: true + description: 'Set testcase module filter: chat, restful, pipeline, quantization. 
Default contains all models' + type: string + default: "['pipeline','restful','chat']" + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + schedule: + - cron: '00 16 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: linux-v100 + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Change testconfig on v100 + run: | + mv ${{env.TEST_CODE_PATH}}/autotest/config-v100.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: linux-v100 + timeout-minutes: 180 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - convert + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: linux-v100 + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.model == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + run: | + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: linux-v100 + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ['turbomind', 'pytorch'] + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . 
+ echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . + - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: linux-v100 + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: linux-v100 + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + notify_to_feishu: + if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') + needs: [get_benchmark_result, get_coverage_report, test_evaluation] + timeout-minutes: 5 + runs-on: linux-v100 + steps: + - name: notify + if: contains(needs.*.result, 'failure') + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml new file mode 100644 index 0000000000..172667ec0c --- /dev/null +++ b/autotest/config-v100.yaml @@ -0,0 +1,131 @@ +model_path: /nvme/qa_test_models +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm-chat-20b: 2 + internlm2-chat-20b: 2 + Baichuan2-13B-Chat: 2 + Mixtral-8x7B-Instruct-v0.1: 2 + Qwen-VL-Chat: 2 + llava-v1.5-13b: 2 + internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 + Meta-Llama-3-1-70B-Instruct: 4 + internlm2_5-7b-chat-1m: 4 + Qwen2-7B-Instruct-GPTQ-Int4: 2 + InternVL2-26B: 2 + InternVL2-40B: 2 + MiniCPM-V-2_6: 2 + +turbomind_chat_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + + +pytorch_chat_model: + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - google/gemma-2-9b-it + - mistralai/Mistral-7B-Instruct-v0.2 + - THUDM/glm-4v-9b + - THUDM/glm-4-9b-chat + - microsoft/Phi-3-mini-4k-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +pytorch_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +vl_model: + - OpenGVLab/InternVL2-2B + - 
OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - internlm/internlm-xcomposer2d5-7b + - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + +pytorch_quatization: + awq: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - Qwen/Qwen2-1.5B-Instruct + w8a8: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + + +longtext_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat-1m + - internlm/internlm2-chat-20b + +benchmark_model: + - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 4e4b20f206..46b9bd9ce1 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -48,6 +48,7 @@ turbomind_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat - 01-ai/Yi-6B-Chat @@ -90,7 +91,6 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 096918b6b1..bd33ed33a0 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -15,6 +15,14 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) +from lmdeploy.utils import is_bf16_supported + + +def init_pipeline(model_path, backend_config): + if not is_bf16_supported() and isinstance(backend_config, + PytorchEngineConfig): + backend_config.dtype = 'float16' + return pipeline(model_path, backend_config=backend_config) @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -26,7 +34,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe('Hi, pls intro yourself') result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) @@ -56,7 +64,7 @@ def run_pipeline_testcase(config, 
model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer('Hi, pls intro yourself'): response.append(item) @@ -88,7 +96,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) @@ -118,7 +126,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer(['Pls intro yourself', 'Shanghai is']): response.append(item) @@ -149,7 +157,7 @@ def test_return_with_message(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = pipe(prompts) print(response) @@ -180,7 +188,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = [] for item in pipe.stream_infer(prompts): @@ -212,7 +220,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -249,7 +257,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -287,7 +295,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -320,7 +328,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = 
'/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -358,7 +366,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(session_len=10, tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result = True @@ -392,7 +400,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test min_new_tokens gen_config = GenerationConfig(min_new_tokens=200, ignore_eos=True) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -428,7 +436,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test stop_words gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -467,7 +475,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -502,7 +510,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -543,7 +551,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -587,7 +595,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1, @@ -626,7 +634,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, 
model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=1.2, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -658,7 +666,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_p=0.1, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -690,7 +698,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_k=1, max_new_tokens=20, @@ -727,7 +735,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response_list = [] for i in range(3): gen_config = GenerationConfig(random_seed=i, @@ -764,7 +772,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): @@ -798,7 +806,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(temperature=1.0, top_k=40, do_sample=True) @@ -833,7 +841,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens gen_config = GenerationConfig(max_new_tokens=5) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -870,7 +878,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens with ignore_eos gen_config = GenerationConfig(ignore_eos=True, max_new_tokens=256) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -907,7 +915,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, 
backend_config=backend_config) with pytest.raises(AssertionError): gen_config = GenerationConfig(top_p=0) pipe('Shanghai is', gen_config=gen_config) @@ -1018,7 +1026,7 @@ def test_backend_config_tp(config, model, backend, worker_id): worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) del pipe torch.cuda.empty_cache() if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 0ea643524f..c80dbe0dfc 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -7,6 +7,8 @@ from utils.config_utils import get_workerid from utils.run_restful_chat import health_check +from lmdeploy.utils import is_bf16_supported + DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -40,6 +42,8 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' @@ -105,6 +109,8 @@ def throughput_test(config, run_config = '--num-prompts 3000' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index ca041dc9a1..8aa5f933fb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -4,6 +4,8 @@ import yaml from utils.get_run_config import get_tp_num +from lmdeploy.utils import is_bf16_supported + def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model', @@ -85,14 +87,16 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - for case in get_torch_model_list(tp_num=tp_num, - quant_policy=quant_policy, - model_type=model_type): - if case not in case_list: - case_list.append(case) + if is_bf16_supported(): + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) return [x for x in case_list if 'w8a8' not in x] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 33d65448ab..1ab34b23d5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -10,6 +10,7 @@ from lmdeploy import pipeline from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -32,6 +33,8 @@ def run_pipeline_chat_test(config, if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp) @@ -292,6 +295,10 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.model_format = 'awq' if quant_policy is not None: backend_config.quant_policy = quant_policy + + if not is_bf16_supported(): + 
backend_config.cache_max_entry_count = 0.5 + backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 75b7319aeb..752168958a 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,6 +2,8 @@ import subprocess from subprocess import PIPE +from lmdeploy.utils import is_bf16_supported + def quantization(config, quantization_model_name, @@ -21,17 +23,17 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'gptq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ @@ -40,6 +42,11 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale True' + if not is_bf16_supported(): + quantization_cmd += ' --batch-size 8' + else: + quantization_cmd += ' --batch-size 32' + with open(quantization_log, 'w') as f: # remove existing folder subprocess.run([' '.join(['rm -rf', quantization_model_path])], diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index edc2268e30..529bf4a6a0 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -4,6 +4,8 @@ from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result +from lmdeploy.utils import is_bf16_supported + TEMPLATE = 'autotest/template.json' @@ -63,6 +65,9 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) + if type == 'pytorch': + if not is_bf16_supported(): + cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 1eb84f1d93..c567db4d00 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -13,6 +13,7 @@ from utils.run_client_chat import command_line_test from lmdeploy.serve.openai.api_client import APIClient +from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,12 +61,17 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' + if not is_bf16_supported(): + cmd += ' --dtype float16' if 'llava' in model: cmd += ' --model-name vicuna' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' + if not is_bf16_supported(): + cmd += ' --cache-max-entry-count 0.5' + start_log = os.path.join( log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -87,13 +93,18 @@ def start_restful_api(config, param, model, model_path, backend_type, content = 
file.read() print(content) start_time = int(time()) + + start_timeout = 300 + if not is_bf16_supported(): + start_timeout = 600 + sleep(5) - for i in range(300): + for i in range(start_timeout): sleep(1) end_time = int(time()) total_time = end_time - start_time result = health_check(http_url) - if result or total_time >= 300: + if result or total_time >= start_timeout: break allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index b28937dd4c..952de5d9f7 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -349,6 +349,7 @@ def parse_args(): session_len_act = ArgumentHelper.session_len(pt_group, default=2048) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -358,6 +359,7 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(rope_scaling_factor_act) + tb_group._group_actions.append(dtype_act) ArgumentHelper.model_format(tb_group, default='hf') args = parser.parse_args() return args @@ -416,6 +418,7 @@ def main(): rope_scaling_factor=args.rope_scaling_factor, tp=args.tp, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -426,6 +429,7 @@ def main(): thread_safe=True, eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) gen_config = GenerationConfig(top_k=args.top_k, top_p=args.top_p, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 9d573d51b1..58786d9c80 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -289,6 +289,7 @@ def parse_args(): cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -298,6 +299,8 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(quant_policy_act) + tb_group._group_actions.append(dtype_act) + ArgumentHelper.model_format(tb_group, default='hf') ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) @@ -321,6 +324,7 @@ def main(): num_tokens_per_iter=args.num_tokens_per_iter, max_prefill_iters=args.max_prefill_iters, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -333,6 +337,7 @@ def main(): eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, + dtype=args.dtype, ) engine = Engine(args.model_path, engine_config, csv=args.csv) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 402ac50fbf..a5400ed64d 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -23,8 +23,8 @@ The Docker version is supposed to be 
no less than `18.03`. And `Ascend Docker Ru #### Ascend Drivers, Firmware and CANN -The target machine needs to install the Huawei driver and firmware version 23.0.3, refer to -[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +The target machine needs to install the Huawei driver and firmware version not lower than 23.0.3, refer to +[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html) and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha). And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run`, `Ascend-cann-nnal_*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index ab4035a8cc..887e2e0a3e 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids) ppl = pipe.get_ppl(input_ids) ``` +```{note} +get_ppl returns the cross entropy loss without applying the exponential operation afterwards +``` + - **Below is an example for pytorch backend. Please install triton first.** ```shell diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd38a60025..1f344e78bb 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. 
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index cd38a60025..1f344e78bb 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
-| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
+| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md
index 78bd8383d4..e00c1e173a 100644
--- a/docs/zh_cn/get_started/ascend/get_started.md
+++ b/docs/zh_cn/get_started/ascend/get_started.md
@@ -22,8 +22,8 @@ Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hias
 
 #### Drivers,Firmware 和 CANN
 
-目标机器需安装华为驱动程序和固件版本 23.0.3,请参考
-[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html)
+目标机器需安装华为驱动程序和固件版本至少为 23.0.3,请参考
+[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html)
 和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)。
 另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。
 
diff --git a/docs/zh_cn/llm/pipeline.md b/docs/zh_cn/llm/pipeline.md
index a9c74a5f14..40406c85a4 100644
--- a/docs/zh_cn/llm/pipeline.md
+++ b/docs/zh_cn/llm/pipeline.md
@@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids)
 ppl = pipe.get_ppl(input_ids)
 ```
 
+```{note}
+get_ppl 返回的是 cross entropy loss,没有在之后加 exp 操作
+```
+
 - **使用 pytorch 后端**
 
 需要先安装 triton
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 9bdbf0d45d..ac061cf1ae 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -51,7 +51,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
-| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
+| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index 4ae21e3f45..77ff74e234 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -253,9 +253,11 @@ def export(self, out_dir):
 
         inp_stats = self.collect_inputs_stats()
         torch.save(inp_stats, out_dir / 'inputs_stats.pth')
+        torch.cuda.empty_cache()
 
         out_stats = self.collect_outputs_stats()
         torch.save(out_stats, out_dir / 'outputs_stats.pth')
+        torch.cuda.empty_cache()
 
     def calibrate(self, data):
         """Forward pass through the model in inference mode with given data."""
@@ -267,6 +269,7 @@ def calibrate(self, data):
         model = self.model.model
         with torch.inference_mode():
             _ = model(data.to(self.device))
+        torch.cuda.empty_cache()
 
     def __enter__(self):
         """Prepares the Calibration object for a 'with' statement by
@@ -440,6 +443,7 @@ def export(self, out_dir):
             inputs_stats['absmean'][name] = obs.absmean_val
             inputs_stats['ratios'][name] = obs.ratio
         torch.save(inputs_stats, out_dir / 'inputs_stats.pth')
+        torch.cuda.empty_cache()
 
     def _wrap_decoder_layers_for_search(self):
         """Method to wrap the decoder layers' forward functions for observing
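The calibration changes insert `torch.cuda.empty_cache()` between memory-heavy stages so cached activations from one stage do not crowd out the next. A generic sketch of that pattern follows; the stage function and file names are stand-ins, not lmdeploy internals.

```python
# Minimal sketch of the pattern applied in the calibration patch: release
# freed-but-cached CUDA blocks between memory-heavy stages. The stage
# function below is a stand-in, not an lmdeploy API.
import torch


def collect_stats(n: int) -> torch.Tensor:
    """Stand-in for a stage that briefly allocates large GPU tensors."""
    x = torch.randn(n, n, device='cuda')
    return (x @ x).abs().mean(dim=0).cpu()


if torch.cuda.is_available():
    for name in ('inputs_stats', 'outputs_stats'):
        stats = collect_stats(4096)
        torch.save(stats, f'{name}.pth')
        # Cached blocks would otherwise stay reserved and can push the next
        # stage into an OOM; empty_cache() hands them back to the driver.
        torch.cuda.empty_cache()
```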
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
index 84e3fba8fb..c713e3ec85 100644
--- a/lmdeploy/pytorch/engine/model_agent.py
+++ b/lmdeploy/pytorch/engine/model_agent.py
@@ -500,15 +500,22 @@ def _start_tp_process(proc_id: int,
 def _check_context_alive(mp_context: mp.ProcessContext):
     """check context alive."""
     procs: List[mp.Process] = mp_context.processes
-    failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive())
-    if len(failed_ranks) == 0:
+    failed_procs = list(idx for idx, p in enumerate(procs) if not p.is_alive())
+    if len(failed_procs) == 0:
         return
-    for p in procs:
+
+    log_procs = []
+    for idx, p in enumerate(procs):
         if p.is_alive():
             p.terminate()
         else:
+            exitcode = p.exitcode
+            if exitcode > 0:
+                # terminated exitcode < 0
+                log_procs.append((idx, exitcode))
             p.close()
-    logger.error(f'TP process {failed_ranks} failed.')
+    for idx, exitcode in log_procs:
+        logger.error(f'TP process {idx} failed with exitcode {exitcode}.')
     # TODO: not safe exit.
     os._exit(1)
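The reworked `_check_context_alive` reports the exit code of each dead rank instead of only the rank indices: a positive code means the process crashed on its own, a negative code means it was killed by a signal. Here is a standard-library sketch of that convention with a stand-in worker; it is not TP-engine code.

```python
# Minimal sketch (standard library only) of the exitcode convention the
# patch relies on: a dead multiprocessing.Process reports exitcode > 0 for
# a crash (sys.exit(n) / unhandled exception) and a negative value when it
# was killed by a signal, e.g. -9 for SIGKILL.
import multiprocessing as mp


def worker(rank: int):
    if rank == 1:
        raise SystemExit(3)  # simulate a crashing TP rank


if __name__ == '__main__':
    procs = [mp.Process(target=worker, args=(r,)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    for rank, p in enumerate(procs):
        if p.exitcode and p.exitcode > 0:
            print(f'rank {rank} failed with exitcode {p.exitcode}')
        p.close()
```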
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 019a617acd..a12cadaa7d 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest,
     - ignore_eos (bool): indicator for ignoring eos
     - skip_special_tokens (bool): Whether or not to remove special tokens
         in the decoding. Default to be True.
+    - min_new_tokens (int): To generate at least numbers of tokens.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, comparably
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values)
 
     Currently we do not support the following features:
     - presence_penalty (replaced with repetition_penalty)
@@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
         skip_special_tokens=request.skip_special_tokens,
         response_format=response_format,
         logits_processors=logits_processors,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
 
     tools = None
@@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest,
         in the decoding. Default to be True.
     - adapter_name (str): For slora inference. Choose which lora to do the
         inference.
+    - min_new_tokens (int): To generate at least numbers of tokens.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, comparably
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values)
     """
     if request.cancel:
         if request.session_id != -1:
@@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest,
         ignore_eos=request.ignore_eos,
         stop_words=request.stop,
         skip_special_tokens=request.skip_special_tokens,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
     if request.image_url:
         from lmdeploy.vl import load_image
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index bd54028c39..d4bf8ed315 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel):
     skip_special_tokens: Optional[bool] = True
     top_k: Optional[int] = 40
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class FunctionResponse(BaseModel):
@@ -339,6 +341,8 @@ class GenerateRequest(BaseModel):
     cancel: Optional[bool] = False  # cancel a responding request
     adapter_name: Optional[str] = Field(default=None, examples=[None])
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class GenerateResponse(BaseModel):
diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py
index 4791d3c724..3a16f0a65b 100644
--- a/lmdeploy/serve/utils.py
+++ b/lmdeploy/serve/utils.py
@@ -212,8 +212,8 @@ def get_ppl(self, input_ids: Union[List[int],
         logger.info(f'sorted indices: {indices}')
         for (start, end) in self._batch_iterator(sizes, max_input_len):
             logger.info(f'start: {start}, end: {end}')
-            _input_ids = [input_ids[indices[i]] for i in range(start, end)]
             if start == end:
+                _input_ids = input_ids[indices[start]]
                 loss, target_count = self._get_long_text_ppl(
                     generator=generator,
                     input_ids=_input_ids,
@@ -221,6 +221,7 @@ def get_ppl(self, input_ids: Union[List[int],
                 losses.append(loss)
                 target_counts.append(target_count)
             else:
+                _input_ids = [input_ids[indices[i]] for i in range(start, end)]
                 loss, target_count = self._get_ppl(
                     generator=generator,
                     input_ids=_input_ids,
@@ -261,24 +262,24 @@ def _batch_iterator(self, sizes, max_value):
             i += 1
 
     def _get_long_text_ppl(self, generator, input_ids, max_input_len):
-        assert isinstance(input_ids, List) and len(input_ids) == 1
-        seq_len = len(input_ids[0])
+        assert all(isinstance(_, int) for _ in input_ids)
+        seq_len = len(input_ids)
         assert seq_len > max_input_len
         logger.info(f'get long text ppl: seq_len {seq_len}')
 
         losses = []
         target_counts = []
         for i in range(0, seq_len, max_input_len):
-            token_ids = input_ids[:, i:i + max_input_len]
+            token_ids = input_ids[i:i + max_input_len]
             step = [i]
             # shift token_ids by 1 to the left
-            target_ids = input_ids[:, i + 1:i + 1 + max_input_len]
+            target_ids = input_ids[i + 1:i + 1 + max_input_len]
 
             loss, target_count = self._get_ppl(
                 generator=generator,
-                input_ids=token_ids,
+                input_ids=[token_ids],
                 max_input_len=max_input_len,
-                target_ids=target_ids,
+                target_ids=[target_ids],
                 steps=step,
                 sequence_start=(i == 0),
                 sequence_end=(i + max_input_len >= seq_len))
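With `min_new_tokens` and `min_p` now accepted by both endpoints, here is a hedged example request against `/v1/chat/completions`. It assumes an api_server is already listening on localhost:23333; the model name and values are illustrative.

```python
# Minimal sketch: exercising the new sampling fields through the
# OpenAI-compatible endpoint. Assumes `lmdeploy serve api_server ...` is
# already running on localhost:23333; the model name is illustrative.
import requests

payload = {
    'model': 'internlm2_5-7b-chat',
    'messages': [{'role': 'user', 'content': 'Write a haiku about GPUs.'}],
    'min_new_tokens': 16,  # force at least 16 generated tokens
    'min_p': 0.05,         # drop tokens below 5% of the top token's probability
}
resp = requests.post('http://localhost:23333/v1/chat/completions', json=payload)
print(resp.json()['choices'][0]['message']['content'])
```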
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple
 
-__version__ = '0.6.2'
+__version__ = '0.6.2.post1'
 short_version = __version__
 
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
index 7ed657a9b8..2d68ef3535 100644
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
+++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -303,6 +303,43 @@ void loadWeights(
     }
 }
 
+template<typename T>
+void loadWeights(LlamaDenseWeight<T>& w, std::string prefix, FtCudaDataType model_file_type)
+{
+    auto weight_file  = prefix + ".weight";
+    auto qweight_file = prefix + ".qweight";
+
+    if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) {
+        TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str());
+        FT_CHECK(false);
+    }
+
+    size_t     dim0 = w.input_dims;
+    size_t     dim1 = w.output_dims;
+    const auto type = model_file_type;
+
+    if (w.bias) {
+        loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type);
+    }
+    const size_t bit_size = getBitSize(w.type);
+    if (bit_size >= 16) {  // fp16, fp32
+        loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type);
+    }
+    else {  // int8, int4
+        const int factor = sizeof(float) * 8 / bit_size;
+
+        FT_CHECK(dim1 % factor == 0);
+
+        std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+        loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8);
+
+        const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+
+        loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type);
+        loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type);
+    }
+}
+
 template<typename T>
 void LlamaDecoderLayerWeight<T>::mallocWeights()
 {
@@ -357,10 +394,22 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
     loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type, tensor_para_size_);
 
     loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_);
-
-    loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_);
-    loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_);
-    loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_);
+    if (moe_weights.experts.empty()) {
+        loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_);
+        loadWeights(
+            ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_);
+        loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_);
+    }
+    else {
+        loadWeights(moe_weights.gate, dir_path + ".moe_ffn.gate", type);
+        for (size_t i = 0; i < moe_weights.experts.size(); ++i) {
+            std::string weight_name = dir_path + ".moe_ffn.experts." + std::to_string(i);
+            loadWeights(moe_weights.experts[i].gating, weight_name + ".w1", tensor_para_rank_, type, tensor_para_size_);
+            loadWeights(
+                moe_weights.experts[i].intermediate, weight_name + ".w3", tensor_para_rank_, type, tensor_para_size_);
+            loadWeights(moe_weights.experts[i].output, weight_name + ".w2", tensor_para_rank_, type, tensor_para_size_);
+        }
+    }
 }
 
 template<typename T>
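The quantized branch of the new `loadWeights` overload packs int4/int8 values into `uint32` words and stores per-group scales and zero points. Below is a Python sketch of the same shape arithmetic; variable names mirror the C++, and the dimensions are illustrative.

```python
# Minimal sketch of the packed-weight shape math used in the quantized
# branch above, written in Python for clarity. `bit_size`, `dim0`, `dim1`
# and `group_size` mirror the C++ variables; the values are illustrative.
def qweight_shape(dim0: int, dim1: int, bit_size: int) -> tuple[int, int]:
    """Byte shape of a [dim0, dim1] weight packed `factor` values per uint32."""
    factor = 32 // bit_size          # values packed into one 32-bit word (8 for int4)
    assert dim1 % factor == 0
    return dim0, dim1 // factor * 4  # 4 bytes per packed uint32 word


def scales_shape(dim0: int, dim1: int, group_size: int) -> tuple[int, int]:
    """One scale (and zero point) per `group_size` input rows, per output column."""
    group_count = dim0 // group_size if group_size > 0 else 1
    return group_count, dim1


print(qweight_shape(4096, 4096, 4))   # (4096, 2048): bytes of packed int4
print(scales_shape(4096, 4096, 128))  # (32, 4096): fp16 scales / zeros
```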
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 44f73370da..0e9309d9be 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -333,12 +333,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     }
     else {
         moe_param_.method = ft::MoeParam::kFused;
-        // Note: This will fail when GPUs of different SMs are mixed
-        if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) {
-            // On sm90 the cuBLAS method may be faster as our grouped GEMM is not
-            // optimized for GMMA yet
-            moe_param_.method = ft::MoeParam::kNaive;
-        }
     }
 
     TM_LOG_INFO("%s", toString().c_str());
@@ -377,6 +371,10 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSharedModelInstance(
                                                       shared_state_,
                                                       device_id);
 
+    // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang
+    // due to concurrent kernel launch & cudaMallocHost
+    shared_state_->barrier->wait();
+
     engine->Start();
 
     return engine;
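The added `barrier->wait()` makes every rank finish allocating its pinned buffers before any rank starts the engine and its GEMM tuning. A standard-library sketch of that barrier pattern follows; the worker bodies are stand-ins, not TurboMind code.

```python
# Minimal sketch (standard library) of the synchronization added before
# engine->Start(): all ranks complete the allocation phase before any rank
# enters the phase that must not overlap with it.
import threading

NUM_RANKS = 4
barrier = threading.Barrier(NUM_RANKS)


def rank_main(rank: int):
    print(f'rank {rank}: allocating pinned buffers')
    barrier.wait()  # nobody starts tuning until every rank finished allocating
    print(f'rank {rank}: start engine / gemm tuning')


threads = [threading.Thread(target=rank_main, args=(r,)) for r in range(NUM_RANKS)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```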