diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index 81872c0dd1..89ad20a533 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -174,7 +174,6 @@ max_out_len=MAX_NEW_TOKENS, max_seq_len=MAX_SESSION_LEN, batch_size=128, - concurrency=128, meta_template=llama2_meta_template, run_cfg=dict(num_gpus=1), end_str='[INST]') diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 84fcaf5034..f03bbf4a50 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -199,9 +199,9 @@ jobs: chmod -R 777 $workdir test_tools: - needs: test_quantization if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, linux-a100] + needs: test_quantization timeout-minutes: 150 strategy: fail-fast: false diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml new file mode 100644 index 0000000000..8b32bab1f7 --- /dev/null +++ b/.github/workflows/daily_ete_test_v100.yml @@ -0,0 +1,667 @@ +name: daily_ete_test_v100 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch', 'turbomind_vl']" + model: + required: true + description: 'Set testcase module filter: chat, restful, pipeline, quantization. 
Default contains all models' + type: string + default: "['pipeline','restful','chat']" + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + schedule: + - cron: '00 16 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: linux-v100 + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Change testconfig on v100 + run: | + mv ${{env.TEST_CODE_PATH}}/autotest/config-v100.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: linux-v100 + timeout-minutes: 180 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - convert + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: linux-v100 + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.model == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + run: | + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: linux-v100 + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ['turbomind', 'pytorch'] + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . 
+ echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . + - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: linux-v100 + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: linux-v100 + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + notify_to_feishu: + if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') + needs: [get_benchmark_result, get_coverage_report, test_evaluation] + timeout-minutes: 5 + runs-on: linux-v100 + steps: + - name: notify + if: contains(needs.*.result, 'failure') + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml new file mode 100644 index 0000000000..172667ec0c --- /dev/null +++ b/autotest/config-v100.yaml @@ -0,0 +1,131 @@ +model_path: /nvme/qa_test_models +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm-chat-20b: 2 + internlm2-chat-20b: 2 + Baichuan2-13B-Chat: 2 + Mixtral-8x7B-Instruct-v0.1: 2 + Qwen-VL-Chat: 2 + llava-v1.5-13b: 2 + internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 + Meta-Llama-3-1-70B-Instruct: 4 + internlm2_5-7b-chat-1m: 4 + Qwen2-7B-Instruct-GPTQ-Int4: 2 + InternVL2-26B: 2 + InternVL2-40B: 2 + MiniCPM-V-2_6: 2 + +turbomind_chat_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + + +pytorch_chat_model: + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - google/gemma-2-9b-it + - mistralai/Mistral-7B-Instruct-v0.2 + - THUDM/glm-4v-9b + - THUDM/glm-4-9b-chat + - microsoft/Phi-3-mini-4k-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +pytorch_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +vl_model: + - OpenGVLab/InternVL2-2B + - 
OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - internlm/internlm-xcomposer2d5-7b + - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + +pytorch_quatization: + awq: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - Qwen/Qwen2-1.5B-Instruct + w8a8: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + + +longtext_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat-1m + - internlm/internlm2-chat-20b + +benchmark_model: + - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 4e4b20f206..46b9bd9ce1 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -48,6 +48,7 @@ turbomind_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat - 01-ai/Yi-6B-Chat @@ -90,7 +91,6 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 096918b6b1..bd33ed33a0 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -15,6 +15,14 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) +from lmdeploy.utils import is_bf16_supported + + +def init_pipeline(model_path, backend_config): + if not is_bf16_supported() and isinstance(backend_config, + PytorchEngineConfig): + backend_config.dtype = 'float16' + return pipeline(model_path, backend_config=backend_config) @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -26,7 +34,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe('Hi, pls intro yourself') result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) @@ -56,7 +64,7 @@ def run_pipeline_testcase(config, 
model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer('Hi, pls intro yourself'): response.append(item) @@ -88,7 +96,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) @@ -118,7 +126,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer(['Pls intro yourself', 'Shanghai is']): response.append(item) @@ -149,7 +157,7 @@ def test_return_with_message(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = pipe(prompts) print(response) @@ -180,7 +188,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = [] for item in pipe.stream_infer(prompts): @@ -212,7 +220,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -249,7 +257,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -287,7 +295,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -320,7 +328,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = 
'/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -358,7 +366,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(session_len=10, tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result = True @@ -392,7 +400,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test min_new_tokens gen_config = GenerationConfig(min_new_tokens=200, ignore_eos=True) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -428,7 +436,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test stop_words gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -467,7 +475,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -502,7 +510,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -543,7 +551,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -587,7 +595,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1, @@ -626,7 +634,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, 
model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=1.2, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -658,7 +666,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_p=0.1, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -690,7 +698,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_k=1, max_new_tokens=20, @@ -727,7 +735,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response_list = [] for i in range(3): gen_config = GenerationConfig(random_seed=i, @@ -764,7 +772,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): @@ -798,7 +806,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(temperature=1.0, top_k=40, do_sample=True) @@ -833,7 +841,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens gen_config = GenerationConfig(max_new_tokens=5) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -870,7 +878,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens with ignore_eos gen_config = GenerationConfig(ignore_eos=True, max_new_tokens=256) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -907,7 +915,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, 
backend_config=backend_config) with pytest.raises(AssertionError): gen_config = GenerationConfig(top_p=0) pipe('Shanghai is', gen_config=gen_config) @@ -1018,7 +1026,7 @@ def test_backend_config_tp(config, model, backend, worker_id): worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) del pipe torch.cuda.empty_cache() if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 0ea643524f..c80dbe0dfc 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -7,6 +7,8 @@ from utils.config_utils import get_workerid from utils.run_restful_chat import health_check +from lmdeploy.utils import is_bf16_supported + DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -40,6 +42,8 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' @@ -105,6 +109,8 @@ def throughput_test(config, run_config = '--num-prompts 3000' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index ca041dc9a1..8aa5f933fb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -4,6 +4,8 @@ import yaml from utils.get_run_config import get_tp_num +from lmdeploy.utils import is_bf16_supported + def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model', @@ -85,14 +87,16 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - for case in get_torch_model_list(tp_num=tp_num, - quant_policy=quant_policy, - model_type=model_type): - if case not in case_list: - case_list.append(case) + if is_bf16_supported(): + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) return [x for x in case_list if 'w8a8' not in x] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 33d65448ab..1ab34b23d5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -10,6 +10,7 @@ from lmdeploy import pipeline from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -32,6 +33,8 @@ def run_pipeline_chat_test(config, if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp) @@ -292,6 +295,10 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.model_format = 'awq' if quant_policy is not None: backend_config.quant_policy = quant_policy + + if not is_bf16_supported(): + 
backend_config.cache_max_entry_count = 0.5 + backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 75b7319aeb..752168958a 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,6 +2,8 @@ import subprocess from subprocess import PIPE +from lmdeploy.utils import is_bf16_supported + def quantization(config, quantization_model_name, @@ -21,17 +23,17 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'gptq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ @@ -40,6 +42,11 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale True' + if not is_bf16_supported(): + quantization_cmd += ' --batch-size 8' + else: + quantization_cmd += ' --batch-size 32' + with open(quantization_log, 'w') as f: # remove existing folder subprocess.run([' '.join(['rm -rf', quantization_model_path])], diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index edc2268e30..529bf4a6a0 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -4,6 +4,8 @@ from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result +from lmdeploy.utils import is_bf16_supported + TEMPLATE = 'autotest/template.json' @@ -63,6 +65,9 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) + if type == 'pytorch': + if not is_bf16_supported(): + cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 1eb84f1d93..c567db4d00 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -13,6 +13,7 @@ from utils.run_client_chat import command_line_test from lmdeploy.serve.openai.api_client import APIClient +from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,12 +61,17 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' + if not is_bf16_supported(): + cmd += ' --dtype float16' if 'llava' in model: cmd += ' --model-name vicuna' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' + if not is_bf16_supported(): + cmd += ' --cache-max-entry-count 0.5' + start_log = os.path.join( log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -87,13 +93,18 @@ def start_restful_api(config, param, model, model_path, backend_type, content = 
file.read() print(content) start_time = int(time()) + + start_timeout = 300 + if not is_bf16_supported(): + start_timeout = 600 + sleep(5) - for i in range(300): + for i in range(start_timeout): sleep(1) end_time = int(time()) total_time = end_time - start_time result = health_check(http_url) - if result or total_time >= 300: + if result or total_time >= start_timeout: break allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index b28937dd4c..952de5d9f7 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -349,6 +349,7 @@ def parse_args(): session_len_act = ArgumentHelper.session_len(pt_group, default=2048) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -358,6 +359,7 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(rope_scaling_factor_act) + tb_group._group_actions.append(dtype_act) ArgumentHelper.model_format(tb_group, default='hf') args = parser.parse_args() return args @@ -416,6 +418,7 @@ def main(): rope_scaling_factor=args.rope_scaling_factor, tp=args.tp, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -426,6 +429,7 @@ def main(): thread_safe=True, eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) gen_config = GenerationConfig(top_k=args.top_k, top_p=args.top_p, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 9d573d51b1..58786d9c80 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -289,6 +289,7 @@ def parse_args(): cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -298,6 +299,8 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(quant_policy_act) + tb_group._group_actions.append(dtype_act) + ArgumentHelper.model_format(tb_group, default='hf') ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) @@ -321,6 +324,7 @@ def main(): num_tokens_per_iter=args.num_tokens_per_iter, max_prefill_iters=args.max_prefill_iters, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -333,6 +337,7 @@ def main(): eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, + dtype=args.dtype, ) engine = Engine(args.model_path, engine_config, csv=args.csv) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 402ac50fbf..a5400ed64d 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -23,8 +23,8 @@ The Docker version is supposed to be 
no less than `18.03`. And `Ascend Docker Ru #### Ascend Drivers, Firmware and CANN -The target machine needs to install the Huawei driver and firmware version 23.0.3, refer to -[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +The target machine needs to install the Huawei driver and firmware version not lower than 23.0.3, refer to +[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html) and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha). And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run`, `Ascend-cann-nnal_*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index ab4035a8cc..887e2e0a3e 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids) ppl = pipe.get_ppl(input_ids) ``` +```{note} +get_ppl returns the cross entropy loss without applying the exponential operation afterwards +``` + - **Below is an example for pytorch backend. Please install triton first.** ```shell diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd38a60025..1f344e78bb 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. 
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index cd38a60025..1f344e78bb 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
-| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
+| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md
index 78bd8383d4..e00c1e173a 100644
--- a/docs/zh_cn/get_started/ascend/get_started.md
+++ b/docs/zh_cn/get_started/ascend/get_started.md
@@ -22,8 +22,8 @@ Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hias
 
 #### Drivers,Firmware 和 CANN
 
-目标机器需安装华为驱动程序和固件版本 23.0.3,请参考
-[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html)
+目标机器需安装华为驱动程序和固件版本至少为 23.0.3,请参考
+[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html)
 和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)。
 另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。
 
diff --git a/docs/zh_cn/llm/pipeline.md b/docs/zh_cn/llm/pipeline.md
index a9c74a5f14..40406c85a4 100644
--- a/docs/zh_cn/llm/pipeline.md
+++ b/docs/zh_cn/llm/pipeline.md
@@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids)
 ppl = pipe.get_ppl(input_ids)
 ```
 
+```{note}
+get_ppl 返回的是 cross entropy loss,没有在之后加 exp 操作
+```
+
 - **使用 pytorch 后端**
 
 需要先安装 triton
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 9bdbf0d45d..ac061cf1ae 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -51,7 +51,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
-| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
+| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index 4ae21e3f45..77ff74e234 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -253,9 +253,11 @@ def export(self, out_dir):
 
         inp_stats = self.collect_inputs_stats()
         torch.save(inp_stats, out_dir / 'inputs_stats.pth')
+        torch.cuda.empty_cache()
 
         out_stats = self.collect_outputs_stats()
         torch.save(out_stats, out_dir / 'outputs_stats.pth')
+        torch.cuda.empty_cache()
 
     def calibrate(self, data):
         """Forward pass through the model in inference mode with given data."""
@@ -267,6 +269,7 @@ def calibrate(self, data):
         model = self.model.model
         with torch.inference_mode():
             _ = model(data.to(self.device))
+        torch.cuda.empty_cache()
 
     def __enter__(self):
         """Prepares the Calibration object for a 'with' statement by
@@ -440,6 +443,7 @@ def export(self, out_dir):
             inputs_stats['absmean'][name] = obs.absmean_val
             inputs_stats['ratios'][name] = obs.ratio
         torch.save(inputs_stats, out_dir / 'inputs_stats.pth')
+        torch.cuda.empty_cache()
 
     def _wrap_decoder_layers_for_search(self):
         """Method to wrap the decoder layers' forward functions for observing
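The calibration changes insert `torch.cuda.empty_cache()` between memory-heavy stages so cached activations from one stage do not crowd out the next. A generic sketch of that pattern follows; the stage function and file names are stand-ins, not lmdeploy internals.

```python
# Minimal sketch of the pattern applied in the calibration patch: release
# freed-but-cached CUDA blocks between memory-heavy stages. The stage
# function below is a stand-in, not an lmdeploy API.
import torch


def collect_stats(n: int) -> torch.Tensor:
    """Stand-in for a stage that briefly allocates large GPU tensors."""
    x = torch.randn(n, n, device='cuda')
    return (x @ x).abs().mean(dim=0).cpu()


if torch.cuda.is_available():
    for name in ('inputs_stats', 'outputs_stats'):
        stats = collect_stats(4096)
        torch.save(stats, f'{name}.pth')
        # Cached blocks would otherwise stay reserved and can push the next
        # stage into an OOM; empty_cache() hands them back to the driver.
        torch.cuda.empty_cache()
```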
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
index 84e3fba8fb..c713e3ec85 100644
--- a/lmdeploy/pytorch/engine/model_agent.py
+++ b/lmdeploy/pytorch/engine/model_agent.py
@@ -500,15 +500,22 @@ def _start_tp_process(proc_id: int,
 def _check_context_alive(mp_context: mp.ProcessContext):
     """check context alive."""
     procs: List[mp.Process] = mp_context.processes
-    failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive())
-    if len(failed_ranks) == 0:
+    failed_procs = list(idx for idx, p in enumerate(procs) if not p.is_alive())
+    if len(failed_procs) == 0:
         return
-    for p in procs:
+
+    log_procs = []
+    for idx, p in enumerate(procs):
         if p.is_alive():
             p.terminate()
         else:
+            exitcode = p.exitcode
+            if exitcode > 0:
+                # terminated exitcode < 0
+                log_procs.append((idx, exitcode))
             p.close()
-    logger.error(f'TP process {failed_ranks} failed.')
+    for idx, exitcode in log_procs:
+        logger.error(f'TP process {idx} failed with exitcode {exitcode}.')
     # TODO: not safe exit.
     os._exit(1)
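The reworked `_check_context_alive` reports the exit code of each dead rank instead of only the rank indices: a positive code means the process crashed on its own, a negative code means it was killed by a signal. Here is a standard-library sketch of that convention with a stand-in worker; it is not TP-engine code.

```python
# Minimal sketch (standard library only) of the exitcode convention the
# patch relies on: a dead multiprocessing.Process reports exitcode > 0 for
# a crash (sys.exit(n) / unhandled exception) and a negative value when it
# was killed by a signal, e.g. -9 for SIGKILL.
import multiprocessing as mp


def worker(rank: int):
    if rank == 1:
        raise SystemExit(3)  # simulate a crashing TP rank


if __name__ == '__main__':
    procs = [mp.Process(target=worker, args=(r,)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    for rank, p in enumerate(procs):
        if p.exitcode and p.exitcode > 0:
            print(f'rank {rank} failed with exitcode {p.exitcode}')
        p.close()
```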
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 019a617acd..a12cadaa7d 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest,
     - ignore_eos (bool): indicator for ignoring eos
     - skip_special_tokens (bool): Whether or not to remove special tokens
         in the decoding. Default to be True.
+    - min_new_tokens (int): To generate at least numbers of tokens.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, comparably
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values)
 
     Currently we do not support the following features:
     - presence_penalty (replaced with repetition_penalty)
@@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
         skip_special_tokens=request.skip_special_tokens,
         response_format=response_format,
         logits_processors=logits_processors,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
 
     tools = None
@@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest,
         in the decoding. Default to be True.
     - adapter_name (str): For slora inference. Choose which lora to do the
         inference.
+    - min_new_tokens (int): To generate at least numbers of tokens.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, comparably
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values)
     """
     if request.cancel:
         if request.session_id != -1:
@@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest,
         ignore_eos=request.ignore_eos,
         stop_words=request.stop,
         skip_special_tokens=request.skip_special_tokens,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
     if request.image_url:
         from lmdeploy.vl import load_image
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index bd54028c39..d4bf8ed315 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel):
     skip_special_tokens: Optional[bool] = True
     top_k: Optional[int] = 40
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class FunctionResponse(BaseModel):
@@ -339,6 +341,8 @@ class GenerateRequest(BaseModel):
     cancel: Optional[bool] = False  # cancel a responding request
     adapter_name: Optional[str] = Field(default=None, examples=[None])
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class GenerateResponse(BaseModel):
diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py
index 4791d3c724..3a16f0a65b 100644
--- a/lmdeploy/serve/utils.py
+++ b/lmdeploy/serve/utils.py
@@ -212,8 +212,8 @@ def get_ppl(self, input_ids: Union[List[int],
         logger.info(f'sorted indices: {indices}')
         for (start, end) in self._batch_iterator(sizes, max_input_len):
             logger.info(f'start: {start}, end: {end}')
-            _input_ids = [input_ids[indices[i]] for i in range(start, end)]
             if start == end:
+                _input_ids = input_ids[indices[start]]
                 loss, target_count = self._get_long_text_ppl(
                     generator=generator,
                     input_ids=_input_ids,
@@ -221,6 +221,7 @@ def get_ppl(self, input_ids: Union[List[int],
                 losses.append(loss)
                 target_counts.append(target_count)
             else:
+                _input_ids = [input_ids[indices[i]] for i in range(start, end)]
                 loss, target_count = self._get_ppl(
                     generator=generator,
                     input_ids=_input_ids,
@@ -261,24 +262,24 @@ def _batch_iterator(self, sizes, max_value):
             i += 1
 
     def _get_long_text_ppl(self, generator, input_ids, max_input_len):
-        assert isinstance(input_ids, List) and len(input_ids) == 1
-        seq_len = len(input_ids[0])
+        assert all(isinstance(_, int) for _ in input_ids)
+        seq_len = len(input_ids)
         assert seq_len > max_input_len
         logger.info(f'get long text ppl: seq_len {seq_len}')
 
         losses = []
         target_counts = []
         for i in range(0, seq_len, max_input_len):
-            token_ids = input_ids[:, i:i + max_input_len]
+            token_ids = input_ids[i:i + max_input_len]
             step = [i]
             # shift token_ids by 1 to the left
-            target_ids = input_ids[:, i + 1:i + 1 + max_input_len]
+            target_ids = input_ids[i + 1:i + 1 + max_input_len]
 
             loss, target_count = self._get_ppl(
                 generator=generator,
-                input_ids=token_ids,
+                input_ids=[token_ids],
                 max_input_len=max_input_len,
-                target_ids=target_ids,
+                target_ids=[target_ids],
                 steps=step,
                 sequence_start=(i == 0),
                 sequence_end=(i + max_input_len >= seq_len))
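With `min_new_tokens` and `min_p` now accepted by both endpoints, here is a hedged example request against `/v1/chat/completions`. It assumes an api_server is already listening on localhost:23333; the model name and values are illustrative.

```python
# Minimal sketch: exercising the new sampling fields through the
# OpenAI-compatible endpoint. Assumes `lmdeploy serve api_server ...` is
# already running on localhost:23333; the model name is illustrative.
import requests

payload = {
    'model': 'internlm2_5-7b-chat',
    'messages': [{'role': 'user', 'content': 'Write a haiku about GPUs.'}],
    'min_new_tokens': 16,  # force at least 16 generated tokens
    'min_p': 0.05,         # drop tokens below 5% of the top token's probability
}
resp = requests.post('http://localhost:23333/v1/chat/completions', json=payload)
print(resp.json()['choices'][0]['message']['content'])
```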
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple
 
-__version__ = '0.6.2'
+__version__ = '0.6.2.post1'
 short_version = __version__
 
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
index 7ed657a9b8..2d68ef3535 100644
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
+++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -303,6 +303,43 @@ void loadWeights(
     }
 }
 
+template<typename T>
+void loadWeights(LlamaDenseWeight<T>& w, std::string prefix, FtCudaDataType model_file_type)
+{
+    auto weight_file  = prefix + ".weight";
+    auto qweight_file = prefix + ".qweight";
+
+    if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) {
+        TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str());
+        FT_CHECK(false);
+    }
+
+    size_t     dim0 = w.input_dims;
+    size_t     dim1 = w.output_dims;
+    const auto type = model_file_type;
+
+    if (w.bias) {
+        loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type);
+    }
+    const size_t bit_size = getBitSize(w.type);
+    if (bit_size >= 16) {  // fp16, fp32
+        loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type);
+    }
+    else {  // int8, int4
+        const int factor = sizeof(float) * 8 / bit_size;
+
+        FT_CHECK(dim1 % factor == 0);
+
+        std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+        loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8);
+
+        const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+
+        loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type);
+        loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type);
+    }
+}
+
 template<typename T>
 void LlamaDecoderLayerWeight<T>::mallocWeights()
 {
@@ -357,10 +394,22 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
     loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type, tensor_para_size_);
 
     loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_);
-
-    loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_);
-    loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_);
-    loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_);
+    if (moe_weights.experts.empty()) {
+        loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_);
+        loadWeights(
+            ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_);
+        loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_);
+    }
+    else {
+        loadWeights(moe_weights.gate, dir_path + ".moe_ffn.gate", type);
+        for (size_t i = 0; i < moe_weights.experts.size(); ++i) {
+            std::string weight_name = dir_path + ".moe_ffn.experts." + std::to_string(i);
+            loadWeights(moe_weights.experts[i].gating, weight_name + ".w1", tensor_para_rank_, type, tensor_para_size_);
+            loadWeights(
+                moe_weights.experts[i].intermediate, weight_name + ".w3", tensor_para_rank_, type, tensor_para_size_);
+            loadWeights(moe_weights.experts[i].output, weight_name + ".w2", tensor_para_rank_, type, tensor_para_size_);
+        }
+    }
 }
 
 template<typename T>
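The quantized branch of the new `loadWeights` overload packs int4/int8 values into `uint32` words and stores per-group scales and zero points. Below is a Python sketch of the same shape arithmetic; variable names mirror the C++, and the dimensions are illustrative.

```python
# Minimal sketch of the packed-weight shape math used in the quantized
# branch above, written in Python for clarity. `bit_size`, `dim0`, `dim1`
# and `group_size` mirror the C++ variables; the values are illustrative.
def qweight_shape(dim0: int, dim1: int, bit_size: int) -> tuple[int, int]:
    """Byte shape of a [dim0, dim1] weight packed `factor` values per uint32."""
    factor = 32 // bit_size          # values packed into one 32-bit word (8 for int4)
    assert dim1 % factor == 0
    return dim0, dim1 // factor * 4  # 4 bytes per packed uint32 word


def scales_shape(dim0: int, dim1: int, group_size: int) -> tuple[int, int]:
    """One scale (and zero point) per `group_size` input rows, per output column."""
    group_count = dim0 // group_size if group_size > 0 else 1
    return group_count, dim1


print(qweight_shape(4096, 4096, 4))   # (4096, 2048): bytes of packed int4
print(scales_shape(4096, 4096, 128))  # (32, 4096): fp16 scales / zeros
```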
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 44f73370da..0e9309d9be 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -333,12 +333,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     }
     else {
         moe_param_.method = ft::MoeParam::kFused;
-        // Note: This will fail when GPUs of different SMs are mixed
-        if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) {
-            // On sm90 the cuBLAS method may be faster as our grouped GEMM is not
-            // optimized for GMMA yet
-            moe_param_.method = ft::MoeParam::kNaive;
-        }
     }
 
     TM_LOG_INFO("%s", toString().c_str());
@@ -377,6 +371,10 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSharedModelInstance(
                                                       shared_state_,
                                                       device_id);
 
+    // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang
+    // due to concurrent kernel launch & cudaMallocHost
+    shared_state_->barrier->wait();
+
     engine->Start();
 
     return engine;
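The added `barrier->wait()` makes every rank finish allocating its pinned buffers before any rank starts the engine and its GEMM tuning. A standard-library sketch of that barrier pattern follows; the worker bodies are stand-ins, not TurboMind code.

```python
# Minimal sketch (standard library) of the synchronization added before
# engine->Start(): all ranks complete the allocation phase before any rank
# enters the phase that must not overlap with it.
import threading

NUM_RANKS = 4
barrier = threading.Barrier(NUM_RANKS)


def rank_main(rank: int):
    print(f'rank {rank}: allocating pinned buffers')
    barrier.wait()  # nobody starts tuning until every rank finished allocating
    print(f'rank {rank}: start engine / gemm tuning')


threads = [threading.Thread(target=rank_main, args=(r,)) for r in range(NUM_RANKS)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```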