Update · pytorch/pytorch@86a8231 · GitHub
[go: up one dir, main page]

Skip to content

Commit 86a8231

Browse files
committed
Update
[ghstack-poisoned]
2 parents c822959 + c5e5de7 commit 86a8231

File tree

145 files changed

+3183
-1606
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

145 files changed

+3183
-1606
lines changed

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def parse_arguments():
204204
else:
205205
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
206206
elif branch.startswith(("v1.", "v2.")):
207-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
207+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
208208

209209
if enable_mkldnn:
210210
build_ArmComputeLibrary()

.ci/aarch64_linux/build_aarch64_wheel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ def start_build(
761761
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
762762
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
763763
if branch.startswith(("v1.", "v2.")):
764-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
764+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
765765
if host.using_docker():
766766
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
767767
if enable_mkldnn:

.ci/docker/common/install_rocm_drm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
115115
if (!fp) {
116116
- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
117117
- strerror(errno));
118-
+ fprintf(stderr, "amdgpu.ids: No such file or directory\n");
118+
+ //fprintf(stderr, "amdgpu.ids: No such file or directory\n");
119119
return;
120120
}
121121

.ci/docker/libtorch/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ case ${GPU_ARCH_TYPE} in
3939
BASE_TARGET=rocm
4040
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
4141
GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
42-
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
42+
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx942"
4343
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
4444
;;
4545
*)

.ci/docker/manywheel/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ case ${GPU_ARCH_TYPE} in
9797
DEVTOOLSET_VERSION="11"
9898
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
9999
fi
100-
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
100+
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
101101
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
102102
;;
103103
xpu)

.ci/pytorch/smoke_test/max_autotune.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def train(args, model, device, train_loader, optimizer, epoch):
4646
optimizer.step()
4747
if batch_idx % args.log_interval == 0:
4848
print(
49-
f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}" # noqa: B950
49+
f"Train Epoch: {epoch} "
50+
f"[{batch_idx * len(data)}/{len(train_loader.dataset)} "
51+
f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
5052
)
5153
if args.dry_run:
5254
break
@@ -71,7 +73,9 @@ def test(model, device, test_loader):
7173
test_loss /= len(test_loader.dataset)
7274

7375
print(
74-
f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n" # noqa: B950
76+
f"\nTest set: Average loss: {test_loss:.4f}, "
77+
f"Accuracy: {correct}/{len(test_loader.dataset)} "
78+
f"({100.0 * correct / len(test_loader.dataset):.0f}%)\n"
7579
)
7680

7781

.ci/pytorch/test.sh

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -494,8 +494,53 @@ test_cachebench() {
494494
TEST_REPORTS_DIR=$(pwd)/test/test-reports
495495
mkdir -p "$TEST_REPORTS_DIR"
496496

497-
$TASKSET python "benchmarks/dynamo/cachebench.py" --mode training --benchmark torchbench --output "$TEST_REPORTS_DIR/cachebench_training.json"
498-
$TASKSET python "benchmarks/dynamo/cachebench.py" --mode inference --benchmark torchbench --output "$TEST_REPORTS_DIR/cachebench_inference.json"
497+
local BENCHMARK
498+
if [[ "${SHARD_NUMBER}" == 1 ]]; then
499+
local BENCHMARK=torchbench
500+
elif [[ "${SHARD_NUMBER}" == 2 ]]; then
501+
local BENCHMARK=huggingface
502+
else
503+
echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
504+
exit 1
505+
fi
506+
507+
local mode_options=("training" "inference")
508+
509+
for mode in "${mode_options[@]}"; do
510+
$TASKSET python "benchmarks/dynamo/cachebench.py" \
511+
--mode "$mode" \
512+
--device cuda \
513+
--benchmark "$BENCHMARK" \
514+
--repeat 3 \
515+
--output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"
516+
517+
$TASKSET python "benchmarks/dynamo/cachebench.py" \
518+
--mode "$mode" \
519+
--dynamic \
520+
--device cuda \
521+
--benchmark "$BENCHMARK" \
522+
--repeat 3 \
523+
--output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
524+
done
525+
}
526+
527+
test_verify_cachebench() {
528+
TMP_TEST_REPORTS_DIR=$(mktemp -d)
529+
TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"
530+
531+
$TASKSET python "benchmarks/dynamo/cachebench.py" \
532+
--mode training \
533+
--device cpu \
534+
--model nanogpt \
535+
--benchmark torchbench \
536+
--output "$TEST_OUTPUT"
537+
538+
# -s checks file exists and is non empty
539+
if [[ ! -s "$TEST_OUTPUT" ]]; then
540+
echo "Cachebench failed to produce an output."
541+
echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
542+
exit 1
543+
fi
499544
}
500545

501546
test_perf_for_dashboard() {
@@ -1449,7 +1494,7 @@ test_executorch() {
14491494
test_linux_aarch64() {
14501495
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
14511496
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
1452-
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops \
1497+
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
14531498
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
14541499

14551500
# Dynamo tests
@@ -1520,8 +1565,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
15201565
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
15211566
install_torchaudio cuda
15221567
install_torchvision
1523-
checkout_install_torchbench nanogpt BERT_pytorch resnet50
1568+
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
15241569
PYTHONPATH=$(pwd)/torchbench test_cachebench
1570+
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
1571+
install_torchaudio cpu
1572+
install_torchvision
1573+
checkout_install_torchbench nanogpt
1574+
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
15251575
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
15261576
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
15271577
install_torchaudio cpu

.github/merge_rules.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@
334334
- XiaobingSuper
335335
- jgong5
336336
- mingfeima
337+
- EikanWang
337338
mandatory_checks_name:
338339
- EasyCLA
339340
- Lint
@@ -366,6 +367,7 @@
366367
- jgong5
367368
- vfdev-5
368369
- leslie-fang-intel
370+
- EikanWang
369371
mandatory_checks_name:
370372
- EasyCLA
371373
- Lint
@@ -379,6 +381,7 @@
379381
approved_by:
380382
- leslie-fang-intel
381383
- jgong5
384+
- EikanWang
382385
mandatory_checks_name:
383386
- EasyCLA
384387
- Lint

.github/scripts/github_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ def gh_fetch_url_and_headers(
5757
print(
5858
f"""{url}
5959
Rate limit exceeded:
60-
Used: {err.headers['X-RateLimit-Used']}
61-
Limit: {err.headers['X-RateLimit-Limit']}
62-
Remaining: {err.headers['X-RateLimit-Remaining']}
63-
Resets at: {err.headers['x-RateLimit-Reset']}"""
60+
Used: {err.headers["X-RateLimit-Used"]}
61+
Limit: {err.headers["X-RateLimit-Limit"]}
62+
Remaining: {err.headers["X-RateLimit-Remaining"]}
63+
Resets at: {err.headers["x-RateLimit-Reset"]}"""
6464
)
6565
else:
6666
print(f"Error fetching {url} {err}")

.github/scripts/trymerge.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ def get_check_run_name_prefix(workflow_run: Any) -> str:
485485
if workflow_run is None:
486486
return ""
487487
else:
488-
return f'{workflow_run["workflow"]["name"]} / '
488+
return f"{workflow_run['workflow']['name']} / "
489489

490490

491491
def is_passing_status(status: Optional[str]) -> bool:
@@ -545,7 +545,7 @@ def add_conclusions(edges: Any) -> None:
545545
if not isinstance(checkrun_node, dict):
546546
warn(f"Expected dictionary, but got {type(checkrun_node)}")
547547
continue
548-
checkrun_name = f'{get_check_run_name_prefix(workflow_run)}{checkrun_node["name"]}'
548+
checkrun_name = f"{get_check_run_name_prefix(workflow_run)}{checkrun_node['name']}"
549549
existing_checkrun = workflow_obj.jobs.get(checkrun_name)
550550
if existing_checkrun is None or not is_passing_status(
551551
existing_checkrun.status

.github/scripts/trymerge_explainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def get_merge_message(
7979
(
8080
"<details><summary>Advanced Debugging</summary>",
8181
"Check the merge workflow status ",
82-
f"<a href=\"{os.getenv('GH_RUN_URL')}\">here</a>",
82+
f'<a href="{os.getenv("GH_RUN_URL")}">here</a>',
8383
"</details>",
8484
)
8585
)

.github/workflows/_binary-upload.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ on:
6666
jobs:
6767
upload:
6868
runs-on: ubuntu-22.04
69-
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
7069
container:
7170
image: continuumio/miniconda3:4.12.0
7271
env:

.github/workflows/build-triton-wheel.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ jobs:
168168
contents: read
169169
container:
170170
image: continuumio/miniconda3:4.12.0
171-
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
172171
steps:
173172
- uses: actions/checkout@v3
174173

.github/workflows/inductor-perf-test-nightly.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ jobs:
105105
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
106106
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
107107
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
108-
{ config: "cachebench", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
108+
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
109+
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
109110
]}
110111
selected-test-configs: ${{ inputs.benchmark_configs }}
111112
secrets: inherit

.github/workflows/periodic.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,9 @@ jobs:
149149
docker-image-name: pytorch-linux-focal-rocm-n-py3
150150
test-matrix: |
151151
{ include: [
152-
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
153-
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
154-
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
152+
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
153+
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
154+
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
155155
]}
156156
secrets: inherit
157157

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,29 @@ jobs:
246246
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
247247
cuda-arch-list: '8.0'
248248
secrets: inherit
249+
250+
verify-cachebench-cpu-build:
251+
name: verify-cachebench-cpu-build
252+
uses: ./.github/workflows/_linux-build.yml
253+
needs: get-label-type
254+
with:
255+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
256+
build-environment: linux-jammy-py3.9-gcc11
257+
docker-image-name: pytorch-linux-jammy-py3.9-gcc11
258+
test-matrix: |
259+
{ include: [
260+
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
261+
]}
262+
secrets: inherit
263+
264+
verify-cachebench-cpu-test:
265+
name: verify-cachebench-cpu-test
266+
uses: ./.github/workflows/_linux-test.yml
267+
needs:
268+
- verify-cachebench-cpu-build
269+
- target-determination
270+
with:
271+
build-environment: linux-jammy-py3.9-gcc11
272+
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
273+
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
274+
secrets: inherit

aten/src/ATen/Context.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,10 @@ at::BlasBackend Context::blasPreferredBackend() {
332332
static const std::vector<std::string> archs = {
333333
"gfx90a", "gfx942",
334334
#if ROCM_VERSION >= 60300
335-
"gfx1100", "gfx1101"
335+
"gfx1100", "gfx1101", "gfx1200", "gfx1201"
336+
#endif
337+
#if ROCM_VERSION >= 60500
338+
"gfx950"
336339
#endif
337340
};
338341
for (auto index: c10::irange(getNumGPUs())) {
@@ -430,6 +433,18 @@ void Context::setAllowFP16AccumulationCuBLAS(bool b) {
430433
allow_fp16_accumulation_cublas = b;
431434
}
432435

436+
std::optional<int32_t> Context::_SMCarveout_EXPERIMENTAL() const {
437+
return sm_carveout;
438+
}
439+
440+
void Context::_setSMCarveout_EXPERIMENTAL(std::optional<int32_t> c) {
441+
if (c.has_value()) {
442+
TORCH_WARN_ONCE(
443+
"Setting the SM carveout for matmuls is a temporary experimental mitigation for performance issues, "
444+
"while more robust solutions are developed. It may be removed at any moment without notice.");
445+
}
446+
sm_carveout = c;
447+
}
433448

434449
bool Context::hasMKL() {
435450
#if AT_MKL_ENABLED()

aten/src/ATen/Context.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,19 @@ class TORCH_API Context {
345345
void setAllowBF16ReductionCuBLAS(bool);
346346
bool allowFP16AccumulationCuBLAS() const;
347347
void setAllowFP16AccumulationCuBLAS(bool);
348+
349+
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
350+
// block for each SM on the GPU, and each block then iterates over multiple
351+
// output tiles. This allows to use software pipelining to hide the begin/end
352+
// latencies (e.g., epilogue), especially when only one tile fits per SM.
353+
// However, if some SMs are busy (e.g., with a background NCCL kernel), the
354+
// matmul's blocks will be scheduled in two waves and, in the absence of some
355+
// smart load balancing, the kernel will take twice as long. This flag allows
356+
// to make matmuls target only a subset of the SMs, so they can fully schedule
357+
// even next to a comms kernel, and only be a few percent slower.
358+
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
359+
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
360+
348361
at::QEngine qEngine() const;
349362
void setQEngine(at::QEngine e);
350363
static const std::vector<at::QEngine>& supportedQEngines();
@@ -423,6 +436,7 @@ class TORCH_API Context {
423436
bool allow_fp16_reduction_cublas = true;
424437
bool allow_bf16_reduction_cublas = true;
425438
bool allow_fp16_accumulation_cublas = false;
439+
std::optional<int32_t> sm_carveout = std::nullopt;
426440
bool enabled_mkldnn = true;
427441
bool allow_tf32_onednn = false;
428442
bool enabled_nnpack = true;

aten/src/ATen/cpu/vec/vec_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
/*
5151
https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170
5252
Use _MSC_FULL_VER to identify current compiler is msvc,
53-
Windows llvm will not have this defination.
53+
Windows llvm will not have this definition.
5454
*/
5555
#define __msvc_cl__
5656
#endif

0 commit comments

Comments
 (0)
0