Update on "[pytree] hardcode values for `none_is_leaf` and `namespace… · pytorch/pytorch@5e0601d · GitHub
[go: up one dir, main page]

Skip to content

Commit 5e0601d

Browse files
committed
Update on "[pytree] hardcode values for none_is_leaf and namespace in C++ pytree"
[ghstack-poisoned]
2 parents feede94 + 6e11709 commit 5e0601d

File tree

91 files changed

+1852
-1744
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+1852
-1744
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
bed91223f660685325147a5027348356f11cdd17
1+
3a4bb06b3a3a36863ff9d7fca3cfee9d8f7b6613

.github/scripts/generate_binary_build_matrix.py

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,34 @@
2424

2525
CPU_AARCH64_ARCH = ["cpu-aarch64"]
2626

27-
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = (
28-
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
29-
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
30-
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
31-
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
32-
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
33-
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
34-
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
35-
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
36-
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
37-
"nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
38-
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
39-
)
27+
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
28+
"11.8": (
29+
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
30+
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
31+
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
32+
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
33+
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
34+
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
35+
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
36+
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
37+
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
38+
"nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
39+
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
40+
),
41+
"12.1": (
42+
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
43+
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
44+
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
45+
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
46+
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
47+
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
48+
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
49+
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
50+
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
51+
"nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
52+
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
53+
),
54+
}
4055

4156

4257
def get_nccl_submodule_version() -> str:
@@ -65,15 +80,17 @@ def get_nccl_submodule_version() -> str:
6580
return f"{d['NCCL_MAJOR']}.{d['NCCL_MINOR']}.{d['NCCL_PATCH']}"
6681

6782

68-
def get_nccl_wheel_version() -> str:
83+
def get_nccl_wheel_version(arch_version: str) -> str:
6984
import re
7085

71-
requrements = map(str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS))
72-
return [x for x in requrements if x.startswith("nvidia-nccl-cu")][0].split("==")[1]
86+
requirements = map(
87+
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
88+
)
89+
return [x for x in requirements if x.startswith("nvidia-nccl-cu")][0].split("==")[1]
7390

7491

75-
def validate_nccl_dep_consistency() -> None:
76-
wheel_ver = get_nccl_wheel_version()
92+
def validate_nccl_dep_consistency(arch_version: str) -> None:
93+
wheel_ver = get_nccl_wheel_version(arch_version)
7794
submodule_ver = get_nccl_submodule_version()
7895
if wheel_ver != submodule_ver:
7996
raise RuntimeError(
@@ -298,7 +315,7 @@ def generate_wheels_matrix(
298315
)
299316

300317
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
301-
if arch_version == "12.1" and os == "linux":
318+
if arch_version in ["12.1", "11.8"] and os == "linux":
302319
ret.append(
303320
{
304321
"python_version": python_version,
@@ -310,7 +327,7 @@ def generate_wheels_matrix(
310327
"devtoolset": "",
311328
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
312329
"package_type": package_type,
313-
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS,
330+
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
314331
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
315332
".", "_"
316333
),
@@ -333,12 +350,13 @@ def generate_wheels_matrix(
333350
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
334351
".", "_"
335352
),
336-
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS
337-
if os != "linux"
338-
else "",
353+
"pytorch_extra_install_requirements":
354+
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
355+
if os != "linux" else "",
339356
}
340357
)
341358
return ret
342359

343360

344-
validate_nccl_dep_consistency()
361+
validate_nccl_dep_consistency("12.1")
362+
validate_nccl_dep_consistency("11.8")

.github/workflows/generated-linux-binary-manywheel-main.yml

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.github/workflows/generated-linux-binary-manywheel-nightly.yml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

aten/src/ATen/mps/MPSStream.mm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,9 +146,9 @@ @interface MPSGraphExecutionDescriptor ()
146146
}
147147

148148
void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t offset, SyncType syncType) {
149-
TORCH_INTERNAL_ASSERT(length >= offset);
150-
if (length == 0)
149+
if (length == 0) {
151150
return;
151+
}
152152
dispatch_sync(_serialQueue, ^() {
153153
@autoreleasepool {
154154
endKernelCoalescing();

aten/src/ATen/native/mps/operations/ConstantOps.mm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ static bool fill_mps_tensor_(Tensor& self, uint8_t value) {
7979
if (self.is_contiguous()) {
8080
MPSStream* stream = getCurrentMPSStream();
8181
auto storage_byte_offset = self.storage_offset() * self.itemsize();
82-
stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset);
82+
stream->fill(mps::getMTLBufferStorage(self), value, self.nbytes(), storage_byte_offset);
8383
return true;
8484
}
8585
return false;

aten/src/ATen/native/transformers/attention.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -548,13 +548,6 @@ at::Tensor preprocess_mask(
548548
constexpr int mem_eff_alignment = 8;
549549
at::Tensor result_mask = mask;
550550
if (!aligned_tensor<mem_eff_alignment>(mask)) {
551-
TORCH_WARN_ONCE(
552-
"Memory Efficient Attention requires the attn_mask to be aligned to, ",
553-
mem_eff_alignment,
554-
" elements. "
555-
"Prior to calling SDPA, pad the last dimension of the attn_mask "
556-
"to be at least a multiple of ", mem_eff_alignment,
557-
" and then slice the attn_mask to the original size.");
558551
result_mask = pad_bias<mem_eff_alignment>(mask);
559552
}
560553
return result_mask.expand_symint(

benchmarks/dynamo/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ build-deps: clone-deps
3535
(cd ../../../torchdata && python setup.py install)
3636
(cd ../../../torchtext && python setup.py clean && python setup.py develop)
3737
(cd ../../../torchaudio && python setup.py clean && python setup.py develop)
38-
(cd ../../../FBGEMM/fbgemm_gpu && python setup.py clean && pip install -r requirements.txt && python setup.py develop)
38+
(cd ../../../FBGEMM/fbgemm_gpu && pip install -r requirements.txt && python setup.py clean && python setup.py develop)
3939
(cd ../../../torchrec && python setup.py clean && python setup.py develop)
4040
(cd ../../../detectron2 && python setup.py clean && python setup.py develop)
4141
(cd ../../../torchbenchmark && python install.py --continue_on_fail)

benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
1010

1111

1212

13-
DALLE2_pytorch,fail_to_run,21
13+
DALLE2_pytorch,timeout,0
1414

1515

1616

benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ opacus_cifar10,pass,0
186186

187187

188188

189-
phi_1_5,pass,74
189+
phi_1_5,pass,0
190190

191191

192192

benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
1010

1111

1212

13-
DALLE2_pytorch,fail_to_run,21
13+
DALLE2_pytorch,timeout,0
1414

1515

1616

benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ opacus_cifar10,pass,0
186186

187187

188188

189-
phi_1_5,pass,74
189+
phi_1_5,pass,0
190190

191191

192192

benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
1010

1111

1212

13-
DALLE2_pytorch,fail_to_run,21
13+
DALLE2_pytorch,timeout,0
1414

1515

1616

benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
1010

1111

1212

13-
DALLE2_pytorch,fail_to_run,21
13+
DALLE2_pytorch,timeout,0
1414

1515

1616

benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
1010

1111

1212

13-
DALLE2_pytorch,fail_to_run,21
13+
DALLE2_pytorch,timeout,0
1414

1515

1616

benchmarks/dynamo/common.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,39 @@ def onnxrt_model_iter_fn(model, inputs, collect_outputs=True):
760760

761761
return onnxrt_model_iter_fn
762762

763+
def timed_onnx(model, onnx_model: OnnxModelFromTorchScript, inputs):
764+
if current_device == "cpu" or onnx_model.is_cpu():
765+
onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs)
766+
else:
767+
onnxrt_model_iter_fn = create_onnx_input_binded_fn(
768+
onnx_model, inputs, expected_output
769+
)
770+
return timed(
771+
model,
772+
onnxrt_model_iter_fn,
773+
inputs,
774+
return_result=True,
775+
times=times,
776+
collect_outputs=args.collect_outputs,
777+
)
778+
779+
# Insert ONNX warm-up
780+
inputs = (
781+
randomize_input(copy.deepcopy(example_inputs))
782+
if should_randomize_input
783+
else example_inputs
784+
)
785+
_, expected_output = timed(
786+
model,
787+
model_iter_fn,
788+
inputs,
789+
return_result=True,
790+
times=times,
791+
collect_outputs=args.collect_outputs,
792+
)
793+
for _ in range(2):
794+
timed_onnx(model, onnx_model, inputs)
795+
763796
for rep in range(args.repeat):
764797
inputs = (
765798
randomize_input(copy.deepcopy(example_inputs))
@@ -775,21 +808,7 @@ def onnxrt_model_iter_fn(model, inputs, collect_outputs=True):
775808
collect_outputs=args.collect_outputs,
776809
)
777810

778-
if current_device == "cpu" or onnx_model.is_cpu():
779-
onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs)
780-
else:
781-
onnxrt_model_iter_fn = create_onnx_input_binded_fn(
782-
onnx_model, inputs, expected_output
783-
)
784-
785-
timings[rep, 1], actual_output = timed(
786-
model,
787-
onnxrt_model_iter_fn,
788-
inputs,
789-
return_result=True,
790-
times=times,
791-
collect_outputs=args.collect_outputs,
792-
)
811+
timings[rep, 1], actual_output = timed_onnx(model, onnx_model, inputs)
793812

794813
pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
795814
median = np.median(timings, axis=0)

benchmarks/dynamo/timm_models.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,13 @@ def pip_install(package):
6767
"xcit_large_24_p8_224": 4,
6868
}
6969

70-
REQUIRE_HIGHER_TOLERANCE = set("sebotnet33ts_256")
70+
REQUIRE_HIGHER_TOLERANCE = {
71+
"fbnetv3_b",
72+
"hrnet_w18",
73+
"inception_v3",
74+
"sebotnet33ts_256",
75+
"selecsls42b",
76+
}
7177

7278
SCALED_COMPUTE_LOSS = {
7379
"ese_vovnet19b_dw",
@@ -304,8 +310,8 @@ def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
304310
cosine = self.args.cosine
305311
tolerance = 1e-3
306312
if is_training:
307-
if REQUIRE_HIGHER_TOLERANCE:
308-
tolerance = 2 * 1e-2
313+
if name in REQUIRE_HIGHER_TOLERANCE:
314+
tolerance = 4 * 1e-2
309315
else:
310316
tolerance = 1e-2
311317
return tolerance, cosine

0 commit comments

Comments (0)