Update · pytorch/pytorch@cc3eefe · GitHub

Commit cc3eefe

Update
[ghstack-poisoned]
2 parents bbad26b + a41c9d5 commit cc3eefe

36 files changed: +660 -358 lines changed
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-319c8d7fd3551bac63429334509de2663aa43f57
+8148603e3f3a618acef447a73bdeec9b749a95fb

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 2 additions & 6 deletions
@@ -1513,12 +1513,8 @@ static void addmm_impl_cpu_(
   // that will call then into Arm® Compute Library (ACL) GEMM kernel and also
   // additionally have support for running kernel with BF16 instructions
   if (transpose_c) {
-    bool apply_heur =
-        apply_mkldnn_matmul_heur(b.sizes()[0], b.sizes()[1], a.sizes()[1]);
-    if (apply_heur && transpose_a && !transpose_b &&
-        (result.scalar_type() == at::ScalarType::Float ||
-         result.scalar_type() == at::ScalarType::BFloat16 ||
-         result.scalar_type() == at::ScalarType::Half)) {
+    bool apply_heur = apply_mkldnn_matmul_heur(b.sizes()[0], b.sizes()[1], a.sizes()[1]);
+    if (apply_heur && transpose_a && !transpose_b && result.scalar_type() == at::ScalarType::Float) {
       try {
         mkldnn_matmul(b, a, c, beta.to<float>(), alpha.to<float>());
         // We have dispatched to ACL GEMM for single precision float
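For context, this gate sits on the CPU addmm path: after the change only fp32 results are considered for the oneDNN/ACL heuristic here, while bf16 and fp16 no longer take this branch. A minimal Python sketch of the two cases; the shapes are arbitrary, and whether the fp32 call actually dispatches to mkldnn_matmul still depends on apply_mkldnn_matmul_heur and the build:

import torch

# fp32 result: still eligible for the mkldnn/ACL heuristic gate shown above.
a = torch.randn(256, 256, dtype=torch.float32)
b = torch.randn(256, 256, dtype=torch.float32)
c = a @ b

# bf16 (and fp16) results no longer take this mkldnn_matmul branch after the
# change; they fall through to the regular CPU GEMM path instead.
a_bf = a.to(torch.bfloat16)
b_bf = b.to(torch.bfloat16)
c_bf = a_bf @ b_bf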

aten/src/ATen/native/mkldnn/Matmul.cpp

Lines changed: 7 additions & 19 deletions
@@ -236,27 +236,15 @@ void mkldnn_matmul(
       "mkldnn_matmul: unsupported dims for mat and mat2");

 #if defined(__aarch64__)
-  // oneDNN fast-maths mode (enabled by setting the environment variable
-  // ONEDNN_DEFAULT_FPMATH_MODE=BF16) will dispatch fp32 inputs to bf16 kernels
-  // where HW permits. So, both fp32 and bf16 inputs are permitted.
-  TORCH_CHECK(
-      (mat1.scalar_type() == mat2.scalar_type()) &&
-          (mat1.scalar_type() == result.scalar_type()) &&
-          ((mat1.scalar_type() == at::kFloat) ||
-           (mat1.scalar_type() == at::kBFloat16) ||
-           (mat1.scalar_type() == at::kHalf)),
-      "mkldnn_matmul: only enabled for fp32, bf16 and fp16 path");
+  // oneDNN fast-maths mode (enabled by setting the environment variable ONEDNN_DEFAULT_FPMATH_MODE=BF16) will dispatch
+  // fp32 inputs to bf16 kernels where HW permits. So, both fp32 and bf16 inputs are permitted.
+  TORCH_CHECK((mat1.scalar_type() == mat2.scalar_type()) && (mat1.scalar_type() == result.scalar_type()) &&
+              ((mat1.scalar_type() == at::kFloat) || (mat1.scalar_type() == at::kBFloat16)),
+              "mkldnn_matmul: only enabled for fp32 and bf16 path");
   // device needs to support bf16 if the inputs are of bf16 type
   if (mat1.scalar_type() == at::kBFloat16) {
-    TORCH_CHECK(
-        mkldnn_bf16_device_check_arm(),
-        "mkldnn_matmul: mkldnn_matmul bf16 path needs a cpu with bf16 support");
-  }
-  // device needs to support fp16 if the inputs are of fp16 type
-  if (mat1.scalar_type() == at::kHalf) {
-    TORCH_CHECK(
-        mkldnn_fp16_device_check_arm(),
-        "mkldnn_matmul: mkldnn_matmul fp16 path needs a cpu with fp16 support");
+    TORCH_CHECK(mkldnn_bf16_device_check_arm(),
+                "mkldnn_matmul: mkldnn_matmul bf16 path needs a cpu with bf16 support");
   }
 #else
   TORCH_CHECK(
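The retained comment describes oneDNN fast-math mode: with ONEDNN_DEFAULT_FPMATH_MODE=BF16 set in the environment, fp32 GEMMs may be lowered to bf16 kernels where the hardware allows. A small sketch of how that looks from Python, assuming the variable is set before the process starts so oneDNN picks it up:

# Launch with fast-math enabled in the environment, e.g.:
#   ONEDNN_DEFAULT_FPMATH_MODE=BF16 python fp32_matmul.py
import torch

x = torch.randn(512, 512, dtype=torch.float32)
w = torch.randn(512, 512, dtype=torch.float32)

# Inputs and output stay fp32, satisfying the TORCH_CHECK above; oneDNN may
# internally use bf16 kernels where the hardware permits.
y = x @ w
print(y.dtype)  # torch.float32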

aten/src/ATen/native/mkldnn/Utils.h

Lines changed: 1 addition & 9 deletions
@@ -90,10 +90,6 @@ inline bool mkldnn_bf16_device_check_arm() {
   return cpuinfo_initialize() && cpuinfo_has_arm_bf16();
 }

-inline bool mkldnn_fp16_device_check_arm() {
-  return cpuinfo_initialize() && cpuinfo_has_arm_neon_fp16();
-}
-
 inline bool is_arm_neoverse() {
   return (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 &&
           (cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1 ||
@@ -106,10 +102,6 @@ constexpr bool mkldnn_bf16_device_check_arm() {
   return false;
 }

-inline bool mkldnn_fp16_device_check_arm() {
-  return false;
-}
-
 constexpr bool is_arm_neoverse() {
   return false;
 }
@@ -129,7 +121,7 @@ inline bool mkldnn_fp16_device_check() {
 #if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
   return ideep::has_fp16_type_support();
 #else
-  return mkldnn_fp16_device_check_arm();
+  return false;
 #endif
 }

cmake/Dependencies.cmake

Lines changed: 2 additions & 2 deletions
@@ -864,9 +864,9 @@ if(NOT Python_Interpreter_FOUND)
   message(FATAL_ERROR "Python3 could not be found.")
 endif()

-if(${Python_VERSION} VERSION_LESS 3.8)
+if(${Python_VERSION} VERSION_LESS 3.9)
   message(FATAL_ERROR
-    "Found Python libraries version ${Python_VERSION}. Python < 3.8 is no longer supported by PyTorch.")
+    "Found Python libraries version ${Python_VERSION}. Python < 3.9 is no longer supported by PyTorch.")
 endif()

 # ---[ Python + Numpy
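For reference, the same minimum can be expressed as a runtime check on the Python side; a hedged sketch of an equivalent guard (not taken from this commit):

import sys

# Python < 3.9 is no longer supported after this bump.
if sys.version_info < (3, 9):
    raise RuntimeError(
        f"Python {sys.version_info.major}.{sys.version_info.minor} detected; "
        "Python < 3.9 is no longer supported by PyTorch."
    )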

test/dynamo/test_dicts.py

Lines changed: 63 additions & 0 deletions
@@ -838,6 +838,69 @@ def fn(x):
         d["e"] = 5
         self.assertEqual(d["e"], res["e"])

+    def test_mapping_proxy_existing(self):
+        d = {"a": 2, "b": 3, "c": 5}
+
+        def fn(x, mp):
+            y = torch.sin(x * mp["a"])
+            for k, v in mp.items():
+                y += torch.cos(x * v)
+            return y
+
+        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
+        x = torch.randn(4)
+        mp = types.MappingProxyType(d)
+        ref = fn(x, mp)
+        res = opt_fn(x, mp)
+        self.assertEqual(ref, res)
+
+        d["a"] = 3
+        ref = fn(x, mp)
+        res = opt_fn(x, mp)
+        self.assertEqual(ref, res)
+
+        d.pop("b")
+        ref = fn(x, mp)
+        res = opt_fn(x, mp)
+        self.assertEqual(ref, res)
+
+    def test_mapping_proxy_existing_mutation(self):
+        d = {"a": 2, "b": 3, "c": 5}
+
+        mp = types.MappingProxyType(d)
+
+        def fn(x):
+            d["d"] = 4
+            y = torch.sin(x * mp["d"])
+            return y
+
+        opt_fn = torch.compile(fn, backend="eager")
+        x = torch.randn(4)
+        ref = torch.sin(x * 4)
+        res = opt_fn(x)
+        self.assertEqual(ref, res)
+        self.assertEqual(d.keys(), mp.keys())
+
+    def test_mapping_proxy_existing_local_mutation(self):
+        d = {"a": 2, "b": 3, "c": 5}
+
+        mp = types.MappingProxyType(d)
+
+        def fn(x):
+            # Dynamo should not cause a graph break here because it knows that
+            # the existing proxy can't point to this new dict
+            other_dict = {}
+            other_dict["d"] = 4
+            y = torch.sin(x * mp["c"])
+            return y
+
+        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
+        x = torch.randn(4)
+        ref = torch.sin(x * mp["c"])
+        res = opt_fn(x)
+        self.assertEqual(ref, res)
+        self.assertEqual(d.keys(), mp.keys())
+
     def test_move_to_end(self):
         def fn(x):
             d = OrderedDict({"a": torch.cos(x), "b": 3, "c": 5})
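The tests above rely on the basic semantics of types.MappingProxyType: it is a read-only, live view of its backing dict, so mutations of the dict are visible through the proxy while writes through the proxy raise. A short plain-Python refresher, independent of torch.compile:

import types

d = {"a": 2, "b": 3}
mp = types.MappingProxyType(d)

d["a"] = 3          # mutating the backing dict...
print(mp["a"])      # ...is visible through the proxy: 3

try:
    mp["c"] = 1     # the proxy itself is read-only
except TypeError as e:
    print(e)        # 'mappingproxy' object does not support item assignment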

test/export/test_export.py

Lines changed: 28 additions & 33 deletions
@@ -11953,6 +11953,34 @@ def forward(self, x):
         ]
         self.assertEqual(len(shift_op), 1)

+    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
+    def test_distributed_all_reduce(self):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(4, 3)
+
+            def forward(self, x):
+                y = self.linear(x).abs().clamp(max=1.0) * 2
+                torch.distributed.all_reduce(y)
+                return y
+
+        try:
+            torch.distributed.init_process_group(
+                backend="fake",
+                world_size=2,
+                rank=0,
+                store=FakeStore(),
+            )
+
+            m = Foo()
+            ep = export(m, (torch.randn(4, 4),))
+            inp = (torch.randn(4, 4),)
+            self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
+
+        finally:
+            torch.distributed.destroy_process_group()
+

 @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support")
 class TestOneOffModelExportResult(TestCase):
@@ -12524,39 +12552,6 @@ def forward(self, x):
             ep.graph_module.code
         )

-    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
-    @testing.expectedFailureSerDerNonStrict  # nonstrict doesn't support allreduce
-    @testing.expectedFailureNonStrict
-    @testing.expectedFailureTrainingIRToRunDecompNonStrict  # source_fn_stack failure
-    @testing.expectedFailureRetraceabilityNonStrict
-    @testing.expectedFailureLegacyExportNonStrict
-    def test_distributed_all_reduce(self):
-        class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(4, 3)
-
-            def forward(self, x):
-                y = self.linear(x).abs().clamp(max=1.0) * 2
-                torch.distributed.all_reduce(y)
-                return y
-
-        try:
-            torch.distributed.init_process_group(
-                backend="fake",
-                world_size=2,
-                rank=0,
-                store=FakeStore(),
-            )
-
-            m = Foo()
-            ep = export(m, (torch.randn(4, 4),))
-            inp = (torch.randn(4, 4),)
-            self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
-
-        finally:
-            torch.distributed.destroy_process_group()
-
     def test_preserve_cia_op(self):
         class StaticResizeBilinear2dModule(torch.nn.Module):
             def forward(self, x):
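The relocated test traces the collective through the "fake" process group, so no real workers are spawned. A minimal standalone sketch of that setup, assuming FakeStore is importable from torch.testing._internal.distributed.fake_pg as it is in test_export.py:

import torch
import torch.distributed as dist
from torch.testing._internal.distributed.fake_pg import FakeStore

dist.init_process_group(backend="fake", world_size=2, rank=0, store=FakeStore())
try:
    t = torch.ones(4)
    dist.all_reduce(t)  # handled by the fake backend; no real communication
finally:
    dist.destroy_process_group()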

test/inductor/test_mkldnn_pattern_matcher.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def cal_conv_generated_kernel_number(mod, input, dtype, dim=4):
     ):
         input_kernel = 1
     if output.is_contiguous(memory_format=torch.contiguous_format) or (
-        TEST_ACL and (dtype == torch.bfloat16 or dtype == torch.half)
+        TEST_ACL and dtype == torch.bfloat16
     ):
         output_kernel = 1
     return input_kernel + output_kernel

test/inductor/test_torchinductor_strided_blocks.py

Lines changed: 29 additions & 0 deletions
@@ -20,6 +20,7 @@
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
+    requires_gpu,
     skip_windows_ci,
     TRITON_HAS_CPU,
 )
@@ -895,6 +896,34 @@ def func(x, y):
         )
         self.assertTrue("Min" not in code[0])

+    @requires_gpu()  # FIXME this test failed on Triton-CPU
+    def test_3d_permute_tiling(self):
+        """
+        Test 3D tiling with permute.
+        """
+
+        def foo(x, y, z):
+            dims = [0, 2, 1]
+            a = x.permute(dims=dims) + y
+            b = (z + y).permute(dims=dims)
+            return a + b
+
+        inps = (torch.rand((51, 51, 51), device=self.device, dtype=torch.float32),) * 3
+        result, (code,) = run_and_compare(
+            self,
+            foo,
+            *inps,
+            expected_num_triton_kernels=1,
+            expected_num_block_pointers=3,
+            config_patches={
+                "triton.max_tiles": 3,
+                "triton.prefer_nd_tiling": True,
+            },
+        )
+
+        # Check for 3D tiling
+        self.assertIn("ZBLOCK", code)
+

 @unittest.skipIf(not TRITON_HAS_CPU, "requires triton CPU backend")
 @config.patch(cpu_backend="triton")
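The new test drives Inductor's N-D tiling through run_and_compare's config_patches; the ZBLOCK check is what confirms a third tiling dimension was emitted. A hedged sketch of applying the same knobs directly with torch._inductor.config.patch, assuming a CUDA device is available:

import torch
from torch._inductor import config


def foo(x, y, z):
    dims = [0, 2, 1]
    a = x.permute(dims=dims) + y
    b = (z + y).permute(dims=dims)
    return a + b


inps = (torch.rand((51, 51, 51), device="cuda"),) * 3

# Same settings the test patches: allow up to 3 tiles and prefer N-D tiling.
with config.patch({"triton.max_tiles": 3, "triton.prefer_nd_tiling": True}):
    out = torch.compile(foo)(*inps)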

torch/_C/_dynamo/guards.pyi

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@ class GuardManager:
     def add_torch_function_mode_stack_guard(
         self, initial_stack, verbose_code_parts: list[str]
     ) -> None: ...
+    def add_mapping_keys_guard(self, value, verbose_code_parts: list[str]) -> None: ...

 class RootGuardManager(GuardManager):
     def get_epilogue_lambda_guards(self) -> list[LeafGuard]: ...

torch/_dynamo/guards.py

Lines changed: 10 additions & 0 deletions
@@ -1808,6 +1808,16 @@ def WEAKREF_ALIVE(self, guard):
             get_verbose_code_parts(code, guard)
         )

+    def MAPPING_KEYS_CHECK(self, guard):
+        """Guard on the key order of types.MappingProxyType object"""
+        ref = self.arg_ref(guard)
+        value = self.get(guard.name)
+
+        code = []
+        code.append(f"list({ref}.keys()) == {list(value.keys())}")
+        self._set_guard_export_info(guard, code)
+        self.get_guard_manager(guard).add_mapping_keys_guard(value, code)
+
     def DICT_KEYS_MATCH(self, guard):
         """Insert guard to check that the keys of a dict are same"""
         ref = self.arg_ref(guard)
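For illustration, the guard installed by MAPPING_KEYS_CHECK boils down to a key-order comparison on the proxy. A small sketch of what the exported expression evaluates to for a concrete proxy (the variable names here are illustrative):

import types

d = {"a": 2, "b": 3, "c": 5}
mp = types.MappingProxyType(d)

# The generated guard body is essentially this expression, with the ref
# pointing at the traced mappingproxy argument.
print(list(mp.keys()) == ["a", "b", "c"])  # True -> guard passes, reuse compiled code

d.pop("b")
print(list(mp.keys()) == ["a", "b", "c"])  # False -> guard fails, recompile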

torch/_dynamo/side_effects.py

Lines changed: 12 additions & 0 deletions
@@ -94,6 +94,9 @@ def __init__(
         self.keepalive = keepalive or []
         self.save_for_backward = save_for_backward or []
         self.tensor_hooks = tensor_hooks or {}
+        # Used by MappingProxyVariable to graph break in case of any mutated
+        # dict
+        self._has_existing_dict_mutation = False
         # Track Compiled Autograd final callbacks that must be called at the end of Compiled Autograd backward graph.
         # Only applicable if this graph is created from Dynamo tracing in Compiled Autograd.
         self.ca_final_callbacks_var = None
@@ -536,6 +539,15 @@ def mutation(self, var):
         self.check_allowed_side_effect(var)
         if isinstance(var.mutation_type, ValueMutationExisting):
             var.mutation_type.is_modified = True
+            if (
+                var.source
+                and isinstance(var, variables.ConstDictVariable)
+                and not isinstance(var, variables.SetVariable)
+            ):
+                self._has_existing_dict_mutation = True
+
+    def has_existing_dict_mutation(self):
+        return self._has_existing_dict_mutation

     def _get_modified_vars(self):
         return [var for var in self.id_to_variable.values() if self.is_modified(var)]
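The new flag gives proxy-backed variables a cheap way to notice that some pre-existing dict was mutated during tracing. A hypothetical sketch of how a consumer might use it; the real call site lives in the dicts variable tracker, which is not part of this excerpt:

# Hypothetical consumer of the flag; only has_existing_dict_mutation()
# comes from this commit, the rest is illustrative.
class Unsupported(RuntimeError):
    pass


def read_through_proxy(side_effects, proxy, key):
    # A mappingproxy may alias any dict that is already being tracked, so once
    # some existing dict has been mutated during tracing, reads through the
    # proxy can no longer be proven sound without a graph break.
    if side_effects.has_existing_dict_mutation():
        raise Unsupported("mapping proxy read after existing dict mutation")
    return proxy[key]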

torch/_dynamo/variables/builder.py

Lines changed: 28 additions & 0 deletions
@@ -158,6 +158,7 @@
     DefaultDictVariable,
     DictKeySetVariable,
     FrozensetVariable,
+    MappingProxyVariable,
     SetVariable,
 )
 from .distributed import (
@@ -472,6 +473,7 @@ def _type_dispatch_impl(cls, trace_numpy):
             (weakref.ReferenceType, cls.wrap_weakref),
             (torch.utils.hooks.RemovableHandle, cls.wrap_removable_handle),
             (torch.jit.ScriptFunction, cls.wrap_jit_function),
+            (types.MappingProxyType, cls.wrap_mapping_proxy),
         ]

         if trace_numpy and np:
@@ -507,6 +509,32 @@ def wrap_jit_function(self, value):
             value, "_torchdynamo_inline", source=self.source
         )

+    def wrap_mapping_proxy(self, value):
+        self.install_guards(GuardBuilder.TYPE_MATCH)
+        # This might be suboptimal compared to dict guards. But mappingproxy is
+        # not very common, so it's ok to guard on all keys.
+        self.install_guards(GuardBuilder.MAPPING_KEYS_CHECK)
+        all_const = all(ConstantVariable.is_literal(k) for k in value.keys())
+
+        if not all_const:
+            unimplemented("mapping proxy type supports only const keys")
+
+        def build_key_value(k, v):
+            key = ConstantVariable.create(k)
+            source_key = k
+
+            source_value = GetItemSource(self.get_source(), source_key)
+            value = LazyVariableTracker.create(v, source_value)
+
+            return key, value
+
+        items = dict(build_key_value(k, v) for k, v in value.items())
+
+        # Create a dict_vt to be used in the mapping proxy variable
+        dict_vt = ConstDictVariable(items, source=None)
+        result = MappingProxyVariable(dict_vt, source=self.source)
+        return self.tx.output.side_effects.track_mutable(value, result)
+
     @classmethod
     @functools.lru_cache(None)
     def _id_dispatch(
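Taken together with the guard and side-effect changes above, wrap_mapping_proxy lets torch.compile trace reads from a mappingproxy without a graph break, guarded by TYPE_MATCH and MAPPING_KEYS_CHECK. A small usage sketch mirroring the new tests:

import types

import torch


def fn(x, mp):
    y = torch.sin(x * mp["a"])
    for k, v in mp.items():
        y = y + torch.cos(x * v)
    return y


opt_fn = torch.compile(fn, backend="eager", fullgraph=True)

d = {"a": 2, "b": 3}
mp = types.MappingProxyType(d)
x = torch.randn(4)

print(torch.allclose(opt_fn(x, mp), fn(x, mp)))  # True

d["c"] = 7  # key-set change: MAPPING_KEYS_CHECK fails, the next call recompiles
print(torch.allclose(opt_fn(x, mp), fn(x, mp)))  # True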
