Update on "[Cutlass] Implement Epilogue Argument emitter" · pytorch/pytorch@750b7f5 · GitHub

Commit 750b7f5

Update on "[Cutlass] Implement Epilogue Argument emitter"
This implements epilogue visitor tree (EVT) argument generation (example type [here](https://github.com/NVIDIA/cutlass/blob/3fe62887d8dd75700fdaf57f9c181878701b0802/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp#L332)).

Details: The codegen task is to implement a function that generates a tree of C++ structs, extracting the correct properties from Inductor buffers and writing them to the correct locations in the generated struct. To implement this with a minimum of code, I generate the cutlass DAGIR (the EVT internal representation), which has a dedicated pass, [pass_argument_type.py](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L4), that generates a nested tree of custom argument types for each node in the DAGIR. This nested tree of constructors is then passed kwargs to fill in the proper values, where each node's name is used to differentiate between values in the kwarg dictionary. This is, however, non-customizable: the nested tree of EVT args is a nested tree of ctypes that expects *actual values*, so that the resulting object can be passed directly to the cutlass-python C++ runner. Inductor, on the other hand, needs to fill this struct with C++ string expressions representing the values (or extracting the values from kernel launcher args). So `_render_argument_type` implements this: it iterates over the tree of types created by pass_argument_type.py and generates a string representing the nested structs, filling in C++ expressions for the different fields.

Long term plan: I will ask NVIDIA to provide an overridable [visitor_factory](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L82), which would allow us to override the behavior of pass_argument_type.py so it generates the string we want during DAGIR generation.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov

[ghstack-poisoned]
2 parents 2d629c2 + 10e2ace commit 750b7f5
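
To make the rendering idea above concrete, here is a minimal Python sketch of walking a nested argument-type tree and emitting a C++ brace-initializer string whose leaves are C++ expressions instead of concrete values. The `ArgType` class, field names, and `render_argument_type` helper are invented for illustration; this is not the actual `_render_argument_type` code, which walks the ctypes tree produced by pass_argument_type.py.

```python
# Hypothetical sketch of the "render a nested argument struct as C++ text" idea.
from dataclasses import dataclass, field


@dataclass
class ArgType:
    # Name of the DAGIR node this argument struct belongs to.
    name: str
    # Nested sub-structs (one per child node) and leaf field names.
    children: list["ArgType"] = field(default_factory=list)
    fields: list[str] = field(default_factory=list)


def render_argument_type(node: ArgType, exprs: dict, indent: int = 0) -> str:
    """Render a nested argument struct as a C++ aggregate initializer.

    `exprs` maps node name -> field name -> C++ expression string, e.g.
    {"bias": {"ptr": "(float*)(args[3])", "stride": "K"}}.
    """
    pad = "  " * indent
    lines = [f"{pad}{{  // {node.name}"]
    for child in node.children:
        lines.append(render_argument_type(child, exprs, indent + 1) + ",")
    for f_name in node.fields:
        cpp_expr = exprs.get(node.name, {}).get(f_name, "{}")
        lines.append(f"{pad}  /* {f_name} = */ {cpp_expr},")
    lines.append(f"{pad}}}")
    return "\n".join(lines)


if __name__ == "__main__":
    # Toy tree: an epilogue arg struct with one nested node ("bias") and a scalar.
    tree = ArgType(
        name="epilogue",
        children=[ArgType(name="bias", fields=["ptr", "stride"])],
        fields=["alpha"],
    )
    print(render_argument_type(
        tree,
        {"bias": {"ptr": "(float*)(args[3])", "stride": "K"}, "epilogue": {"alpha": "1.0f"}},
    ))
```

The important property, as described in the commit message, is that the leaves are C++ expressions (strings), so the rendered text can be pasted into the generated kernel launcher rather than requiring concrete runtime values as the cutlass-python ctypes tree does.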


44 files changed: +995 / -111 lines

aten/src/ATen/native/TensorShape.cpp

Lines changed: 1 addition & 1 deletion
@@ -3366,7 +3366,7 @@ static std::vector<Tensor> _pad_chunk(
     std::vector<int64_t> view_sizes(
         tensor_size.begin(), tensor_size.begin() + dim);
     view_sizes.insert(view_sizes.end(), {num_chunks, -1});
-    padded_tensors.push_back(padded_tensor.view(view_sizes));
+    padded_tensors.push_back(padded_tensor.reshape(view_sizes));
   }
   return padded_tensors;
 }

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 29 additions & 15 deletions
@@ -612,28 +612,41 @@ struct check_binary_functor_types_for_specialization<
 };

 // The following is a list of type specializations for vectorized_templated
-// elementwise kernel. It refers to the first and second runtime types of the
-// arguments of a binary functor.
-
+// elementwise kernel. The three types refer to runtime types of the output
+// tensor, first tensor argument, and the second tensor argument used for a
+// binary functor.
 constexpr std::array rt_binary_specializations = {
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<float>::value,
          c10::CppTypeToScalarType<BFloat16>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
+        {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<BFloat16>::value,
+         c10::CppTypeToScalarType<float>::value}),
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<BFloat16>::value,
+         c10::CppTypeToScalarType<BFloat16>::value,
          c10::CppTypeToScalarType<float>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<float>::value,
          c10::CppTypeToScalarType<Half>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
+        {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<Half>::value,
+         c10::CppTypeToScalarType<float>::value}),
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<Half>::value,
+         c10::CppTypeToScalarType<Half>::value,
          c10::CppTypeToScalarType<float>::value})};

 bool check_binary_rt_types_for_specialization(TensorIteratorBase& iter) {
   if (iter.ninputs() != 2)
     return false;
   for (auto spec : rt_binary_specializations)
-    if (iter.input_dtype(0) == spec[0] && iter.input_dtype(1) == spec[1])
+    if (iter.dtype(0) == spec[0] && iter.input_dtype(0) == spec[1] &&
+        iter.input_dtype(1) == spec[2])
       return true;
   return false;
 }
@@ -648,6 +661,7 @@ struct type_specialized_kernel_launcher {
       typename loader_t,
       typename storer_t>
   static void apply(
+      ScalarType ret_t,
       ScalarType arg0_t,
       ScalarType arg1_t,
       int64_t numel,
@@ -657,22 +671,22 @@
       out_calc_t output_offset_calculator,
       loader_t loader,
       storer_t storer) {
-    using traits = function_traits<func_t>;
-    using return_t = typename traits::result_type;
-    if (arg0_t == rt_binary_specializations[arg_index][0] &&
-        arg1_t == rt_binary_specializations[arg_index][1])
+    if (ret_t == rt_binary_specializations[arg_index][0] &&
+        arg0_t == rt_binary_specializations[arg_index][1] &&
+        arg1_t == rt_binary_specializations[arg_index][2])
       launch_vectorized_templated_kernel<
           func_t,
           array_t,
           inp_calc_t,
           out_calc_t,
           loader_t,
           storer_t,
-          return_t,
           decltype(c10::impl::ScalarTypeToCPPType<
                    rt_binary_specializations[arg_index][0]>::t),
           decltype(c10::impl::ScalarTypeToCPPType<
-                   rt_binary_specializations[arg_index][1]>::t)>(
+                   rt_binary_specializations[arg_index][1]>::t),
+          decltype(c10::impl::ScalarTypeToCPPType<
+                   rt_binary_specializations[arg_index][2]>::t)>(
           numel,
           f,
          data,
@@ -712,7 +726,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
 #ifdef USE_ROCM
   // Attempt to call specialized vectorized elementwise kernel
   // that enables interleaving.
-
   if (check_binary_rt_types_for_specialization(iter) &&
       memory::can_vectorize_up_to<func_t>(data) > 1) {
     // constexpr to reduce the amount of kernels generated for
@@ -740,6 +753,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
           type_specialized_kernel_launcher,
           rt_binary_specializations.size()>::
           with_args(
+              iter.dtype(0),
               iter.input_dtype(0),
               iter.input_dtype(1),
               numel,

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 2 additions & 2 deletions
@@ -407,8 +407,8 @@ struct vectorized_templated {
   // float(float,bfloat16) and functor add on float(float,float).
   template <typename scalar_t>
   __device__ inline void store(scalar_t* from, int idx) {
-    using vec_t = aligned_vector<scalar_t, vec_size>;
-    scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
+    using vec_t = aligned_vector<CastToT, vec_size>;
+    CastToT* to = reinterpret_cast<CastToT*>(data[0]) + block_work_size * idx;
     vec_t* to_ = reinterpret_cast<vec_t*>(to);
     int thread_idx = threadIdx.x;
 #pragma unroll

aten/src/ATen/native/cuda/TensorShape.cu

Lines changed: 4 additions & 3 deletions
@@ -422,11 +422,12 @@ static __global__ void chunk_cat_cuda_kernel(
 }

 bool all_contiguous(TensorList tensors) {
-  bool contiguous = true;
   for (const auto& t : tensors) {
-    contiguous &= t.is_non_overlapping_and_dense();
+    if (!t.is_contiguous()) {
+      return false;
+    }
   }
-  return contiguous;
+  return true;
 }

 // Get leading dimensions before `dim`-th dimension.

aten/src/ATen/native/transformers/attention.cpp

Lines changed: 1 addition & 1 deletion
@@ -449,7 +449,7 @@ REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
 REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
-REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta);
+REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)

 int64_t _fused_sdp_choice_meta(
     const Tensor& query_,

cmake/Dependencies.cmake

Lines changed: 3 additions & 0 deletions
@@ -1177,6 +1177,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
     set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
   endif()
   add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe)
+  # Suppress warning to unblock libnop comiplation by clang-17
+  # See https://github.com/pytorch/pytorch/issues/151316
+  target_compile_options_if_supported(tensorpipe -Wno-missing-template-arg-list-after-template-kw)
   if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
     unset(CMAKE_POLICY_VERSION_MINIMUM)
   endif()

test/cpp_extensions/open_registration_extension/setup.py

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,6 @@
 import distutils.command.clean
 import os
+import platform
 import shutil
 import sys
 from pathlib import Path
@@ -40,6 +41,9 @@ def run(self):
         CXX_FLAGS = ["/sdl"]
     else:
         CXX_FLAGS = ["/sdl", "/permissive-"]
+elif platform.machine() == "s390x":
+    # no -Werror on s390x due to newer compiler
+    CXX_FLAGS = {"cxx": ["-g", "-Wall"]}
 else:
     CXX_FLAGS = {"cxx": ["-g", "-Wall", "-Werror"]}

test/inductor/test_aot_inductor_package.py

Lines changed: 57 additions & 0 deletions
@@ -467,6 +467,63 @@ def forward(self, a):
         output = compiled(test_inputs)
         self.assertEqual(expected, output)

+    @skipif(
+        lambda device, package_cpp_only: device == "cpu" or package_cpp_only,
+        "No support for cpp only and cpu",
+    )
+    def test_package_user_managed_weight(self):
+        class Model(torch.nn.Module):
+            def __init__(self, n, k, device):
+                super().__init__()
+                self.linear = torch.nn.Linear(k, n, device=device)
+
+            def forward(self, a):
+                return self.linear(a)
+
+        M, N, K = 128, 4096, 4096
+        model = Model(N, K, self.device)
+        example_inputs = (torch.randn(M, K, device=self.device),)
+
+        inductor_configs = {
+            "always_keep_tensor_constants": True,
+            "aot_inductor.package_constants_in_so": False,
+        }
+        compiled = compile(model, example_inputs, inductor_configs=inductor_configs)
+
+        self.assertEqual(
+            set(compiled.get_constant_fqns()), set(model.state_dict().keys())
+        )
+
+        compiled.load_constants(
+            model.state_dict(), check_full_update=True, user_managed=False
+        )
+
+        test_inputs = torch.randn(M, K, device=self.device)
+        expected = model(test_inputs)
+        output = compiled(test_inputs)
+        self.assertEqual(expected, output)
+
+        # Let's try to modify the weight in-place, result shouldn't change.
+        model.linear.weight.data *= 3.7
+        new_output = compiled(test_inputs)
+        self.assertEqual(new_output, output)
+
+        # Recreate a new model that we will test against user_managed=True
+        new_compiled = compile(model, example_inputs, inductor_configs=inductor_configs)
+        new_compiled.load_constants(
+            model.state_dict(), check_full_update=True, user_managed=True
+        )
+
+        expected = model(test_inputs)
+        new_output = new_compiled(test_inputs)
+        self.assertEqual(expected, new_output)
+
+        # Try to modify the weight in-place, result should change.
+        model.linear.weight.data *= 3.7
+        expected = model(test_inputs)
+        new_output = new_compiled(test_inputs)
+        self.assertEqual(new_output, expected)
+
     def test_deepcopy_compiled_model(self):
         class Model(torch.nn.Module):
             def forward(self, x, y):

test/inductor/test_codecache.py

Lines changed: 130 additions & 0 deletions
@@ -3,6 +3,8 @@
 import os
 import pickle
 import shutil
+import subprocess
+import sys
 import tempfile
 import unittest
 from typing import Optional, Union
@@ -11,6 +13,7 @@
 import torch
 from torch._dynamo import reset
 from torch._dynamo.utils import counters
+from torch._functorch import config as functorch_config
 from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
 from torch._inductor import config, metrics
 from torch._inductor.codecache import (
@@ -35,6 +38,7 @@
 from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
+    IS_FBCODE,
     parametrize,
     TEST_WITH_ROCM,
 )
@@ -1376,6 +1380,132 @@ def forward(self, x):
         )


+@instantiate_parametrized_tests
+class TestStandaloneCompile(TestCase):
+    def setUp(self):
+        super().setUp()
+        counters.clear()
+        PatchCaches.setUp()
+        CacheArtifactManager.clear()
+
+    def tearDown(self):
+        super().tearDown()
+        PatchCaches.tearDown()
+
+    def reset(self):
+        AOTAutogradCache.clear()
+        PyCodeCache.cache_clear(purge=True)
+        torch._dynamo.reset()
+        clear_inductor_caches()
+
+    def capture(self, fn):
+        def inner(*args):
+            gm = None
+            actual_args = None
+            kwargs = None
+
+            def backend(gm_, args_, **kwargs_):
+                nonlocal gm
+                nonlocal actual_args
+                nonlocal kwargs
+                gm = gm_
+                actual_args = args_
+                kwargs = kwargs_
+                return gm
+
+            _ = torch.compile(fn, fullgraph=True, backend=backend)(*args)
+            return gm, actual_args, kwargs
+
+        return inner
+
+    @config.patch({"fx_graph_cache": True})
+    @config.patch({"fx_graph_remote_cache": False})
+    @functorch_config.patch({"enable_autograd_cache": True})
+    @parametrize("format", ("binary", "unpacked"))
+    @parametrize("dynamic", (False, True))
+    def test_basic(self, format: str, dynamic: bool) -> None:
+        mod = torch.nn.Linear(1, 3)
+        x = torch.randn(4, 1)
+        if dynamic:
+            torch._dynamo.mark_dynamic(x, 0)
+
+        def f(x):
+            with torch.no_grad():
+                return mod(x)
+
+        eager_out = f(x)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            path = (
+                temp_dir
+                if format == "unpacked"
+                else os.path.join(temp_dir, "compiled_artifact.bin")
+            )
+            with fresh_inductor_cache():
+                gm, args, kwargs = self.capture(f)(x)
+                assert not kwargs
+
+                compiled_artifact = torch._inductor.standalone_compile(gm, args)
+                compiled_artifact.save(path=path, format=format)
+
+            self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
+
+            with fresh_inductor_cache():
+                loaded = torch._inductor.CompiledArtifact.load(path=path, format=format)
+                compiled_out = loaded(*args)
+                self.assertEqual(eager_out, compiled_out)
+
+            self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
+
+    @unittest.skipIf(IS_FBCODE, "torch import error")
+    @config.patch({"fx_graph_cache": True})
+    @config.patch({"fx_graph_remote_cache": False})
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_different_process(self):
+        x = torch.ones(4, 1)
+
+        def f(x):
+            return x.sin() * 2
+
+        gm, args, kwargs = self.capture(f)(x)
+        assert not kwargs
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            path = os.path.join(temp_dir, "compiled_artifact.bin")
+
+            with fresh_inductor_cache():
+                compiled_artifact = torch._inductor.standalone_compile(gm, args)
+                compiled_artifact.save(path=path)
+
+            script = f"""
+import torch
+from torch._inductor.utils import fresh_inductor_cache
+
+arg = torch.ones(4, 1)
+with fresh_inductor_cache():
+    loaded = torch._inductor.CompiledArtifact.load(path="{path}")
+    compiled_result = loaded(arg)
+
+eager_result = arg.sin() * 2
+
+if not torch.allclose(eager_result, compiled_result, atol=0.1, rtol=0.01):
+    raise RuntimeError("tensors do not match")
+"""
+            try:
+                subprocess.check_output(
+                    [sys.executable, "-c", script],
+                    stderr=subprocess.STDOUT,
+                    cwd=os.path.dirname(os.path.realpath(__file__)),
+                )
+            except subprocess.CalledProcessError as e:
+                self.fail(
+                    msg=(
+                        "Subprocess exception while attempting to run test: "
+                        + e.output.decode("utf-8")
                    )
                )


 class TestFxGraphCacheHashing(TestCase):
     def test_parameter_constants(self):
         """

0 commit comments