Back out "Precompile triton templates (#121998)" (#123305) · pytorch/pytorch@e0c9764 · GitHub

Commit e0c9764

yoyoyocmu authored and pytorchmergebot committed
Back out "Precompile triton templates (#121998)" (#123305)
Summary: We are reverting #121998 because the change, combined with search-autotune-cache, led to a significant compilation time increase, causing the stuck job detector to trigger and then kill the training job.

Test Plan: CI tests

Reviewed By: nmacchioni

Differential Revision: D55712203

Pull Request resolved: #123305

Approved by: https://github.com/eellison, https://github.com/nmacchioni, https://github.com/xw285cornell
1 parent 595613d commit e0c9764

File tree

4 files changed (+24, -46 lines)

torch/_inductor/autotune_process.py

Lines changed: 6 additions & 8 deletions
@@ -502,6 +502,7 @@ def benchmark(
 class TritonBenchmarkRequest(BenchmarkRequest):
     # Important: Instances of this class have to be serializable
     # across process boundaries. Do not put CUDA Tensors in here!
+
     def __init__(
         self,
         kernel_name: str,
@@ -544,8 +545,6 @@ def make_run_fn(
         if "warmup" in inspect.signature(run_method).parameters:
             warmup_arg["warmup"] = False

-        from torch._C import _cuda_getCurrentRawStream as get_raw_stream
-
         if torch.version.hip and self.matrix_instr_nonkdim != 0:
             return functools.partial(
                 run_method,
@@ -554,7 +553,9 @@ def make_run_fn(
                 *self.extra_args,
                 grid=self.grid,
                 **warmup_arg,
-                stream=get_raw_stream(self.output_tensor_meta.device.index),
+                num_stages=self.num_stages,
+                num_warps=self.num_warps,
+                matrix_instr_nonkdim=self.matrix_instr_nonkdim,
             )
         else:
             return functools.partial(
@@ -564,13 +565,10 @@ def make_run_fn(
                 *self.extra_args,
                 grid=self.grid,
                 **warmup_arg,
-                stream=get_raw_stream(self.output_tensor_meta.device.index),
+                num_stages=self.num_stages,
+                num_warps=self.num_warps,
             )

-    def precompile(self):
-        mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
-        getattr(mod, self.kernel_name).precompile()
-
     def __str__(self) -> str:
         return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}"
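For context on the make_run_fn change above: the reverted version goes back to binding the Triton launch parameters (num_stages, num_warps, and matrix_instr_nonkdim on ROCm) into a functools.partial over the kernel's run method, rather than resolving a raw CUDA stream up front. Below is a minimal, self-contained sketch of that binding pattern; the run function and all values are illustrative stand-ins, not inductor's API.

import functools

def run(*args, grid=None, warmup=False, num_stages=1, num_warps=4):
    # Stand-in for a compiled Triton kernel's run method.
    print(f"{len(args)} args, grid={grid}, warmup={warmup}, "
          f"num_stages={num_stages}, num_warps={num_warps}")

# Hypothetical values standing in for self.grid / self.num_stages / self.num_warps.
run_fn = functools.partial(run, grid=(8, 1, 1), warmup=False, num_stages=3, num_warps=8)
run_fn("input_buf", "weight_buf")  # extra args are forwarded positionally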

torch/_inductor/codegen/triton_utils.py

Lines changed: 3 additions & 27 deletions
@@ -63,32 +63,6 @@ def signature_to_meta(
     }


-def is_unaligned_buffer(arg: TensorArg):
-    buf_name = arg.buffer
-    if buf_name in V.graph.graph_inputs:
-        return not config.assume_aligned_inputs
-
-    if buf_name in V.graph.constants:
-        # all constants are assumed to be aligned
-        return False
-
-    if V.graph.scheduler:
-        layout = V.graph.scheduler.get_buffer_layout(buf_name)
-    else:
-        buffer = V.graph.get_buffer(buf_name)
-        # output arg
-        if not buffer:
-            assert buf_name == V.kernel.output_node.name
-            layout = V.kernel.output_node.layout
-        else:
-            layout = buffer.get_layout()
-
-    if isinstance(layout, torch._inductor.ir.NonOwningLayout):
-        return not layout.maybe_guard_aligned()
-    else:
-        return False
-
-
 def config_of(
     args: List[KernelArgType],
     *,
@@ -107,7 +81,9 @@ def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool:
                 offset_aligned = V.graph.sizevars.statically_known_multiple_of(
                     x.offset * x.dtype.itemsize, alignment  # type: ignore[arg-type]
                 )
-                return offset_aligned and not is_unaligned_buffer(x)
+                return offset_aligned and not V.graph.scheduler.is_unaligned_buffer(
+                    x.buffer
+                )
             else:
                 return False
         if isinstance(x, SizeArg):
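After this change, config_of keeps its static offset check but asks the scheduler about the underlying buffer via V.graph.scheduler.is_unaligned_buffer(x.buffer). A toy sketch of the combined condition, using stand-in helpers instead of the real sizevars and scheduler objects:

ALIGNMENT = 16  # bytes; illustrative value

def statically_known_multiple_of(value, divisor):
    # Stand-in for V.graph.sizevars.statically_known_multiple_of.
    return value % divisor == 0

def tensor_arg_is_aligned(offset_elems, itemsize, buffer_is_unaligned):
    # Aligned only if the byte offset is provably a multiple of ALIGNMENT
    # and the scheduler does not flag the underlying buffer as unaligned.
    offset_aligned = statically_known_multiple_of(offset_elems * itemsize, ALIGNMENT)
    return offset_aligned and not buffer_is_unaligned

print(tensor_arg_is_aligned(4, 4, buffer_is_unaligned=False))  # True: 16 bytes
print(tensor_arg_is_aligned(4, 4, buffer_is_unaligned=True))   # False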

torch/_inductor/scheduler.py

Lines changed: 11 additions & 2 deletions
@@ -2454,9 +2454,18 @@ def codegen(self):

         self.flush()

-    def get_buffer_layout(self, buf_name: str) -> ir.Layout:
+    def is_unaligned_buffer(self, buf_name):
+        if buf_name in V.graph.graph_inputs:
+            return not config.assume_aligned_inputs
+        if buf_name in V.graph.constants:
+            # all constants are assumed to be aligned
+            return False
         node = self.name_to_node[buf_name]
-        return node.node.get_layout()
+        layout = node.node.get_layout()
+        if isinstance(layout, ir.NonOwningLayout):
+            return not layout.maybe_guard_aligned()
+        else:
+            return False


 class BaseScheduling:
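The helper moved back onto the Scheduler checks buffers in a fixed order: graph inputs defer to config.assume_aligned_inputs, constants are always treated as aligned, and otherwise only a NonOwningLayout that cannot guard alignment is reported as unaligned. A self-contained sketch of that decision order, using stub objects rather than the real Scheduler:

class NonOwningLayout:
    # Stub standing in for torch._inductor.ir.NonOwningLayout.
    def __init__(self, provably_aligned):
        self.provably_aligned = provably_aligned

    def maybe_guard_aligned(self):
        return self.provably_aligned

def is_unaligned_buffer(buf_name, graph_inputs, constants, layouts,
                        assume_aligned_inputs=True):
    if buf_name in graph_inputs:
        return not assume_aligned_inputs  # inputs defer to the config flag
    if buf_name in constants:
        return False                      # all constants are assumed aligned
    layout = layouts[buf_name]
    if isinstance(layout, NonOwningLayout):
        return not layout.maybe_guard_aligned()
    return False                          # other layouts are treated as aligned

print(is_unaligned_buffer("buf0", set(), set(), {"buf0": NonOwningLayout(False)}))  # True
print(is_unaligned_buffer("arg0", {"arg0"}, set(), {}))                             # False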

torch/_inductor/select_algorithm.py

Lines changed: 4 additions & 9 deletions
@@ -94,7 +94,7 @@ def __init__(
         grid_fn,
         meta,
         call_sizes,
-        use_jit=False,
+        use_jit=True,
         prefix_args=0,
         suffix_args=0,
         epilogue_fn=identity,
@@ -150,8 +150,8 @@ def jit_lines(self):
         argdefs, _, signature = self.args.python_argdefs()
         triton_meta = {
             "signature": signature_to_meta(signature, size_dtype=self.index_dtype),
-            "device": self.output_node.get_device().index,
-            "device_type": self.output_node.get_device().type,
+            "device": V.graph.scheduler.current_device.index,
+            "device_type": V.graph.scheduler.current_device.type,
             "constants": {},
         }
         triton_meta["configs"] = [config_of(signature)]
@@ -502,7 +502,7 @@ def generate(
         ), TritonTemplateKernel(
             kernel_name=kernel_name,
             output_node=fake_out,
-            use_jit=False,
+            use_jit=True,
             **kernel_options,
         ) as kernel:
             try:
@@ -688,10 +688,6 @@ def benchmark(self, *args, out):
         assert self.bmreq is not None
         return self.bmreq.benchmark(*args, output_tensor=out)

-    def precompile(self):
-        assert self.bmreq is not None
-        self.bmreq.precompile()
-
     def __str__(self):
         return f"TritonTemplateCaller({self.bmreq.module_path}, {self.debug_extra})"

@@ -832,7 +828,6 @@ def __call__(

        # TODO(nmacchioni): remove once CI tests are fixed
        choices = [choice for choice in choices if choice is not None]
-
        if len(choices) == 0:
            raise RuntimeError(
                "No choices to select, please consider adding ATEN into max_autotune_gemm_backends "

0 commit comments
