[associative_scan] Autograd separated by bohnstingl · Pull Request #139939 · pytorch/pytorch

[associative_scan] Autograd separated #139939


Open

bohnstingl wants to merge 34 commits into main

Changes from 1 commit (of 34 commits)
9c49a36
WIP: Associative_scan Autograd
bohnstingl Nov 6, 2024
100c598
Working implementation of Autograd
bohnstingl Nov 6, 2024
0e7c8d5
Added partial gradient tests
bohnstingl Nov 7, 2024
67d62fb
Separated out the partial gradient functionality to a separate PR
bohnstingl Nov 7, 2024
9e5e1f9
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Jan 18, 2025
6b41565
Working version uncleaned
bohnstingl Mar 8, 2025
d68b31b
Almost all tests pass
bohnstingl Mar 8, 2025
9de0caf
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Mar 8, 2025
653eab0
First working implementation of simplified autograd for combine_mode=…
bohnstingl Mar 9, 2025
ac1a12b
Merge branch 'associative_scan_74' of github.com:bohnstingl/pytorch i…
bohnstingl Mar 24, 2025
d51b16c
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Mar 24, 2025
dbfd544
WIP: No additional_input support yet
bohnstingl Mar 24, 2025
64a8b29
Updates and cosmetic fixes
bohnstingl Mar 25, 2025
08b7251
Fixed problem with python<3.11
bohnstingl Mar 25, 2025
f378827
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Mar 26, 2025
b12777e
Fixed issue with adding tuple and list
bohnstingl Mar 26, 2025
6335022
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Mar 27, 2025
1c9fa70
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Mar 27, 2025
6d8353b
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Apr 2, 2025
3f81639
Rework to improve readability and unify shared function with scan
bohnstingl Apr 2, 2025
62770d5
Removed irrelevant testcases, improved readability, extended document…
bohnstingl Apr 3, 2025
6761714
Restructured documentation and synced with associative_scan
bohnstingl Apr 8, 2025
b73c553
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Apr 8, 2025
ea9568d
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Apr 15, 2025
0786e6f
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Apr 24, 2025
c628dd8
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl May 8, 2025
e7d63b6
Skipping autograd test for associative_scan with lifted arguments
bohnstingl May 8, 2025
95d0e42
Reworked documentation
bohnstingl May 8, 2025
3707c0d
Factored failing test out into separate test
bohnstingl May 9, 2025
a565834
Rework of documentation
bohnstingl May 16, 2025
122d70b
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Jul 22, 2025
d89bca6
Merge branch 'main' of github.com:pytorch/pytorch into associative_sc…
bohnstingl Jul 26, 2025
c1f1267
Removed former failing testcase for associative_scan
bohnstingl Jul 27, 2025
a59ad4c
Consolidated utility functions between scan.py and associative_scan.p…
bohnstingl Jul 27, 2025
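
The commit history above builds up autograd support for associative_scan. For context, a minimal sketch of the behavior this PR targets, assuming the prototype torch._higher_order_ops API (the exact import path and eager-mode behavior may vary by build, so treat this as illustrative, not the PR's implementation):

import torch
from torch._higher_order_ops.associative_scan import associative_scan

def add(a, b):
    # Any associative combine_fn works; addition yields an inclusive prefix sum.
    return a + b

x = torch.arange(4, dtype=torch.float32, requires_grad=True)
y = associative_scan(add, x, dim=0, combine_mode="generic")  # [0., 1., 3., 6.]

# What this PR adds: backpropagation through the scan.
y.sum().backward()
print(x.grad)  # element i contributes to 4 - i prefix sums -> [4., 3., 2., 1.]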
Factored failing test out into separate test
bohnstingl committed May 9, 2025
commit 3707c0df1bdc33a0f4d9fea2d2d74f11bcdec9a0
70 changes: 70 additions & 0 deletions test/functorch/test_control_flow.py
@@ -3485,6 +3485,18 @@ def _prepare_fake_kwargs(self, original_kwargs):
            )
        ),
    )
    # Skipping this combination as there is a CPP compilation failure that
    # may be unrelated to associative_scan itself. There is a dedicated test for
    # this case below.
    @decorateIf(
        unittest.skip,
        lambda params: (
            params["compile_mode"] == "compile_dynamic_shape"
            and params["combine_mode"] == "generic"
            and params["device"] == torch.device("cpu")
            and params["autograd"]
        ),
    )
    def test_associative_scan_compile(
        self, combine_mode, reverse, compile_mode, device, autograd
    ):
@@ -3535,6 +3547,64 @@ def test_associative_scan_compile

        self.assertEqual(result, results_torch)

    @unittest.skipIf(not SM70OrLater, "triton")
    @requires_cuda
    @unittest.expectedFailure
    @parametrize("reverse", [False, True])
    @parametrize("compile_mode", ["compile_dynamic_shape"])
    @parametrize("combine_mode", ["generic"])
    @parametrize("device", [torch.device("cpu")])
    @parametrize("autograd", [True])
    def test_associative_scan_compile_fail(
bohnstingl (Collaborator, Author) commented:
I separated out these two specific cases, which lead to C++ compilation failures that I don't think are necessarily related to associative_scan, as they appear only here. I marked those tests as expected failures; fixing them should happen in a follow-up PR. For reference, the issue we observe with those tests is:

ERROR: test_associative_scan_compile_fail_reverse_True_compile_mode_compile_dynamic_shape_combine_mode_generic_cpu_autograd_True (__main__.AssociativeScanTests.test_associative_scan_compile_fail_reverse_True_compile_mode_compile_dynamic_shape_combine_mode_generic_cpu_autograd_True)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/data_malta3_ssd/pytorch/torch/testing/_internal/common_utils.py", line 3154, in wrapper
    method(*args, **kwargs)
  File "/data_malta3_ssd/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/data_malta3_ssd/pytorch/test/functorch/test_control_flow.py", line 3594, in test_associative_scan_compile_fail
    result = self._run_test(
             ^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/test/functorch/test_control_flow.py", line 3456, in _run_test
    self._check_autograd(result, result_exp, autograd_param)
  File "/data_malta3_ssd/pytorch/test/functorch/test_control_flow.py", line 3444, in _check_autograd
    grads = torch.autograd.grad(result_flatten, grad_param, grad_init)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/autograd/__init__.py", line 503, in grad
    result = _engine_run_backward(
             ^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/autograd/graph.py", line 829, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/autograd/function.py", line 307, in apply
    return user_fn(self, *args)
           ^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2179, in backward
    return impl_fn()
           ^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2165, in impl_fn
    out = CompiledFunction._backward_impl(ctx, all_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2257, in _backward_impl
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
                                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_functorch/aot_autograd.py", line 483, in __call__
    return self.compiler_fn(gm, example_inputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_dynamo/backends/common.py", line 73, in _wrapped_bw_compiler
    disable(
  File "/data_malta3_ssd/pytorch/torch/_dynamo/eval_frame.py", line 872, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_utils_internal.py", line 97, in wrapper_function
    return function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 2234, in bw_compiler
    return inner_compile(
           ^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 710, in compile_fx_inner
    return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_dynamo/repro/after_aot.py", line 124, in debug_wrapper
    inner_compiled_fn = compiler_fn(gm, example_inputs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 880, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 864, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 1487, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 1374, in codegen_and_compile
    compiled_module = graph.compile_to_module()
                      ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2238, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2248, in _compile_to_module
    mod = self._compile_to_module_lines(wrapper_code)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2312, in _compile_to_module_lines
    mod = PyCodeCache.load_by_key_path(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 3022, in load_by_key_path
    mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/runtime/compile_tasks.py", line 31, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_boh/cd/ccdmedyasojek2hupoamg66aetwrqjyasivhvip62n63grppshs5.py", line 416, in <module>
    async_compile.wait(globals())
  File "/data_malta3_ssd/pytorch/torch/_inductor/async_compile.py", line 481, in wait
    self._wait_futures(scope)
  File "/data_malta3_ssd/pytorch/torch/_inductor/async_compile.py", line 501, in _wait_futures
    kernel = result.result()
             ^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 3524, in result
    return self.result_fn()
           ^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 2505, in future
    result = get_result()
             ^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 2313, in load_fn
    future.result()
  File "/data_malta3_ssd/miniforge3/envs/pt23/lib/python3.11/concurrent/futures/_base.py", line 449, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/miniforge3/envs/pt23/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 864, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 1487, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/compile_fx.py", line 1374, in codegen_and_compile
    compiled_module = graph.compile_to_module()
                      ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2238, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2248, in _compile_to_module
    mod = self._compile_to_module_lines(wrapper_code)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/graph.py", line 2312, in _compile_to_module_lines
    mod = PyCodeCache.load_by_key_path(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 3022, in load_by_key_path
    mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/runtime/compile_tasks.py", line 31, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_boh/cd/ccdmedyasojek2hupoamg66aetwrqjyasivhvip62n63grppshs5.py", line 416, in <module>
    async_compile.wait(globals())
  File "/data_malta3_ssd/pytorch/torch/_inductor/async_compile.py", line 481, in wait
    self._wait_futures(scope)
  File "/data_malta3_ssd/pytorch/torch/_inductor/async_compile.py", line 501, in _wait_futures
    kernel = result.result()
             ^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 3524, in result
    return self.result_fn()
           ^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 2505, in future
    result = get_result()
             ^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 2313, in load_fn
    future.result()
  File "/data_malta3_ssd/miniforge3/envs/pt23/lib/python3.11/concurrent/futures/_base.py", line 456, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/miniforge3/envs/pt23/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/data_malta3_ssd/miniforge3/envs/pt23/lib/python3.11/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data_malta3_ssd/pytorch/torch/_inductor/codecache.py", line 2342, in _worker_compile_cpp
    cpp_builder.build()
  File "/data_malta3_ssd/pytorch/torch/_inductor/cpp_builder.py", line 1687, in build
    run_compile_cmd(build_cmd, cwd=_build_tmp_dir)
  File "/data_malta3_ssd/pytorch/torch/_inductor/cpp_builder.py", line 358, in run_compile_cmd
    _run_compile_cmd(cmd_line, cwd)
  File "/data_malta3_ssd/pytorch/torch/_inductor/cpp_builder.py", line 353, in _run_compile_cmd
    raise exc.CppCompileError(cmd, output) from e
torch._inductor.exc.InductorError: CppCompileError: C++ compile error

Command:
g++ /tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX2 -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -fopenmp -I/data_malta3_ssd/miniforge3/envs/pt23/include/python3.11 -I/data_malta3_ssd/pytorch/torch/include -I/data_malta3_ssd/pytorch/torch/include/torch/csrc/api/include -mavx2 -mfma -mf16c -D_GLIBCXX_USE_CXX11_ABI=1 -o /tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.so -ltorch -ltorch_cpu -ltorch_python -lgomp -L/opt/ssd/miniforge3/envs/pt23/lib -L/data_malta3_ssd/pytorch/torch/lib

Output:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp: In lambda function:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:52:38: error: invalid cast from type ‘at::vec::CPU_CAPABILITY::Vectorized<float>’ to type ‘float’
   52 |                         auto tmp19 = float(tmp7 + tmp18);
      |                                      ^~~~~~~~~~~~~~~~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp: In lambda function:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:96:43: warning: self-comparison always evaluates to false [-Wtautological-compare]
   96 |                         auto tmp34 = tmp3 < tmp3;
      |                                      ~~~~ ^ ~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:122:38: error: invalid cast from type ‘at::vec::CPU_CAPABILITY::Vectorized<float>’ to type ‘float’
  122 |                         auto tmp43 = float(tmp33 + tmp42);
      |                                      ^~~~~~~~~~~~~~~~~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp: In lambda function:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:166:43: warning: self-comparison always evaluates to false [-Wtautological-compare]
  166 |                         auto tmp58 = tmp3 < tmp3;
      |                                      ~~~~ ^ ~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:192:38: error: invalid cast from type ‘at::vec::CPU_CAPABILITY::Vectorized<float>’ to type ‘float’
  192 |                         auto tmp67 = float(tmp57 + tmp66);
      |                                      ^~~~~~~~~~~~~~~~~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp: In lambda function:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:254:47: warning: self-comparison always evaluates to false [-Wtautological-compare]
  254 |                             auto tmp25 = tmp2 < tmp2;
      |                                          ~~~~ ^ ~~~~
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp: In lambda function:
/tmp/torchinductor_boh/i7/ci7asxgns7q3tcdqd7ptdfdnsn65pdggp5yb5mofl4qhemvc7lph.cpp:280:47: warning: self-comparison always evaluates to false [-Wtautological-compare]
  280 |                             auto tmp43 = tmp2 < tmp2;
      |                                          ~~~~ ^ ~~~~


To execute this test, run the following from the base repo dir:
    python test/functorch/test_control_flow.py AssociativeScanTests.test_associative_scan_compile_fail_reverse_True_compile_mode_compile_dynamic_shape_combine_mode_generic_cpu_autograd_True

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

----------------------------------------------------------------------
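
A minimal sketch of the failing configuration (CPU, dynamic shapes, combine_mode="generic", autograd), assuming the prototype torch._higher_order_ops API; the _run_test harness from this file is elided, so this is illustrative rather than the exact repro:

import torch
from torch._higher_order_ops.associative_scan import associative_scan

def add(a, b):
    return a + b

def fn(x):
    # Matches the failing parametrization: generic combine mode, reverse scan.
    return associative_scan(add, x, dim=0, reverse=True, combine_mode="generic")

x = torch.randn(3, 10, 2, requires_grad=True)  # CPU tensor, as in the test
compiled = torch.compile(fn, dynamic=True)  # dynamic shapes, as in compile_dynamic_shape
y = compiled(x)
# The CppCompileError above surfaces while Inductor compiles the backward graph:
y.sum().backward()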

        self, combine_mode, reverse, compile_mode, device, autograd
    ):
        x = torch.randn(3, 10, 2, device=device, requires_grad=autograd)
        kwargs = {
            "dim": 0,
            "reverse": reverse,
            "compile_mode": compile_mode,
            "combine_mode": combine_mode,
        }
        kwargs_fake = self._prepare_fake_kwargs(kwargs)
        results = self._run_test(
            model=AssociativeScanModels.Simple(**kwargs),
            model_fake=AssociativeScanModels.Simple(**kwargs_fake),
            inputs=x,
            autograd_param=None if not autograd else (x,),
        )

        if not reverse:
            results_torch = []
            for op_pt in [torch.cumsum, torch.cumprod]:
                results_torch.append(op_pt(x, 0))
            self.assertEqual(results, results_torch)

        # Jax Examples
        x = torch.arange(
            0, 4, device=device, dtype=torch.float32, requires_grad=autograd
        )
        kwargs = {
            "dim": 0,
            "reverse": reverse,
            "compile_mode": compile_mode,
            "combine_fn": get_scan_combine_fn("add", True),
            "combine_mode": combine_mode,
        }
        kwargs_fake = self._prepare_fake_kwargs(kwargs)
        result = self._run_test(
            model=AssociativeScanModels.CombineFn(**kwargs),
            model_fake=AssociativeScanModels.CombineFn(**kwargs_fake),
            inputs=x,
            autograd_param=None if not autograd else (x,),
        )

        if not reverse:
            results_torch = torch.tensor([0.0, 1.0, 3.0, 6.0], dtype=torch.float32)
        else:
            results_torch = torch.tensor([6.0, 6.0, 5.0, 3.0], dtype=torch.float32)

        self.assertEqual(result, results_torch)

    @unittest.skipIf(not SM70OrLater, "triton")
    @requires_cuda
    @parametrize("reverse", [False, True])