move the fallback logic to inductor · pytorch/pytorch@4023e26

Commit 4023e26

jianyizh authored and pytorchmergebot committed

move the fallback logic to inductor

1 parent 3df9506 commit 4023e26

File tree

3 files changed: +22 -3 lines changed

torch/_decomp/decompositions.py

Lines changed: 0 additions & 2 deletions

@@ -1236,8 +1236,6 @@ def embedding_dense_backward(
     padding_idx: int,
     scale_grad_by_freq: bool,
 ):
-    if grad_output.is_xpu:
-        return NotImplemented
     computation_dtype, result_dtype = utils.elementwise_dtypes(
         grad_output, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
     )
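The guard removed here is the XPU early-return that previously lived in the shared decomposition; it reappears below in inductor's own table. For context, the core of this decomposition is a scatter-accumulate of output gradients into the weight gradient. The following is a minimal sketch of that computation, not PyTorch's actual implementation: embedding_dense_backward_sketch is a hypothetical helper, and it ignores the padding_idx and scale_grad_by_freq arguments.

import torch

def embedding_dense_backward_sketch(
    grad_output: torch.Tensor,  # shape: indices.shape + (embedding_dim,)
    indices: torch.Tensor,
    num_weights: int,
) -> torch.Tensor:
    grad_weight = grad_output.new_zeros((num_weights, grad_output.shape[-1]))
    # Rows of grad_output are accumulated into the rows of grad_weight
    # selected by indices; accumulate=True handles repeated indices.
    return grad_weight.index_put_(
        (indices.reshape(-1),),
        grad_output.reshape(-1, grad_output.shape[-1]),
        accumulate=True,
    )

# Illustrative check against autograd (padding_idx/scale_grad_by_freq unused):
emb = torch.nn.Embedding(10, 3)
idx = torch.tensor([0, 2, 2, 5])
emb(idx).sum().backward()
assert torch.allclose(
    emb.weight.grad, embedding_dense_backward_sketch(torch.ones(4, 3), idx, 10)
)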

torch/_inductor/decomposition.py

Lines changed: 19 additions & 0 deletions

@@ -20,6 +20,7 @@
 from torch._decomp.decompositions import (
     _grid_sampler_2d as decomp_grid_sampler_2d,
     _index_add,
+    embedding_dense_backward as decomp_embedding_dense_backward,
     pw_cast_for_opmath,
 )
 from torch._decomp.decompositions_for_rng import extra_random_decomps

@@ -110,6 +111,7 @@
     aten._softmax_backward_data,
     aten.clamp_max,
     aten.clamp_min,
+    aten.embedding_dense_backward,  # we fall back on xpu
     aten.index_add,  # we conditionally call this decomp
     aten.glu,  # inductor lowers this directly
     aten.select_scatter,  # need to be in the ATen graph in order for it to work with the re-inplacing pass

@@ -133,6 +135,23 @@ def register_decomposition(
     return decomp.register_decomposition(ops, decompositions)


+@register_decomposition([aten.embedding_dense_backward])
+def _embedding_dense_backward(
+    grad_output: torch.Tensor,
+    indices: torch.Tensor,
+    num_weights: int,
+    padding_idx: int,
+    scale_grad_by_freq: bool,
+):
+    if grad_output.is_xpu:
+        return NotImplemented
+    # decomp_func = decompositions.pop(op.overloads()[0], None)
+    # We can write a util function to update decomp table if we have more ops to fallback.
+    return decomp_embedding_dense_backward(
+        grad_output, indices, num_weights, padding_idx, scale_grad_by_freq
+    )
+
+
 # TODO: for now, inductor doesn't handle asserts
 # because the condition is symbol -> tensor in the graph.
 @register_decomposition([aten._assert_async.msg])
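Returning NotImplemented from a registered decomposition is the existing signal that the decomposition declines and the original ATen op should stay in the graph; the new inductor registration uses it to keep aten.embedding_dense_backward intact on XPU while delegating to the shared decomposition everywhere else. Below is a minimal sketch of how a decomposition table is consulted during tracing, using the existing make_fx and get_decompositions helpers; f, the tensor shapes, and the argument values are illustrative.

import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx

aten = torch.ops.aten
decomp_table = get_decompositions([aten.embedding_dense_backward])

def f(grad_output, indices):
    # num_weights=10, padding_idx=-1, scale_grad_by_freq=False
    return aten.embedding_dense_backward(grad_output, indices, 10, -1, False)

grad = torch.randn(4, 3)
idx = torch.tensor([0, 2, 2, 5])
# On CPU the decomposition applies, so the traced graph contains the
# decomposed ops; a decomposition that returns NotImplemented (as the
# inductor one above does on XPU) leaves the original op in the graph.
gm = make_fx(f, decomposition_table=decomp_table)(grad, idx)
print(gm.graph)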

torch/_inductor/lowering.py

Lines changed: 3 additions & 1 deletion

@@ -2612,7 +2612,9 @@ def is_aligned(x):
 make_fallback(aten._pdist_forward)  # Has decomp. Needs benchmarks
 make_fallback(aten.soft_margin_loss_backward, warn=False)  # py_impl?
 make_fallback(aten._fused_rms_norm, warn=False)  # (MPS-only and faster than decomp)
-make_fallback(aten.embedding_dense_backward, warn=False)  # (XPU-only and faster than decomp)
+make_fallback(
+    aten.embedding_dense_backward, warn=False
+)  # (XPU-only and faster than decomp)


 # 1.5) Easy or Impossible
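Together with the inductor decomposition above, this make_fallback registration gives inductor an eager lowering to land on when the decomposition returns NotImplemented: non-XPU inputs get decomposed, XPU inputs run the native kernel. A rough, hypothetical end-to-end sketch of code that would exercise this path through the backward pass (assuming a working torch.compile/inductor setup):

import torch

emb = torch.nn.Embedding(10, 3)

@torch.compile  # inductor is the default backend
def loss(x):
    return emb(x).sum()

# The backward graph contains aten.embedding_dense_backward; on CPU/CUDA
# it is decomposed, while on an XPU device the decomposition returns
# NotImplemented and inductor falls back to the eager kernel.
loss(torch.tensor([1, 2, 4])).backward()
print(emb.weight.grad.shape)  # torch.Size([10, 3])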
