pytorch/pytorch · Commit 0f4578c
Enable XPU path for FlexAttention
1 parent 19d8bba commit 0f4578c
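
As a hedged end-to-end sketch of what this commit enables: once the Inductor kernels below accept "xpu" tensors, FlexAttention can be compiled for Intel GPUs roughly as follows. The import path matches current PyTorch releases and the score_mod is arbitrary; both are illustrative assumptions, not part of this commit.

import torch
from torch.nn.attention.flex_attention import flex_attention

def halve_odd_diagonals(score, b, h, q_idx, kv_idx):
    # Arbitrary score_mod, just to exercise the generated kernel.
    return torch.where((q_idx + kv_idx) % 2 == 0, score, score * 0.5)

if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel GPU builds only
    q, k, v = (
        torch.randn(1, 8, 1024, 64, device="xpu", dtype=torch.float16)
        for _ in range(3)
    )
    compiled = torch.compile(flex_attention)
    out = compiled(q, k, v, score_mod=halve_odd_diagonals)
    print(out.shape)  # torch.Size([1, 8, 1024, 64])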

File tree: 3 files changed, +28 −9 lines

torch/_inductor/kernel/flex_attention.py

Lines changed: 20 additions & 7 deletions
@@ -687,6 +687,8 @@ class Mode(Enum):
     fwd = auto()
     bwd = auto()
 
+def _get_xpu_config(query, mode: Mode) -> Tuple[int, int, int, int]:
+    return (64, 64, 4, 3)
 
 def _get_rocm_config(query, mode: Mode) -> Tuple[int, int, int, int]:
     dtype = query.get_dtype()
@@ -770,18 +772,29 @@ def _get_nv_config(query, mode: Mode) -> Tuple[int, int, int, int]:
 
 
 def _get_default_config_fwd(query) -> Tuple[int, int, int, int]:
-    if torch.version.hip is None:
-        return _get_nv_config(query, mode=Mode.fwd)
+    device_type = query.device.type
+    if device_type == "cuda":
+        if torch.version.hip is None:
+            return _get_nv_config(query, mode=Mode.fwd)
+        else:
+            return _get_rocm_config(query, mode=Mode.fwd)
+    elif device_type == "xpu":
+        return _get_xpu_config(query, mode=Mode.fwd)
     else:
-        return _get_rocm_config(query, mode=Mode.fwd)
+        raise NotImplementedError(f"Unsupported device type: {device_type}")
 
 
 def _get_default_config_bwd(query) -> Tuple[int, int, int, int]:
-    if torch.version.hip is None:
-        return _get_nv_config(query, mode=Mode.bwd)
+    device_type = query.device.type
+    if device_type == "cuda":
+        if torch.version.hip is None:
+            return _get_nv_config(query, mode=Mode.bwd)
+        else:
+            return _get_rocm_config(query, mode=Mode.bwd)
+    elif device_type == "xpu":
+        return _get_xpu_config(query, mode=Mode.bwd)
     else:
-        return _get_rocm_config(query, mode=Mode.bwd)
-
+        raise NotImplementedError(f"Unsupported device type: {device_type}")
 
 def create_num_blocks_fake_generator(sparse_indices):
     # The idea here is that we need to create a real tensor with real data
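
Both default-config helpers now branch on query.device.type instead of relying on build flags alone: "cuda" covers NVIDIA and ROCm builds (separated by torch.version.hip), "xpu" picks the new single conservative tuple, and anything else raises. A quick, hedged illustration of how device types resolve at runtime; plain tensors stand in for the Inductor query node here, and the comments map each result to the branches above:

import torch

q = torch.randn(2, 8, 128, 64)           # CPU tensor
print(q.device.type)                      # "cpu"  -> would hit NotImplementedError
if torch.cuda.is_available():
    print(q.to("cuda").device.type)       # "cuda" on NVIDIA and ROCm builds alike;
                                          # torch.version.hip tells the two apart
if hasattr(torch, "xpu") and torch.xpu.is_available():
    print(q.to("xpu").device.type)        # "xpu"  -> _get_xpu_config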

torch/_inductor/kernel/flex_decoding.py

Lines changed: 7 additions & 2 deletions
@@ -298,7 +298,10 @@ def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, me
 
 
 def get_split_k(B: int, H: int, Mk: int) -> int:
-    num_SM = torch.cuda.get_device_properties("cuda").multi_processor_count
+    if torch.xpu.is_available():
+        num_SM = torch.xpu.get_device_properties("xpu").gpu_subslice_count
+    else:
+        num_SM = torch.cuda.get_device_properties("cuda").multi_processor_count
     bh = max(B * H, 1)  # NOTE: Handle B*h=0 case
     assert isinstance(bh, (int, sympy.Integer)), "B and H must be concrete integers"
     split_k = num_SM // bh * 2  # Each SM should at least get one block.
@@ -312,8 +315,10 @@ def get_split_k(B: int, H: int, Mk: int) -> int:
 def _get_decoding_default_config(key) -> Tuple[int, int, int]:
     dtype = key.get_dtype()
     head_dim = key.get_size()[-1]
-    sm_version = torch.cuda.get_device_capability()
     default_config = (64, 2, 1)
+    if key.get_device().type == "xpu":
+        return default_config
+    sm_version = torch.cuda.get_device_capability()
     if sm_version >= (9, 0):
         if head_dim > 128 and dtype == torch.float32:
             return default_config
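
Two things change here: get_split_k sizes split-k against the Intel GPU's subslice count instead of CUDA SMs, and _get_decoding_default_config returns the generic (64, 2, 1) early on XPU so the CUDA-only torch.cuda.get_device_capability() call is never reached. Below is a standalone, runnable sketch of the visible split-k arithmetic; the clamping that follows in get_split_k lies outside this hunk and is omitted, and the CPU fallback count is an assumption so the sketch runs anywhere:

from typing import Optional

import torch

def get_split_k_sketch(B: int, H: int, num_SM: Optional[int] = None) -> int:
    if num_SM is None:
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            # Intel GPUs report subslices where CUDA reports SMs.
            num_SM = torch.xpu.get_device_properties("xpu").gpu_subslice_count
        elif torch.cuda.is_available():
            num_SM = torch.cuda.get_device_properties("cuda").multi_processor_count
        else:
            num_SM = 32  # assumed fallback so this runs on CPU-only machines
    bh = max(B * H, 1)       # handle B*H == 0
    return num_SM // bh * 2  # each SM/subslice should get at least one block

print(get_split_k_sketch(B=2, H=8, num_SM=64))  # -> 8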

torch/_ops.py

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ def resolve_key(op: OperatorBase, k: DispatchKey):  # type: ignore[valid-type]
     DispatchKey.BackendSelect,
     DispatchKey.AutocastCPU,  # type: ignore[attr-defined]
     DispatchKey.AutocastCUDA,  # type: ignore[attr-defined]
+    DispatchKey.AutocastXPU,
 ]
 
 
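AutocastXPU joins AutocastCPU and AutocastCUDA in this dispatch-key list, so operator resolution treats XPU autocast the same way as the existing backends. A hedged usage sketch of the autocast context this unblocks, guarded so it is a no-op without an Intel GPU build:

import torch

if hasattr(torch, "xpu") and torch.xpu.is_available():
    x = torch.randn(8, 8, device="xpu")
    with torch.autocast(device_type="xpu", dtype=torch.float16):
        y = x @ x  # matmul autocasts to float16 inside the context
    print(y.dtype)  # torch.float16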

0 commit comments