[Inductor][CPP] Enable vectorized fp8 E5M2 quant dequant · pytorch/pytorch@f81e66f · GitHub

Commit f81e66f

[Inductor][CPP] Enable vectorized fp8 E5M2 quant dequant
ghstack-source-id: 2249ca6
Pull Request resolved: #153365
1 parent 2db230c commit f81e66f

File tree

4 files changed: +35 −2 lines changed


aten/src/ATen/cpu/vec/vec512/vec512_convert.h

Lines changed: 20 additions & 0 deletions
@@ -311,6 +311,26 @@ struct VecConvert<float, 1, Float8_e4m3fn, 1> {
   }
 };

+template <>
+struct VecConvert<Float8_e5m2, 1, float, 1> {
+  static inline VectorizedN<Float8_e5m2, 1> apply(const VectorizedN<float, 1>& src_n) {
+    at::vec::Vectorized<float> src = src_n[0];
+    __m128i res128 = cvtfp32_fp8e5m2(src);
+    return at::vec::Vectorized<Float8_e5m2>(_mm512_castsi128_si512(res128));
+  }
+};
+
+template <>
+struct VecConvert<float, 1, Float8_e5m2, 1> {
+  static inline VectorizedN<float, 1> apply(const VectorizedN<Float8_e5m2, 1>& src_n) {
+    // convert the first 16 8-bit Float8_e5m2 values to float
+    at::vec::Vectorized<Float8_e5m2> src = src_n[0];
+    __m512 result;
+    cvtfp8e5m2_fp32(_mm512_castsi512_si128(src), result);
+    return at::vec::Vectorized<float>(result);
+  }
+};
+
 #endif

 } // namespace CPU_CAPABILITY
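
For orientation, a minimal eager-mode sketch of the round trip these two AVX512 specializations vectorize; the shape and the print are illustrative assumptions, not taken from this commit:

import torch

# Eager-mode equivalent of the new fp32 <-> fp8 E5M2 conversions
# (cvtfp32_fp8e5m2 / cvtfp8e5m2_fp32 are their vectorized counterparts).
x = torch.randn(1024, dtype=torch.float32)
q = x.to(torch.float8_e5m2)   # fp32 -> fp8 E5M2
d = q.to(torch.float32)       # fp8 E5M2 -> fp32
# E5M2 keeps only 2 mantissa bits, so the round trip is deliberately lossy.
print((x - d).abs().max())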

test/inductor/test_cpu_repro.py

Lines changed: 11 additions & 2 deletions
@@ -1418,10 +1418,15 @@ def fn(
         use_quant_list = [False, True]
         use_tensor_overload_list = [False, True]

-        assert dtype in [torch.uint8, torch.int8, torch.float8_e4m3fn]
+        assert dtype in [
+            torch.uint8,
+            torch.int8,
+            torch.float8_e4m3fn,
+            torch.float8_e5m2,
+        ]
         quant_min = 0 if dtype == torch.uint8 else -128
         quant_max = 255 if dtype == torch.uint8 else 127
-        if dtype == torch.float8_e4m3fn:
+        if dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
             quant_min = int(torch.finfo(dtype).min)
             quant_max = int(torch.finfo(dtype).max)
             use_tensor_overload_list = [
@@ -1486,6 +1491,10 @@ def test_dequant_quant_lowering_int8(self):
     def test_dequant_quant_lowering_fp8_e4m3(self):
         self._test_dequant_quant_lowering_helper(torch.float8_e4m3fn)

+    @requires_vectorization
+    def test_dequant_quant_lowering_fp8_e5m2(self):
+        self._test_dequant_quant_lowering_helper(torch.float8_e5m2)
+
     def _test_dequant_maxpool2d_lowering_helper(self, dtype):
         def fn(x, scale, zero_point, quant_min, quant_max, dtype):
             x = torch.ops.quantized_decomposed.dequantize_per_tensor(
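
A sketch of the pattern the new test compiles; the scale, zero_point, and shape are illustrative assumptions, while the actual helper _test_dequant_quant_lowering_helper sweeps many more configurations:

import torch

qmin = int(torch.finfo(torch.float8_e5m2).min)  # -57344
qmax = int(torch.finfo(torch.float8_e5m2).max)  # 57344

def fn(x):
    x = torch.ops.quantized_decomposed.dequantize_per_tensor(
        x, 0.01, 0, qmin, qmax, torch.float8_e5m2
    )
    x = torch.relu(x)
    return torch.ops.quantized_decomposed.quantize_per_tensor(
        x, 0.01, 0, qmin, qmax, torch.float8_e5m2
    )

x = torch.randn(8, 16).to(torch.float8_e5m2)
out = torch.compile(fn)(x)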

torch/_inductor/codegen/cpp.py

Lines changed: 2 additions & 0 deletions
@@ -155,6 +155,7 @@ def get_export_declaration():
     torch.int32,
     torch.int64,
     torch.float8_e4m3fn,
+    torch.float8_e5m2,
 ]

 MASKED_VECTORIZABLE_DTYPES: list[torch.dtype] = [
@@ -1609,6 +1610,7 @@ def to_dtype(x, dtype, src_dtype=None, use_compute_dtypes=True):
         torch.int32,
         torch.int64,
         torch.float8_e4m3fn,
+        torch.float8_e5m2,
     ], f"{__name__} does not support {dtype}"
     assert isinstance(x, CppCSEVariable)
     src_dtype = x.dtype
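
With torch.float8_e5m2 added to both dtype allowlists, one hedged way to confirm the CPP backend vectorizes an E5M2 cast is to inspect the generated C++. run_and_get_code is an internal helper and may change, and "at::vec" appearing only in vectorized kernels is an assumption about current codegen:

import torch
from torch._inductor.utils import run_and_get_code

def cast_roundtrip(x):
    return x.to(torch.float8_e5m2).to(torch.float32)

x = torch.randn(1024)
_, codes = run_and_get_code(torch.compile(cast_roundtrip), x)
# Vectorized CPP kernels reference at::vec; scalar fallbacks do not.
print(any("at::vec" in code for code in codes))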

torch/csrc/inductor/cpp_wrapper/common.h

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 // Include some often-used cpp_wrapper headers, for precompiling.
 #include <c10/util/BFloat16.h>
 #include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e5m2.h>
 #include <torch/csrc/Device.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/utils/pythoncapi_compat.h>
@@ -72,6 +73,7 @@ using namespace torch::aot_inductor;
 using half = at::Half;
 using bfloat16 = at::BFloat16;
 using float8_e4m3fn = at::Float8_e4m3fn;
+using float8_e5m2 = at::Float8_e5m2;

 // Round up to the nearest multiple of 64
 [[maybe_unused]] inline int64_t align(int64_t nbytes) {
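
The include and alias matter for cpp_wrapper builds, where the generated C++ wrapper must be able to spell the dtype. A minimal sketch, assuming the internal cpp_wrapper config flag keeps its current name:

import torch
import torch._inductor.config as inductor_config

# Emit the wrapper itself as C++, which pulls in common.h above.
inductor_config.cpp_wrapper = True

def fn(x):
    return x.to(torch.float8_e5m2).to(torch.float32) + 1.0

out = torch.compile(fn)(torch.randn(64))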
