File tree Expand file tree Collapse file tree 1 file changed +4
-23
lines changed Expand file tree Collapse file tree 1 file changed +4
-23
lines changed Original file line number Diff line number Diff line change @@ -218,22 +218,6 @@ dot_with_fp32_arith_main_loop_no_bfdot(
218
218
return reduce (sum);
219
219
}
220
220
221
- template <typename T>
222
- struct half_to_float16 {
223
- using type = T;
224
- };
225
-
226
-
227
- #ifdef __aarch64__
228
- template <>
229
- struct half_to_float16 <Half> {
230
- using type = float16_t ;
231
- };
232
- #endif
233
-
234
- template <typename T>
235
- using half_to_float16_t = typename half_to_float16<T>::type;
236
-
237
221
static_assert (
238
222
(vec::Vectorized<Half>::size() & (vec::Vectorized<Half>::size() - 1)) == 0,
239
223
"Below code expects power-of-2 vector register size!");
@@ -258,13 +242,10 @@ static_assert(
258
242
\
259
243
/* Second-tier tail fixup: handle all workloads. */ \
260
244
for (int j = len_aligned_vec; j < len; ++j) { \
261
- /* We use half_to_float16_t here because changing to Half was */ \
262
- /* causing arithmetic to at fp16 precision, but the necessary */ \
263
- /* necessary behavior to pass python test/test_mps.py -k */ \
264
- /* test_output_grad_match_nn_functional_linear_cpu_float16 is */ \
265
- /* fp32. (I'm not sure exactly why this fixes it.) */ \
266
- half_to_float16_t <std::decay_t <decltype (vec1[j])>> x1 = vec1[j]; \
267
- half_to_float16_t <std::decay_t <decltype (vec2[j])>> x2 = vec2[j]; \
245
+ /* Attempting to use Half here caused multiple test failures; */ \
246
+ /* using float to unbreak. (Suspect we need a scalar FMA.) */ \
247
+ float x1 = vec1[j]; \
248
+ float x2 = vec2[j]; \
268
249
reduced_sum += x1 * x2; \
269
250
} \
270
251
return reduced_sum
You can’t perform that action at this time.
0 commit comments