@@ -188,9 +188,8 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
188
188
elementwise_kernel_helper (f, policy);
189
189
} else { // if this block has a full `block_work_size` data to handle, use
190
190
// vectorized memory access
191
- constexpr auto optimal_vec_size = vec_size;
192
191
elementwise_kernel_helper (
193
- f, memory::policies::vectorized<optimal_vec_size , array_t , elems_per_thread<io_size>()>(data));
192
+ f, memory::policies::vectorized<vec_size , array_t , elems_per_thread<io_size>()>(data));
194
193
}
195
194
#endif // __CUDA_ARCH__ == 900 || __CUDA_ARCH__ == 1000
196
195
} else {
@@ -215,9 +214,8 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
215
214
elementwise_kernel_helper (f, policy);
216
215
} else { // if this block has a full `block_work_size` data to handle, use
217
216
// vectorized memory access
218
- constexpr auto optimal_vec_size = vec_size;
219
217
elementwise_kernel_helper (
220
- f, memory::policies::vectorized<optimal_vec_size , array_t , elems_per_thread<io_size>()>(data));
218
+ f, memory::policies::vectorized<vec_size , array_t , elems_per_thread<io_size>()>(data));
221
219
}
222
220
}
223
221
}
@@ -248,6 +246,8 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
248
246
} else { // if this block has a full `block_work_size` data to handle, use
249
247
// vectorized memory access
250
248
constexpr auto optimal_vec_size = calc_optimal_vec_size<vec_size, io_size>();
249
+ elementwise_kernel_helper (
250
+ f, memory::policies::vectorized<optimal_vec_size, array_t , elems_per_thread<io_size>()>(data));
251
251
}
252
252
}
253
253
#endif // USE_ROCM
0 commit comments