Add private API to support tensor lists: _foreach_add(TensorList tens… · pytorch/pytorch@e995c3d · GitHub


Commit e995c3d

izdeby authored and facebook-github-bot committed
Add private API to support tensor lists: _foreach_add(TensorList tensors, Scalar scalar) (#41554)
Summary: Initial PR for the Tensor List functionality. **Motivation** [GitHub issue](#38655) Current PyTorch optimizer implementations are not efficient in cases when we work with a lot of small feature tensors. Starting a lot of kernels slows down the whole process. We need to reduce the number of kernels that we start. As an example, we should be looking at [NVIDIAs Apex](https://github.com/NVIDIA/apex). In order to track progress, we will pick PyTorchs DCGAN model with Adam optimizer and once the optimizer is reimplemented with tensor lists, benchmark the model performance against original model version, Apexs version with original Adam optimizer and it’s FusedAdam optimizer. **In this PR** - Adding `multi_tensor_apply` mechanism which will help to efficiently apply passed functor on a given list of tensors on CUDA. - Adding a first private API - `std::vector<Tensor> _foreach_add(TensorList tensors, Scalar scalar)` **Tests** Tested via unit tests **Plan for the next PRs** 1. Cover these ops with `multi_tensor_apply` support - exponent - division - mul_ - add_ - addcmul_ - addcdiv_ - Sqrt 2. Rewrite PyTorch optimizers to use for-each operators in order to get performance gains. Pull Request resolved: #41554 Reviewed By: cpuhrsch Differential Revision: D22829724 Pulled By: izdeby fbshipit-source-id: 47febdbf7845cf931958a638567b7428a24782b1
1 parent a0695b3 commit e995c3d
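For context, here is a minimal usage sketch of the new private op introduced by this commit (not part of the diff below; `_foreach_add` is private and its surface may change):

```python
import torch

# A bunch of small tensors, e.g. per-parameter optimizer state.
tensors = [torch.zeros(3, 3) for _ in range(10)]

# Baseline: one kernel launch per tensor.
looped = [t.add(1) for t in tensors]

# New private op: one call for the whole list; on CUDA it takes the fused
# multi_tensor_apply path when check_fast_route allows it.
fused = torch._foreach_add(tensors, 1)

for a, b in zip(looped, fused):
    assert torch.equal(a, b)
```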

File tree

7 files changed: +365 -0 lines changed
Lines changed: 14 additions & 0 deletions

```cpp
#include <ATen/ATen.h>
namespace at { namespace native {

std::vector<Tensor> foreach_add_scalar_kernel_fallback(TensorList tensors, Scalar scalar) {
  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");

  std::vector<Tensor> result;
  for (int i = 0; i < tensors.size(); i++) {
    auto temp = tensors[i].add(scalar);
    result.emplace_back(temp);
  }
  return result;
}
}} // namespace at::native
```
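The CPU fallback above is simply a per-tensor loop; a rough Python equivalent, for illustration only (the helper name `foreach_add_fallback` is hypothetical, not part of this commit):

```python
def foreach_add_fallback(tensors, scalar):
    # Mirrors the fallback kernel: one ordinary add per tensor.
    if len(tensors) == 0:
        raise RuntimeError("Tensor list must have at least one tensor.")
    return [t.add(scalar) for t in tensors]
```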
Lines changed: 96 additions & 0 deletions

```cpp
#include <ATen/Dispatch.h>
#include <ATen/native/cuda/ForeachUtils.cuh>
#include <ATen/native/cuda/MultiTensorApply.cuh>

// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.

namespace at { namespace native {

namespace {

template<typename x_t, typename out_t>
struct AddScalarFunctor {
  __device__ void operator() (
    int chunk_size,
    TensorListMetadata<2>& tl,
    x_t scalar) {
      int tensor_loc = tl.block_to_tensor[blockIdx.x];
      int chunk_idx = tl.block_to_chunk[blockIdx.x];
      int n = tl.sizes[tensor_loc];

      x_t* x = (x_t*)tl.addresses[0][tensor_loc];
      x += chunk_idx * chunk_size;

      out_t* out = (out_t*)tl.addresses[1][tensor_loc];
      out += chunk_idx * chunk_size;

      n -= chunk_idx * chunk_size;

      x_t r_x[kILP];
      out_t r_out[kILP];

      // to make things simple, we put aligned case in a different code path
      if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) {
        for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) {
          // load
          load_store(r_x, x, 0, i_start);
#pragma unroll
          for(int ii = 0; ii < kILP; ii++) {
            r_out[ii] = static_cast<x_t>(r_x[ii]) + scalar;
          }
          // store
          load_store(out, r_out, i_start, 0);
        }
      }
      else {
        // Non-divergent exit condition for __syncthreads, not necessary here
        for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
#pragma unroll
          for(int ii = 0; ii < kILP; ii++) {
            r_x[ii] = 0;
            int i = i_start + threadIdx.x + ii * blockDim.x;
            if(i < n && i < chunk_size) {
              r_x[ii] = x[i];
            }
          }
#pragma unroll
          for(int ii = 0; ii < kILP; ii++) {
            r_out[ii] = static_cast<x_t>(r_x[ii]) + scalar;
          }
#pragma unroll
          for(int ii = 0; ii < kILP; ii++) {
            int i = i_start + threadIdx.x + ii * blockDim.x;
            if(i < n && i < chunk_size)
              out[i] = r_out[ii];
          }
        }
      }
  }
};

} // namespace

std::vector<Tensor> foreach_tensor_add_scalar_kernel_cuda(TensorList tensors, Scalar scalar) {
  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");

  if (!check_fast_route(tensors, scalar)) {
    return at::native::foreach_add_scalar_kernel_fallback(tensors, scalar);
  }

  std::vector<std::vector<at::Tensor>> tensor_lists;
  std::vector<at::Tensor> vec_res;
  for (const auto& t: tensors) {
    vec_res.emplace_back(at::native::empty_like(t));
  }

  tensor_lists.emplace_back(std::move(tensors.vec()));
  tensor_lists.emplace_back(std::move(vec_res));

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_tensor_add_scalar_kernel_cuda", [&]() {
    multi_tensor_apply<2>(tensor_lists, AddScalarFunctor<scalar_t, scalar_t>(), scalar.to<scalar_t>());
  });
  return tensor_lists[1];
}

}} // namespace at::native
```
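For intuition about the indexing in `AddScalarFunctor`: each CUDA block handles one chunk of one tensor, using `block_to_tensor` and `block_to_chunk` to locate its slice. A rough Python model of that per-block bookkeeping (illustration only; the helper name is hypothetical):

```python
def block_slice(sizes, block_to_tensor, block_to_chunk, block_idx, chunk_size):
    # Which tensor, and which chunk of it, this block processes.
    tensor_loc = block_to_tensor[block_idx]
    chunk_idx = block_to_chunk[block_idx]
    start = chunk_idx * chunk_size
    # 'n' in the functor: elements remaining from the start of this chunk.
    n = sizes[tensor_loc] - start
    return tensor_loc, start, min(n, chunk_size)
```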
Lines changed: 56 additions & 0 deletions

```cpp
#pragma once
#include <ATen/ATen.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/MemoryAccess.cuh>
namespace at {
namespace native {
namespace {

static constexpr int64_t kILP = 4;
static constexpr int64_t kChunkSize = 65536;
static constexpr int64_t kBlockSize = 512;

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (kILP * sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  using LT = at::native::memory::aligned_vector<T, kILP>;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

}

bool check_fast_route(TensorList tensors, Scalar scalar) {
  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
  auto expected_dtype = tensors[0].dtype();
  auto expected_device = tensors[0].device();

  for (auto t : tensors) {
    if (t.dtype() != expected_dtype) {
      return false;
    }

    if (t.device() != expected_device) {
      return false;
    }

    if (t.layout() != at::kStrided) {
      return false;
    }

    if (!t.is_non_overlapping_and_dense()) {
      return false;
    }

    if ((at::isIntegralType(t.scalar_type(), true) && scalar.isFloatingPoint()) ||
        t.scalar_type() == at::kBool) {
      return false;
    }
  }

  return true;
}
}} // at::native
```
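Read together, the conditions in `check_fast_route` require all tensors to share a dtype and device, be strided, non-overlapping and dense, and the scalar must not force type promotion (no floating-point scalar on integer tensors, and no bool tensors). A rough Python restatement, for illustration only (`is_contiguous()` is used here as a conservative stand-in for the non-overlapping-and-dense check):

```python
import torch

def would_take_fast_route(tensors, scalar):
    # Illustrative mirror of check_fast_route; the real check lives in C++.
    if len(tensors) == 0:
        raise RuntimeError("Tensor list must have at least one tensor.")
    expected_dtype, expected_device = tensors[0].dtype, tensors[0].device
    for t in tensors:
        if t.dtype != expected_dtype or t.device != expected_device:
            return False
        if t.layout != torch.strided or not t.is_contiguous():
            return False
        is_integral = not (t.dtype.is_floating_point or t.dtype.is_complex)
        if (is_integral and isinstance(scalar, float)) or t.dtype == torch.bool:
            return False
    return True
```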
Lines changed: 89 additions & 0 deletions

```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/native/cuda/ForeachUtils.cuh>
#include <c10/cuda/CUDAGuard.h>

namespace at { namespace native {

namespace {

// TensorListMetadata has to be < 4KB - the limit for kernel launch argument
static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

template<int n> struct TensorListMetadata
{
  void* addresses[n][depth_to_max_tensors[n-1]];
  int sizes[depth_to_max_tensors[n-1]];
  unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
  int block_to_chunk[depth_to_max_blocks[n-1]];
};

template<typename T, typename U, typename... ArgTypes>
C10_LAUNCH_BOUNDS_1(kBlockSize)
__global__ void
multi_tensor_apply_kernel(
  T tensorListMeta,
  U callable,
  ArgTypes... args) {
    // Hand the chunk information to the user-supplied functor to process however it likes.
    callable(kChunkSize, tensorListMeta, args...);
}

template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
  std::vector<std::vector<at::Tensor>>& tensor_lists,
  T callable,
  ArgTypes... args) {
    TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth.");
    const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));

    size_t n_tensors = tensor_lists[0].size();
    TensorListMetadata<depth> tensorListMeta;

    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for(size_t t = 0; t < n_tensors; t++) {
      tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
      for (int d = 0; d < depth; d++) {
        tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
      }
      loc_tensor_info++;

      int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize;
      for (int chunk = 0; chunk < chunks; chunk++) {
        tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
        tensorListMeta.block_to_chunk[loc_block_info] = chunk;
        loc_block_info++;

        bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
            chunk == chunks - 1);
        bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
        bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1);

        if (tensors_full || blocks_full || last_chunk) {
          multi_tensor_apply_kernel<<<loc_block_info, kBlockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
            tensorListMeta,
            callable,
            args...);

          AT_CUDA_CHECK(cudaGetLastError());

          // Reset.
          loc_block_info = 0;
          if(chunk == chunks - 1) {
            loc_tensor_info = 0;
          }
          else {
            tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1];
            for(int d = 0; d < depth; d++) {
              tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1];
            }
            loc_tensor_info = 1;
          }
        }
      }
    }
}
} // namespace
}} // at::native
```
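The host-side `multi_tensor_apply` above packs pointers and sizes into a fixed-size `TensorListMetadata` (kept under the ~4KB kernel-argument limit) and fires a kernel whenever the tensor slots or block slots fill up, or the last chunk is reached; a tensor whose chunks straddle a launch is carried into the next batch. A small Python model of just the chunk-counting and launch-batching logic (illustration only; constants copied from the headers above for depth 2):

```python
K_CHUNK_SIZE = 65536   # kChunkSize: elements handled per chunk / CUDA block
MAX_TENSORS = 64       # depth_to_max_tensors[1] for depth == 2
MAX_BLOCKS = 320       # depth_to_max_blocks[1]

def plan_launches(numels):
    """Return the number of blocks used by each kernel launch."""
    launches, blocks, tensors_in_meta = [], 0, 0
    for t, numel in enumerate(numels):
        tensors_in_meta += 1
        chunks = (numel + K_CHUNK_SIZE - 1) // K_CHUNK_SIZE
        for chunk in range(chunks):
            blocks += 1
            tensors_full = tensors_in_meta == MAX_TENSORS and chunk == chunks - 1
            blocks_full = blocks == MAX_BLOCKS
            last_chunk = t == len(numels) - 1 and chunk == chunks - 1
            if tensors_full or blocks_full or last_chunk:
                launches.append(blocks)
                blocks = 0
                # A partially processed tensor is carried into the next batch.
                tensors_in_meta = 0 if chunk == chunks - 1 else 1
    return launches

# 100 tensors of 1000 elements each -> 1 chunk per tensor,
# batched into launches of at most 64 tensors: [64, 36]
print(plan_launches([1000] * 100))
```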

aten/src/ATen/native/native_functions.yaml

Lines changed: 7 additions & 0 deletions

```diff
@@ -5410,6 +5410,13 @@
     CUDA: cat_out_cuda
     QuantizedCPU: cat_out_quantized_cpu
 
+- func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
+  device_guard: False
+  variants: function
+  dispatch:
+    CPU: foreach_add_scalar_kernel_fallback
+    CUDA: foreach_tensor_add_scalar_kernel_cuda
+
 - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor)
   use_c10_dispatcher: full
   dispatch:
```

test/run_test.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -38,6 +38,7 @@
     'distributed/test_distributed',
     'test_distributions',
     'test_expecttest',
+    'test_foreach',
     'test_indexing',
     'test_jit',
     'test_logging',
```

test/test_foreach.py

Lines changed: 102 additions & 0 deletions

```python
import torch
import torch.cuda
from torch.testing._internal.common_utils import TestCase, run_tests
from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes

class TestForeach(TestCase):
    @dtypes(*torch.testing.get_all_dtypes())
    def test_add_scalar_with_same_size_tensors(self, device, dtype):
        N = 20
        H = 20
        W = 20
        tensors = []
        for _ in range(N):
            tensors.append(torch.zeros(H, W, device=device, dtype=dtype))

        res = torch._foreach_add(tensors, 1)
        for t in res:
            if dtype == torch.bool:
                dtype = torch.int64
            self.assertEqual(t, torch.ones(H, W, device=device, dtype=dtype))

    @dtypes(*torch.testing.get_all_dtypes())
    def test_add_scalar_with_different_size_tensors(self, device, dtype):
        N = 20
        H = 20
        W = 20

        tensors = []
        size_change = 0
        for _ in range(N):
            tensors.append(torch.zeros(H + size_change, W + size_change, device=device, dtype=dtype))
            size_change += 1

        res = torch._foreach_add(tensors, 1)

        size_change = 0
        for t in res:
            if dtype == torch.bool:
                dtype = torch.int64
            self.assertEqual(t, torch.ones(H + size_change, W + size_change, device=device, dtype=dtype))
            size_change += 1

    @dtypes(*torch.testing.get_all_dtypes())
    def test_add_scalar_with_empty_list(self, device, dtype):
        tensors = []
        with self.assertRaises(RuntimeError):
            torch._foreach_add(tensors, 1)

    @dtypes(*torch.testing.get_all_dtypes())
    def test_add_scalar_with_overlapping_tensors(self, device, dtype):
        tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)]
        expected = [torch.tensor([[[2, 2, 2]], [[2, 2, 2]]], dtype=dtype, device=device)]

        if dtype == torch.bool:
            expected[0] = expected[0].to(torch.int64).add(1)

        res = torch._foreach_add(tensors, 1)
        self.assertEqual(res, expected)

    def test_add_scalar_with_different_tensor_dtypes(self, device):
        tensors = [torch.tensor([1], dtype=torch.float, device=device),
                   torch.tensor([1], dtype=torch.int, device=device)]

        expected = [torch.tensor([2], dtype=torch.float, device=device),
                    torch.tensor([2], dtype=torch.int, device=device)]

        res = torch._foreach_add(tensors, 1)
        self.assertEqual(res, expected)

    def test_add_scalar_with_different_scalar_type(self, device):
        # int tensor with float scalar
        # should go 'slow' route
        scalar = 1.1
        tensors = [torch.tensor([1], dtype=torch.int, device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([2.1], device=device)])

        # float tensor with int scalar
        # should go 'fast' route
        scalar = 1
        tensors = [torch.tensor([1.1], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([2.1], device=device)])

        # bool tensor with int scalar
        # should go 'slow' route
        scalar = 1
        tensors = [torch.tensor([False], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([1], device=device)])

        # bool tensor with float scalar
        # should go 'slow' route
        scalar = 1.1
        tensors = [torch.tensor([False], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([1.1], device=device)])

instantiate_device_type_tests(TestForeach, globals())

if __name__ == '__main__':
    run_tests()
```
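Since the test module calls `run_tests()` under `__main__` and is registered in `test/run_test.py` above, it can be run directly (e.g. `python test/test_foreach.py`) or as part of the regular test suite.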
