Add cusolver gesvdj and gesvdjBatched to the backend of torch.svd by xwang233 · Pull Request #48436 · pytorch/pytorch · GitHub

Add cusolver gesvdj and gesvdjBatched to the backend of torch.svd #48436

Closed
xwang233 wants to merge 60 commits

Changes from all commits

60 commits
625a32f
cusolver parallel stream launch macro
xwang233 Nov 25, 2020
3bc7a6e
fix unused test_inverse statements
xwang233 Nov 25, 2020
071e8ab
move pivot allocation outside of for loop
xwang233 Nov 25, 2020
8ca446d
test_inverse test batch_size = 2 (parallel cusolver path)
xwang233 Nov 25, 2020
86065fc
rename variable
xwang233 Nov 25, 2020
663b6f9
update macro
xwang233 Nov 26, 2020
99a5e0c
[WIP] temp
xwang233 Dec 1, 2020
bd48b35
Merge remote-tracking branch 'upstream/viable/strict' into cusolver-svd
xwang233 Dec 1, 2020
692d67e
cusolver gesvd impl done [benchmark slower than magma]
xwang233 Dec 4, 2020
76bf235
Merge remote-tracking branch 'upstream/viable/strict' into cusolver-svd
xwang233 Dec 4, 2020
a622eed
skip test_linalg.test_norm_extreme_values
xwang233 Dec 4, 2020
927d569
gesvdj+parallel stream [benchmark looks very good]
xwang233 Dec 4, 2020
7d31052
enable on CPU
xwang233 Dec 4, 2020
9c6c800
vt conj
xwang233 Dec 4, 2020
5bb94a5
lint
xwang233 Dec 5, 2020
8056f59
precision override for float32
xwang233 Dec 5, 2020
29f5c7d
gesvdjBatched
xwang233 Dec 5, 2020
8cbd708
remove gesvd code
xwang233 Dec 6, 2020
8a85768
refactor cusolver inverse heuristic code
xwang233 Dec 6, 2020
0fb0af9
Merge remote-tracking branch 'upstream/viable/strict' into cusolver-svd
xwang233 Dec 6, 2020
46a6027
remove gesvd code in cudasolver.{h,cpp}
xwang233 Dec 6, 2020
9c08a67
[doc] cusolver gesvdj and batched
xwang233 Dec 6, 2020
9249a56
test decorator
xwang233 Dec 7, 2020
4446145
Merge remote-tracking branch 'upstream/viable/strict' into cusolver-svd
xwang233 Dec 7, 2020
8b8c1d8
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 5, 2021
26f1e36
reword test skipping
xwang233 Jan 5, 2021
72edde6
add at::parallel_for for parallel stream launch
xwang233 Jan 5, 2021
7074ba8
Merge remote-tracking branch 'upstream/viable/strict' into cusolver-svd
xwang233 Jan 5, 2021
445ba77
add a cuda guard for `at::parallel_for`
xwang233 Jan 5, 2021
549738b
lint
xwang233 Jan 5, 2021
e6d9f09
revert at::parallel_for changes
xwang233 Jan 6, 2021
8860cfa
remove CUDA_PARALLEL_STREAM_LAUNCH
xwang233 Jan 6, 2021
478798b
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 9, 2021
7956abd
test decorators
xwang233 Jan 9, 2021
b57310a
transpose
xwang233 Jan 10, 2021
d06fbe2
[Action Required] wrong test: svd is not unique
xwang233 Jan 10, 2021
3bb3a88
precision override
xwang233 Jan 10, 2021
a3a45b1
doc change
xwang233 Jan 10, 2021
5bb8a9c
comments
xwang233 Jan 10, 2021
12bb8f3
semi colon
xwang233 Jan 10, 2021
c298256
comments
xwang233 Jan 10, 2021
7274295
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 10, 2021
ff2133e
lint
xwang233 Jan 11, 2021
932021b
comments
xwang233 Jan 11, 2021
697dca5
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 11, 2021
2f45e43
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 13, 2021
1c8cbd7
comments
xwang233 Jan 13, 2021
eedb710
comment svd non unique
xwang233 Jan 13, 2021
ca2ddd8
doc
xwang233 Jan 14, 2021
ffec626
test abs of singular vectors
xwang233 Jan 14, 2021
7cdb8e6
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 14, 2021
813ca13
fix "some" in gesvdj path
xwang233 Jan 14, 2021
00d260e
delete unused jobchar
xwang233 Jan 14, 2021
60b07f7
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 14, 2021
7d8f3ad
remove partial indexing for comparison
xwang233 Jan 20, 2021
bb31bf9
[doc] :math:
xwang233 Jan 20, 2021
f2efbaf
comment, column-major VT
xwang233 Jan 20, 2021
66ebe3b
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 20, 2021
a99df7d
Merge remote-tracking branch 'upstream/master' into cusolver-svd
xwang233 Jan 22, 2021
3075adc
try liblapack_static, may need `if cuda >= 10.1`
xwang233 Jan 22, 2021
1 change: 1 addition & 0 deletions aten/src/ATen/CMakeLists.txt
@@ -325,6 +325,7 @@ if(USE_CUDA AND NOT USE_ROCM)
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a
${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static
)
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
190 changes: 190 additions & 0 deletions aten/src/ATen/cuda/CUDASolver.cpp
@@ -145,6 +145,196 @@ void getrs<c10::complex<float>>(
info));
}


template<>
void gesvdj<float>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, float* A, int lda, float* S, float* U,
int ldu, float *V, int ldv, int *info, gesvdjInfo_t params
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, &lwork, params));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(float)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnSgesvdj(
handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
static_cast<float*>(dataPtr.get()),
lwork, info, params));
}

template<>
void gesvdj<double>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, double* A, int lda, double* S, double* U,
int ldu, double *V, int ldv, int *info, gesvdjInfo_t params
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, &lwork, params));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(double)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnDgesvdj(
handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
static_cast<double*>(dataPtr.get()),
lwork, info, params));
}

template<>
void gesvdj<c10::complex<float>>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, c10::complex<float>* A, int lda, float* S, c10::complex<float>* U,
int ldu, c10::complex<float> *V, int ldv, int *info, gesvdjInfo_t params
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnCgesvdj_bufferSize(
handle, jobz, econ, m, n,
reinterpret_cast<cuComplex*>(A),
lda, S,
reinterpret_cast<cuComplex*>(U),
ldu,
reinterpret_cast<cuComplex*>(V),
ldv, &lwork, params));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(cuComplex)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnCgesvdj(
handle, jobz, econ, m, n,
reinterpret_cast<cuComplex*>(A),
lda, S,
reinterpret_cast<cuComplex*>(U),
ldu,
reinterpret_cast<cuComplex*>(V),
ldv,
static_cast<cuComplex*>(dataPtr.get()),
lwork, info, params));
}

template<>
void gesvdj<c10::complex<double>>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, c10::complex<double>* A, int lda, double* S, c10::complex<double>* U,
int ldu, c10::complex<double> *V, int ldv, int *info, gesvdjInfo_t params
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnZgesvdj_bufferSize(
handle, jobz, econ, m, n,
reinterpret_cast<cuDoubleComplex*>(A),
lda, S,
reinterpret_cast<cuDoubleComplex*>(U),
ldu,
reinterpret_cast<cuDoubleComplex*>(V),
ldv, &lwork, params));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnZgesvdj(
handle, jobz, econ, m, n,
reinterpret_cast<cuDoubleComplex*>(A),
lda, S,
reinterpret_cast<cuDoubleComplex*>(U),
ldu,
reinterpret_cast<cuDoubleComplex*>(V),
ldv,
static_cast<cuDoubleComplex*>(dataPtr.get()),
lwork, info, params));
}


template<>
void gesvdjBatched<float>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float* A, int lda, float* S, float* U,
int ldu, float *V, int ldv, int *info, gesvdjInfo_t params, int batchSize
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnSgesvdjBatched_bufferSize(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, &lwork, params, batchSize));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(float)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnSgesvdjBatched(
handle, jobz, m, n, A, lda, S, U, ldu, V, ldv,
static_cast<float*>(dataPtr.get()),
lwork, info, params, batchSize));
}

template<>
void gesvdjBatched<double>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double* A, int lda, double* S, double* U,
int ldu, double *V, int ldv, int *info, gesvdjInfo_t params, int batchSize
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnDgesvdjBatched_bufferSize(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, &lwork, params, batchSize));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(double)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnDgesvdjBatched(
handle, jobz, m, n, A, lda, S, U, ldu, V, ldv,
static_cast<double*>(dataPtr.get()),
lwork, info, params, batchSize));
}

template<>
void gesvdjBatched<c10::complex<float>>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, c10::complex<float>* A, int lda, float* S, c10::complex<float>* U,
int ldu, c10::complex<float> *V, int ldv, int *info, gesvdjInfo_t params, int batchSize
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnCgesvdjBatched_bufferSize(
handle, jobz, m, n,
reinterpret_cast<cuComplex*>(A),
lda, S,
reinterpret_cast<cuComplex*>(U),
ldu,
reinterpret_cast<cuComplex*>(V),
ldv, &lwork, params, batchSize));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(cuComplex)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnCgesvdjBatched(
handle, jobz, m, n,
reinterpret_cast<cuComplex*>(A),
lda, S,
reinterpret_cast<cuComplex*>(U),
ldu,
reinterpret_cast<cuComplex*>(V),
ldv,
static_cast<cuComplex*>(dataPtr.get()),
lwork, info, params, batchSize));
}

template<>
void gesvdjBatched<c10::complex<double>>(
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, c10::complex<double>* A, int lda, double* S, c10::complex<double>* U,
int ldu, c10::complex<double> *V, int ldv, int *info, gesvdjInfo_t params, int batchSize
) {
int lwork;
TORCH_CUSOLVER_CHECK(cusolverDnZgesvdjBatched_bufferSize(
handle, jobz, m, n,
reinterpret_cast<cuDoubleComplex*>(A),
lda, S,
reinterpret_cast<cuDoubleComplex*>(U),
ldu,
reinterpret_cast<cuDoubleComplex*>(V),
ldv, &lwork, params, batchSize));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex)*lwork);

TORCH_CUSOLVER_CHECK(cusolverDnZgesvdjBatched(
handle, jobz, m, n,
reinterpret_cast<cuDoubleComplex*>(A),
lda, S,
reinterpret_cast<cuDoubleComplex*>(U),
ldu,
reinterpret_cast<cuDoubleComplex*>(V),
ldv,
static_cast<cuDoubleComplex*>(dataPtr.get()),
lwork, info, params, batchSize));
}

} // namespace solver
} // namespace cuda
} // namespace at
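
Each wrapper above follows cuSOLVER's two-phase calling convention: first query the required workspace size with the _bufferSize variant, then allocate that much scratch memory (here through the CUDA caching allocator) and run the actual decomposition. Below is a minimal standalone sketch of the same protocol against raw cuSOLVER, outside ATen; the matrix contents, sizes, and the error-check macro are illustrative only, and device-memory frees are omitted for brevity.

#include <cstdio>
#include <cuda_runtime.h>
#include <cusolverDn.h>

#define CHECK_CUSOLVER(expr)                                      \
  do {                                                            \
    cusolverStatus_t st = (expr);                                 \
    if (st != CUSOLVER_STATUS_SUCCESS) {                          \
      std::printf("cusolver error %d\n", static_cast<int>(st));   \
      return 1;                                                   \
    }                                                             \
  } while (0)

int main() {
  const int m = 3, n = 2;                  // one 3x2 matrix, column-major
  float hA[m * n] = {1, 2, 3, 4, 5, 6};

  cusolverDnHandle_t handle;
  CHECK_CUSOLVER(cusolverDnCreate(&handle));
  gesvdjInfo_t params;
  CHECK_CUSOLVER(cusolverDnCreateGesvdjInfo(&params));

  float *dA, *dS, *dU, *dV, *dWork;
  int *dInfo;
  cudaMalloc(&dA, sizeof(float) * m * n);
  cudaMalloc(&dS, sizeof(float) * n);      // min(m, n) singular values
  cudaMalloc(&dU, sizeof(float) * m * m);  // econ=0: full m x m U
  cudaMalloc(&dV, sizeof(float) * n * n);  // econ=0: full n x n V (not V^T)
  cudaMalloc(&dInfo, sizeof(int));
  cudaMemcpy(dA, hA, sizeof(float) * m * n, cudaMemcpyHostToDevice);

  // Phase 1: query the workspace size.
  int lwork = 0;
  CHECK_CUSOLVER(cusolverDnSgesvdj_bufferSize(
      handle, CUSOLVER_EIG_MODE_VECTOR, /*econ=*/0, m, n,
      dA, m, dS, dU, m, dV, n, &lwork, params));
  cudaMalloc(&dWork, sizeof(float) * lwork);

  // Phase 2: compute the SVD in the allocated workspace.
  CHECK_CUSOLVER(cusolverDnSgesvdj(
      handle, CUSOLVER_EIG_MODE_VECTOR, /*econ=*/0, m, n,
      dA, m, dS, dU, m, dV, n, dWork, lwork, dInfo, params));

  float hS[n];
  cudaMemcpy(hS, dS, sizeof(float) * n, cudaMemcpyDeviceToHost);
  std::printf("singular values: %f %f\n", hS[0], hS[1]);

  cusolverDnDestroyGesvdjInfo(params);
  cusolverDnDestroy(handle);
  return 0;
}
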
35 changes: 35 additions & 0 deletions aten/src/ATen/cuda/CUDASolver.h
@@ -42,6 +42,41 @@ template<>
void getrs<c10::complex<float>>(CUDASOLVER_GETRS_ARGTYPES(c10::complex<float>));


#define CUDASOLVER_GESVDJ_ARGTYPES(Dtype, Vtype) \
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, Dtype* A, int lda, Vtype* S, Dtype* U, \
int ldu, Dtype *V, int ldv, int *info, gesvdjInfo_t params

template<class Dtype, class Vtype>
void gesvdj(CUDASOLVER_GESVDJ_ARGTYPES(Dtype, Vtype)) {
TORCH_INTERNAL_ASSERT(false, "at::cuda::solver::gesvdj: not implemented for ", typeid(Dtype).name());
}
template<>
void gesvdj<float>(CUDASOLVER_GESVDJ_ARGTYPES(float, float));
template<>
void gesvdj<double>(CUDASOLVER_GESVDJ_ARGTYPES(double, double));
template<>
void gesvdj<c10::complex<float>>(CUDASOLVER_GESVDJ_ARGTYPES(c10::complex<float>, float));
template<>
void gesvdj<c10::complex<double>>(CUDASOLVER_GESVDJ_ARGTYPES(c10::complex<double>, double));


#define CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(Dtype, Vtype) \
cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, Dtype* A, int lda, Vtype* S, Dtype* U, \
int ldu, Dtype *V, int ldv, int *info, gesvdjInfo_t params, int batchSize

template<class Dtype, class Vtype>
void gesvdjBatched(CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(Dtype, Vtype)) {
TORCH_INTERNAL_ASSERT(false, "at::cuda::solver::gesvdj: not implemented for ", typeid(Dtype).name());
}
template<>
void gesvdjBatched<float>(CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(float, float));
template<>
void gesvdjBatched<double>(CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(double, double));
template<>
void gesvdjBatched<c10::complex<float>>(CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(c10::complex<float>, float));
template<>
void gesvdjBatched<c10::complex<double>>(CUDASOLVER_GESVDJ_BATCHED_ARGTYPES(c10::complex<double>, double));

} // namespace solver
} // namespace cuda
} // namespace at
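
The header uses a common dispatch idiom: a primary template that fails loudly via TORCH_INTERNAL_ASSERT, plus explicit specializations for the supported dtypes, so templated driver code can call gesvdj/gesvdjBatched by name and an unsupported dtype yields a clear runtime error naming the dtype. A minimal self-contained sketch of the same idiom, with illustrative names:

#include <cstdio>
#include <cstdlib>
#include <typeinfo>

// Primary template: reached only for dtypes without a specialization.
template <class T>
void scale(T* /*data*/, int /*n*/, T /*alpha*/) {
  std::fprintf(stderr, "scale: not implemented for %s\n", typeid(T).name());
  std::abort();
}

// Explicit specialization: the only supported dtype in this sketch.
template <>
void scale<float>(float* data, int n, float alpha) {
  for (int i = 0; i < n; ++i) data[i] *= alpha;
}

int main() {
  float x[3] = {1.f, 2.f, 3.f};
  scale(x, 3, 2.f);                             // resolves to the float specialization
  std::printf("%g %g %g\n", x[0], x[1], x[2]);  // prints: 2 4 6
  return 0;
}
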
49 changes: 15 additions & 34 deletions aten/src/ATen/native/LinearAlgebraUtils.h
@@ -239,7 +239,14 @@ static inline std::tuple<std::vector<int64_t>,
}

// Function to generate empty tensors of required size, strides and dtype for the SVD operation
static inline std::tuple<Tensor, Tensor, Tensor> _create_U_S_VT(const Tensor& input, bool some, bool compute_uv) {
static inline std::tuple<Tensor, Tensor, Tensor> _create_U_S_VT(const Tensor& input, bool some, bool compute_uv,
const bool svd_use_cusolver=false) {

// U, S, VT are initialized as empty tensors.
// For CPU LAPACK and GPU MAGMA backend, the tensors are initialized on CPU.
// For GPU cuSOLVER backend, the tensors are initialized on GPU.
const auto usvt_device = svd_use_cusolver ? at::kCUDA : at::kCPU;

auto sizes = input.sizes().vec();
int64_t m = input.size(-2), n = input.size(-1);

@@ -251,47 +258,21 @@ static inline std::tuple<Tensor, Tensor, Tensor> _create_U_S_VT(const Tensor& in
strides[input.dim() - 1] = m;
strides[input.dim() - 2] = 1;

Tensor U_empty;
if (!input.is_cuda()) {
U_empty = at::empty_strided(sizes, strides, input.options());
} else {
// NB: U_empty is an empty tensor created on the CPU intentionally, because magma_(d/s)gesdd
// (which is the driver routine for the divide and conquer SVD operation)
// takes in arrays on the CPU as input. This routine is a hybrid CPU-GPU routine that
// moves the inputs between devices internally.
U_empty = at::empty_strided(sizes, strides, input.options().device(at::kCPU));
Contributor review comment:

I think this comment was useful for understanding why we want to allocate these tensors on the CPU when we use MAGMA (at least, it was helpful to me when I first looked at this code), so it may be worth resurrecting it and putting it close to usvt_device = ... above.

}
Tensor U_empty = at::empty_strided(sizes, strides, input.options().device(usvt_device));
U_empty.zero_();

// VT should be a column-major or a batch of column-major matrices
sizes[input.dim() - 2] = n;
sizes[input.dim() - 1] = n;
strides = at::detail::defaultStrides(sizes);
strides[input.dim() - 1] = n;
strides[input.dim() - 2] = 1;
Tensor VT_empty;
if (!input.is_cuda()) {
VT_empty = at::empty_strided(sizes, strides, input.options());
} else {
// NB: VT_empty is an empty tensor created on the CPU intentionally, because magma_(d/s)gesdd
// (which is the driver routine for the divide and conquer SVD operation)
// takes in arrays on the CPU as input. This routine is a hybrid CPU-GPU routine that
// moves the inputs between devices internally.
VT_empty = at::empty_strided(sizes, strides, input.options().device(at::kCPU));
}
// VT should be a column-major or a batch of column-major matrices
Tensor VT_empty = at::zeros(sizes, input.options().device(usvt_device));
VT_empty.transpose_(-2, -1);
Contributor review comment:

I think the code is correct but the comment is wrong. Moreover, it contradicts the comment at line 264, which says // VT should be a column-major or a batch of column-major matrices.
at::zeros returns a row-major tensor, and transpose_ turns it into a column-major one, which is what LAPACK/MAGMA (and, I assume, cuSOLVER) expect; a small sketch after this diff verifies this.
See also #45821 (comment) for a more in-depth explanation on the subject.


sizes.pop_back();
sizes[input.dim() - 2] = std::min(m, n);
Tensor S_empty;
ScalarType dtype = toValueType(typeMetaToScalarType(input.dtype()));
if (!input.is_cuda()) {
S_empty = at::empty(sizes, input.options().dtype(dtype));
} else {
// NB: S_empty is an empty tensor created on the CPU intentionally, because magma_(d/s)gesdd
// (which is the driver routine for the divide and conquer SVD operation)
// takes in arrays on the CPU as input. This routine is a hybrid CPU-GPU routine that
// moves the inputs between devices internally.
S_empty = at::empty(sizes, input.options().dtype(dtype).device(at::kCPU));
}
Tensor S_empty = at::empty(sizes, input.options().dtype(dtype).device(usvt_device));

return std::tuple<Tensor, Tensor, Tensor>(U_empty, S_empty, VT_empty);
}

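
To make the reviewer's point concrete: at::zeros allocates a contiguous row-major tensor, and the in-place transpose_ swaps the last two strides, yielding the column-major layout that LAPACK-style backends expect. A small sketch, with an arbitrary 4x4 shape:

#include <ATen/ATen.h>
#include <iostream>

int main() {
  auto vt = at::zeros({4, 4}, at::kFloat);
  std::cout << vt.strides() << std::endl;        // [4, 1]: row-major
  vt.transpose_(-2, -1);
  std::cout << vt.strides() << std::endl;        // [1, 4]: column-major
  std::cout << vt.is_contiguous() << std::endl;  // 0: now a transposed view
  return 0;
}
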
10 changes: 9 additions & 1 deletion aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
@@ -2200,7 +2200,7 @@ AT_ERROR("svd: MAGMA library not found in "
#endif
}

std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda(const Tensor& self, bool some, bool compute_uv) {
std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda_legacy(const Tensor& self, bool some, bool compute_uv) {
std::vector<int64_t> infos(batchCount(self), 0);
int64_t m = self.size(-2), n = self.size(-1);
int64_t k = std::min(m, n);
@@ -2256,6 +2256,14 @@ std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda(const Tensor& self, bool som
return std::make_tuple(U_working_copy, S_working_copy, VT_working_copy);
}

std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda(const Tensor& self, bool some, bool compute_uv) {
#ifdef USE_CUSOLVER
return _svd_helper_cuda_lib(self, some, compute_uv);
#else
return _svd_helper_cuda_legacy(self, some, compute_uv);
#endif
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <typename scalar_t>
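
With this change, the backend is selected at build time: _svd_helper_cuda routes to the cuSOLVER gesvdj path when PyTorch is compiled with USE_CUSOLVER, and falls back to the MAGMA path otherwise. A hedged end-to-end sketch of the C++ surface this serves; shapes and tolerances are arbitrary, and note that torch.svd returns V (so A ≈ U diag(S) Vᵀ), not Vᵀ itself:

#include <ATen/ATen.h>
#include <tuple>

int main() {
  // A batch of eight 5x3 matrices on the GPU.
  auto a = at::randn({8, 5, 3}, at::TensorOptions().device(at::kCUDA));

  at::Tensor u, s, v;
  std::tie(u, s, v) = at::svd(a, /*some=*/true, /*compute_uv=*/true);

  // Reconstruct and check: a ~= u @ diag(s) @ v^T (float32 tolerances).
  auto recon = u.matmul(at::diag_embed(s)).matmul(v.transpose(-2, -1));
  TORCH_CHECK(at::allclose(a, recon, /*rtol=*/1e-4, /*atol=*/1e-5));
  return 0;
}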