[cuBLAS][cuBLASLt] Use cuBLAS default workspace size in Lt (#153556) · pytorch/pytorch@2b43d63 · GitHub

Commit 2b43d63

eqy authored and pytorchmergebot committed
[cuBLAS][cuBLASLt] Use cuBLAS default workspace size in Lt (#153556)
Also enables unified workspaces by default for non-FBCODE use cases. The default Lt workspace size is also updated to match the cuBLAS default logic, including for Blackwell (SM 10.0) and GeForce Blackwell (SM 12.0). Recommended defaults are documented here: https://docs.nvidia.com/cuda/cublas/#cublassetworkspace

Pull Request resolved: #153556
Approved by: https://github.com/Skylion007, https://github.com/ngimel
1 parent aeb734f commit 2b43d63

File tree

2 files changed: +17, -5 lines

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 11 additions & 2 deletions
@@ -194,9 +194,13 @@ static size_t _parseChosenWorkspaceSize() {
   }
   size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */
 #else
+#if defined(FBCODE_CAFFE2)
   size_t workspace_size = 1024; /* default size in KiB according to #73328 */
+#else
+  // default to CUBLAS_WORKSPACE_CONFIG workspace size
+  size_t workspace_size = at::cuda::getChosenWorkspaceSize() / 1024;
+#endif
 #endif
-
   if (val.has_value()) {
     try {
       workspace_size = std::stoi(val.value());

@@ -236,7 +240,12 @@ struct CublasLtWorkspace {
   CublasLtWorkspace() {
     size = _getWorkspaceSize();
 #ifndef USE_ROCM
-    static bool unified = c10::utils::check_env("TORCH_CUBLASLT_UNIFIED_WORKSPACE") == true;
+    constexpr auto envvar = "TORCH_CUBLASLT_UNIFIED_WORKSPACE";
+#if defined(FBCODE_CAFFE2)
+    static bool unified = c10::utils::check_env(envvar) == true;
+#else
+    static bool unified = c10::utils::has_env(envvar) ? c10::utils::check_env(envvar) == true : true;
+#endif
     if (unified) {
       auto cublasWorkspaceSize = at::cuda::getChosenWorkspaceSize();
       if (cublasWorkspaceSize < size) {
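The opt-out logic in the second hunk is easy to misread, so here is a minimal standalone sketch of the same decision using std::getenv instead of the c10::utils helpers. The truthy-value parsing below is an illustrative assumption, not the exact c10::utils::check_env behavior:

#include <cstdlib>
#include <cstring>
#include <iostream>

// Illustrative stand-in for c10::utils::check_env: treat "1"/"true"/"yes" as true.
// The real helper's accepted spellings may differ; this is an assumption.
static bool env_is_truthy(const char* value) {
  return std::strcmp(value, "1") == 0 || std::strcmp(value, "true") == 0 ||
         std::strcmp(value, "yes") == 0;
}

// Mirrors the non-FBCODE branch above: unified workspaces are ON unless the
// variable is present and set to a non-truthy value.
static bool use_unified_workspace() {
  const char* value = std::getenv("TORCH_CUBLASLT_UNIFIED_WORKSPACE");
  return value == nullptr ? true : env_is_truthy(value);
}

int main() {
  std::cout << "unified cuBLAS/cuBLASLt workspace: "
            << (use_unified_workspace() ? "enabled" : "disabled") << "\n";
  return 0;
}

In other words, after this change only an explicit opt-out disables the unified workspace outside FBCODE, whereas the FBCODE build keeps the old opt-in behavior.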

aten/src/ATen/cuda/CublasHandlePool.cpp

Lines changed: 6 additions & 3 deletions
@@ -127,10 +127,13 @@ size_t parseChosenWorkspaceSize() {
   const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
   const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
 #else
-  /* :4096:2:16:8 default, 32MiB for Hopper */
+  /* :4096:2:16:8 default, 32MiB for Hopper/Blackwell, 12MiB for GeForce Blackwell */
   cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
-  const bool sm90 = properties != nullptr && properties->major == 9 && properties->minor == 0;
-  const size_t default_size = sm90 ? 4096 * 8 * 1024 : 4096 * 1024 * 2 + 16 * 1024 * 8;
+  const bool sm90or100 = properties != nullptr && (properties->major == 9 || properties->major == 10) && properties->minor == 0;
+  const bool sm120 = properties != nullptr && properties->major == 12 && properties->minor == 0;
+  constexpr size_t sm90or100size = 32768 * 1024;
+  constexpr size_t sm120size = 12288 * 1024;
+  const size_t default_size = sm90or100 ? sm90or100size : sm120 ? sm120size : 4096 * 1024 * 2 + 16 * 1024 * 8;
 #endif
 
   if (val) {
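For reference, the fallback expression corresponds to the ":4096:2:16:8" CUBLAS_WORKSPACE_CONFIG notation (two 4096 KiB chunks plus eight 16 KiB chunks, i.e. 8320 KiB). A small sketch that reproduces the default-size selection for the compute capabilities touched by this diff; the arch list in main() is just for illustration:

#include <cstddef>
#include <cstdio>

// Reproduces the CUDA-side default-size selection from parseChosenWorkspaceSize()
// for a given (major, minor) compute capability. The ROCm/gfx94/gfx95 branch is
// omitted here.
static size_t default_workspace_bytes(int major, int minor) {
  const bool sm90or100 = (major == 9 || major == 10) && minor == 0;  // Hopper / Blackwell
  const bool sm120 = major == 12 && minor == 0;                      // GeForce Blackwell
  constexpr size_t sm90or100size = 32768 * 1024;                     // 32 MiB
  constexpr size_t sm120size = 12288 * 1024;                         // 12 MiB
  // ":4096:2:16:8" -> 2 x 4096 KiB + 8 x 16 KiB = 8320 KiB
  const size_t fallback = 4096 * 1024 * 2 + 16 * 1024 * 8;
  return sm90or100 ? sm90or100size : sm120 ? sm120size : fallback;
}

int main() {
  const int archs[][2] = {{8, 0}, {9, 0}, {10, 0}, {12, 0}};
  for (const auto& a : archs) {
    std::printf("sm_%d%d -> %zu KiB\n", a[0], a[1],
                default_workspace_bytes(a[0], a[1]) / 1024);
  }
  return 0;
}

Running it prints 8320 KiB for sm_80, 32768 KiB for sm_90 and sm_100, and 12288 KiB for sm_120, matching the cuBLAS recommendations cited in the commit message.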
