[MPS] Fix large copy (#124635) · pytorch/pytorch@abf3f90 · GitHub

Commit abf3f90

malfet authored and pytorchmergebot committed
[MPS] Fix large copy (#124635)
By slicing `copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size:` into 2Gb chunks.

Add a regression test, but limit it to machines with 12Gb of RAM or more, and macOS 14+, as on macOS 13 an attempt to allocate a 4Gb tensor fails with:
```
/AppleInternal/Library/BuildRoots/c651a45f-806e-11ed-a221-7ef33c48bc85/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Types/MPSNDArray.mm:724: failed assertion `[MPSNDArray initWithDevice:descriptor:] Error: total bytes of NDArray > 2**32'
```
Fixes #124335

Pull Request resolved: #124635
Approved by: https://github.com/kulinseth
1 parent: 72a34ee · commit: abf3f90

2 files changed: +27 −5
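The diff below shows the committed fix. To illustrate just the chunk arithmetic it performs, here is a minimal, self-contained Python sketch; the helper name `chunk_copy_ranges` and the standalone form are illustrative only, not PyTorch code:

```python
# Illustrative sketch: mirrors the chunking arithmetic of the fix below.
MAX_COPY_SIZE = 0x80000000  # 2Gb, same constant as the committed fix

def chunk_copy_ranges(src_offset: int, dst_offset: int, length: int):
    """Yield (src_off, dst_off, size) tuples covering `length` bytes in <=2Gb chunks."""
    copied = 0
    while copied < length:
        size = min(MAX_COPY_SIZE, length - copied)
        yield (src_offset + copied, dst_offset + copied, size)
        copied += size

# A copy of 4Gb + 4 bytes is split into three blit commands:
ranges = list(chunk_copy_ranges(0, 0, 2**32 + 4))
assert [r[2] for r in ranges] == [0x80000000, 0x80000000, 4]
assert sum(r[2] for r in ranges) == 2**32 + 4  # chunks exactly cover the copy
```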
```diff
@@ -173,11 +173,22 @@ @interface MPSGraphExecutionDescriptor ()
   endKernelCoalescing();
   id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

-  [blitEncoder copyFromBuffer:srcBuffer
-                 sourceOffset:(NSUInteger)srcOffset
-                     toBuffer:dstBuffer
-            destinationOffset:(NSUInteger)dstOffset
-                         size:(NSUInteger)length];
+  // For some reason copyFromBuffer for 4Gb fails without returning an error
+  // See https://github.com/pytorch/pytorch/issues/124335
+  // Workaround by batching copy commands into 2Gb chunks
+  constexpr size_t max_copy_size = 0x80000000; // 2GB
+  size_t bytes_copied = 0;
+  size_t bytes_remains = length;
+  while (bytes_remains > 0) {
+    NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains);
+    [blitEncoder copyFromBuffer:srcBuffer
+                   sourceOffset:(NSUInteger)srcOffset + bytes_copied
+                       toBuffer:dstBuffer
+              destinationOffset:(NSUInteger)dstOffset + bytes_copied
+                           size:bytes_to_copy];
+    bytes_copied += bytes_to_copy;
+    bytes_remains -= bytes_to_copy;
+  }
   [blitEncoder endEncoding];

   // profilerId has a value only if copy profiling is enabled
```
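A note on the chunk size: per the commit message, `copyFromBuffer:` starts failing silently once a single blit reaches 4Gb (2**32 bytes), so the loop caps each command at `0x80000000` (2Gb), half the failing size. Since successive chunks cover disjoint, contiguous byte ranges, splitting the copy cannot change the result; a length-N copy simply becomes ceil(N / 2Gb) commands on the same blit encoder.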
```diff
@@ -1089,6 +1089,7 @@ def addDecorator(op, d) -> None:
     NNTestCase = NoTest  # noqa: F811

 product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]) or -1)
+total_memory = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]))

 # Determine whether to enable MPS memory leak check (uses same code as CUDA).
 TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1'
@@ -7013,6 +7014,16 @@ def compare_mm(m, n, k, dtype=torch.float):
         # Test bfloat16 mm
         compare_mm(1024, 1, 32769, torch.bfloat16)

+    @unittest.skipIf(total_memory < 12_000_000_000, "Needs at least 12Gb RAM to run the test")
+    @unittest.skipIf(product_version < 14.0, "Can't allocate 4Gb tensor on MacOS 13")
+    def test_copy_large(self):
+        """ Test that copy of 4Gb+ tensors works """
+        x = torch.ones((2**30 + 11,), dtype=torch.float32)
+        y = x.to(device="mps")
+        self.assertTrue(torch.all(y == torch.tensor(1.0, device="mps")))
+        del y
+        del x
+
     # Test flip
     def test_flip(self):
         def helper(shape, dims):
```
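For a quick standalone check of the behavior the new test guards, here is a sketch that mirrors `test_copy_large` above; it assumes an MPS-capable machine with enough free RAM and macOS 14+, and is not part of the test suite:

```python
# Hypothetical standalone repro: copies a ~4Gb tensor to MPS and verifies it
# round-trips. Before this fix, the copy completed without an error but
# produced wrong data (see issue #124335).
import torch

if torch.backends.mps.is_available():
    x = torch.ones((2**30 + 11,), dtype=torch.float32)  # just over 4Gb of float32
    y = x.to(device="mps")
    assert torch.equal(y.cpu(), x)
    del y, x
```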