[MPS] Fix large copy (#124635) · pytorch/pytorch@abf3f90 · GitHub

Commit abf3f90

malfet authored and pytorchmergebot committed
[MPS] Fix large copy (#124635)
By slicing `copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size:` into 2Gb chunks.

Add a regression test, but limit it to machines with 12Gb of RAM or more, and macOS 14+, as on macOS 13 an attempt to allocate a 4Gb tensor fails with:
```
/AppleInternal/Library/BuildRoots/c651a45f-806e-11ed-a221-7ef33c48bc85/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Types/MPSNDArray.mm:724: failed assertion `[MPSNDArray initWithDevice:descriptor:] Error: total bytes of NDArray > 2**32'
```
Fixes #124335

Pull Request resolved: #124635
Approved by: https://github.com/kulinseth
1 parent: 72a34ee · commit: abf3f90

2 files changed: +27 −5
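The diff below shows the committed fix. To illustrate just the chunk arithmetic it performs, here is a minimal, self-contained Python sketch; the helper name `chunk_copy_ranges` and the standalone form are illustrative only, not PyTorch code:

```python
# Illustrative sketch: mirrors the chunking arithmetic of the fix below.
MAX_COPY_SIZE = 0x80000000  # 2Gb, same constant as the committed fix

def chunk_copy_ranges(src_offset: int, dst_offset: int, length: int):
    """Yield (src_off, dst_off, size) tuples covering `length` bytes in <=2Gb chunks."""
    copied = 0
    while copied < length:
        size = min(MAX_COPY_SIZE, length - copied)
        yield (src_offset + copied, dst_offset + copied, size)
        copied += size

# A copy of 4Gb + 4 bytes is split into three blit commands:
ranges = list(chunk_copy_ranges(0, 0, 2**32 + 4))
assert [r[2] for r in ranges] == [0x80000000, 0x80000000, 4]
assert sum(r[2] for r in ranges) == 2**32 + 4  # chunks exactly cover the copy
```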
```diff
@@ -173,11 +173,22 @@ @interface MPSGraphExecutionDescriptor ()
   endKernelCoalescing();
   id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

-  [blitEncoder copyFromBuffer:srcBuffer
-                 sourceOffset:(NSUInteger)srcOffset
-                     toBuffer:dstBuffer
-            destinationOffset:(NSUInteger)dstOffset
-                         size:(NSUInteger)length];
+  // For some reason copyFromBuffer for 4Gb fails without returning an error
+  // See https://github.com/pytorch/pytorch/issues/124335
+  // Workaround by batching copy commands into 2Gb chunks
+  constexpr size_t max_copy_size = 0x80000000; // 2GB
+  size_t bytes_copied = 0;
+  size_t bytes_remains = length;
+  while (bytes_remains > 0) {
+    NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains);
+    [blitEncoder copyFromBuffer:srcBuffer
+                   sourceOffset:(NSUInteger)srcOffset + bytes_copied
+                       toBuffer:dstBuffer
+              destinationOffset:(NSUInteger)dstOffset + bytes_copied
+                           size:bytes_to_copy];
+    bytes_copied += bytes_to_copy;
+    bytes_remains -= bytes_to_copy;
+  }
   [blitEncoder endEncoding];

   // profilerId has a value only if copy profiling is enabled
```
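A note on the chunk size: per the commit message, `copyFromBuffer:` starts failing silently once a single blit reaches 4Gb (2**32 bytes), so the loop caps each command at `0x80000000` (2Gb), half the failing size. Since successive chunks cover disjoint, contiguous byte ranges, splitting the copy cannot change the result; a length-N copy simply becomes ceil(N / 2Gb) commands on the same blit encoder.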
```diff
@@ -1089,6 +1089,7 @@ def addDecorator(op, d) -> None:
     NNTestCase = NoTest  # noqa: F811

 product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]) or -1)
+total_memory = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]))

 # Determine whether to enable MPS memory leak check (uses same code as CUDA).
 TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1'
@@ -7013,6 +7014,16 @@ def compare_mm(m, n, k, dtype=torch.float):
         # Test bfloat16 mm
         compare_mm(1024, 1, 32769, torch.bfloat16)

+    @unittest.skipIf(total_memory < 12_000_000_000, "Needs at least 12Gb RAM to run the test")
+    @unittest.skipIf(product_version < 14.0, "Can't allocate 4Gb tensor on MacOS 13")
+    def test_copy_large(self):
+        """ Test that copy of 4Gb+ tensors works """
+        x = torch.ones((2**30 + 11,), dtype=torch.float32)
+        y = x.to(device="mps")
+        self.assertTrue(torch.all(y == torch.tensor(1.0, device="mps")))
+        del y
+        del x
+
     # Test flip
     def test_flip(self):
         def helper(shape, dims):
```
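For a quick standalone check of the behavior the new test guards, here is a sketch that mirrors `test_copy_large` above; it assumes an MPS-capable machine with enough free RAM and macOS 14+, and is not part of the test suite:

```python
# Hypothetical standalone repro: copies a ~4Gb tensor to MPS and verifies it
# round-trips. Before this fix, the copy completed without an error but
# produced wrong data (see issue #124335).
import torch

if torch.backends.mps.is_available():
    x = torch.ones((2**30 + 11,), dtype=torch.float32)  # just over 4Gb of float32
    y = x.to(device="mps")
    assert torch.equal(y.cpu(), x)
    del y, x
```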