Enable `_lazy_clone` between CPU and MPS by kurtamohler · Pull Request #148408 · pytorch/pytorch · GitHub

Enable _lazy_clone between CPU and MPS #148408


Open: wants to merge 82 commits into base: gh/kurtamohler/32/base

Changes from 1 commit

Commits (82, all by kurtamohler)
e4a0cb8  Update  (Mar 4, 2025)
7c0f082  Update  (Mar 7, 2025)
501d68f  Update  (Mar 7, 2025)
3f9ca3c  Update  (Mar 7, 2025)
2a4340b  Update  (Mar 11, 2025)
778a0af  Update  (Mar 12, 2025)
8130be6  Update  (Mar 12, 2025)
e67656b  Update  (Mar 12, 2025)
3d169bd  Update  (Mar 13, 2025)
c48bfa0  Update  (Mar 17, 2025)
1826b54  Update  (Mar 17, 2025)
6fa7775  Update  (Mar 17, 2025)
cefe1c4  Update  (Mar 18, 2025)
02976b5  Update  (Mar 18, 2025)
02011e0  Update  (Mar 18, 2025)
3438700  Update  (Mar 18, 2025)
ed0f556  Update  (Mar 18, 2025)
bf1de02  Update  (Mar 18, 2025)
a74ae14  Update  (Mar 18, 2025)
eb66439  Update  (Mar 18, 2025)
764a186  Update  (Mar 19, 2025)
ae00427  Update  (Mar 19, 2025)
651e02a  Update  (Mar 21, 2025)
4f82e2f  Update  (Mar 24, 2025)
6b3ab6b  Update  (Mar 24, 2025)
2eeda3a  Update  (Mar 25, 2025)
fba676d  Update  (Mar 25, 2025)
027b63f  Update  (Mar 25, 2025)
534b116  Update  (Mar 25, 2025)
2c09346  Update  (Mar 25, 2025)
5cc9a0e  Update  (Mar 25, 2025)
8f33e76  Update  (Mar 26, 2025)
a2dfe1b  Update  (Mar 26, 2025)
83e3b26  Update  (Mar 26, 2025)
7d2ebbb  Update  (Mar 26, 2025)
b1f827d  Update  (Mar 26, 2025)
dcc6e97  Update  (Mar 26, 2025)
65986d1  Update  (Mar 26, 2025)
52ed388  Update  (Mar 26, 2025)
9eec7be  Update  (Mar 26, 2025)
e4b3f98  Update  (Mar 26, 2025)
56fb303  Update  (Mar 26, 2025)
8cb9ef8  Update  (Mar 26, 2025)
78ebf1a  Update  (Mar 27, 2025)
dcf885e  Update  (Mar 27, 2025)
5e39c32  Update  (Mar 27, 2025)
a5354d7  Update  (Mar 27, 2025)
749e37b  Update  (Mar 27, 2025)
bc0bda8  Update  (Mar 27, 2025)
d44aa02  Update  (Mar 27, 2025)
bf7be18  Update  (Apr 1, 2025)
9700e0f  Update  (Apr 2, 2025)
20269b1  Update  (Apr 2, 2025)
4fbe13f  Update  (Apr 2, 2025)
300b52d  Update  (Apr 2, 2025)
541e910  Update  (Apr 3, 2025)
3ca6e53  Update  (Apr 3, 2025)
e919203  Update  (Apr 4, 2025)
1c63790  Update  (Apr 4, 2025)
15ff7b3  Update  (Apr 4, 2025)
0e6cc83  Update  (Apr 4, 2025)
af8e76d  Update  (Apr 5, 2025)
a4993ea  Update  (Apr 7, 2025)
229632a  Update  (Apr 7, 2025)
7b38ca7  Update  (Apr 7, 2025)
bd89c07  Update  (Apr 8, 2025)
22081eb  Update  (Apr 25, 2025)
1e5de8a  Update  (Apr 28, 2025)
2f864f1  Update  (Apr 28, 2025)
c688b9c  Update  (Apr 28, 2025)
d5e4f89  Update  (May 1, 2025)
3b1a2c7  Update  (May 16, 2025)
74a35a4  Update  (May 21, 2025)
62ebaa3  Update  (May 21, 2025)
c6f7cf6  Update  (May 21, 2025)
da2602f  Update  (May 23, 2025)
529e78c  Update on "Enable `_lazy_clone` between CPU and MPS"  (May 24, 2025)
14e095a  Update on "Enable `_lazy_clone` between CPU and MPS"  (May 27, 2025)
af8d3ff  Update on "Enable `_lazy_clone` between CPU and MPS"  (May 28, 2025)
281f616  Update on "Enable `_lazy_clone` between CPU and MPS"  (May 28, 2025)
d472408  Update on "Enable `_lazy_clone` between CPU and MPS"  (May 31, 2025)
49f6711  Update  (Jul 18, 2025)
Viewing changes from commit 7c0f0820bd0f4ec98b7cb10a47e5a2c11bcc3f5d (kurtamohler, Mar 7, 2025):

Update
[ghstack-poisoned]
15 changes: 12 additions & 3 deletions c10/core/impl/COW.cpp
@@ -1,15 +1,19 @@
#include <c10/core/impl/COW.h>

#include <c10/core/Allocator.h>
#include <c10/core/DeviceGuard.h>
#include <c10/core/StorageImpl.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/COWDeleter.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
#include <c10/util/ParallelGuard.h>
#include <c10/util/UniqueVoidPtr.h>

#ifdef USE_CUDA
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#endif

#include <memory>
#include <optional>

@@ -121,9 +125,12 @@ c10::intrusive_ptr<StorageImpl> lazy_clone_storage(
c10::impl::cow::cow_deleter);
device_type = device.type();

#ifdef USE_CUDA
if (device_type == c10::kCUDA) {
allocator = c10::cuda::CUDACachingAllocator::get();
} else {
} else
#endif
{
allocator = c10::GetAllocator(device.type());
}

@@ -170,9 +177,11 @@ C10_API void materialize_cow_storage(StorageImpl& storage) {
// that the data will remain while we copy it.
new_data_ptr = storage.allocator()->clone(data_ptr.get(), storage.nbytes());
if (!devices_match) {
#ifdef USE_CUDA
if (storage.device().type() == c10::kCUDA) {
c10::cuda::device_synchronize();
}
#endif
}
}

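A note on the COW.cpp hunks above: the CUDA allocator lookup and the post-copy device_synchronize() call only exist when USE_CUDA is defined, and the `} else` of the allocator branch is split across the `#endif` so that a build without CUDA still compiles a plain fallback block around GetAllocator. A distilled, standalone sketch of that guard pattern (placeholder names, not PyTorch code):

// Sketch only (placeholder names): the compile-time guard pattern used above.
// With USE_CUDA defined, the braces form the else-arm of the device check;
// without it, they are just a plain block, so the fallback path compiles
// either way.
#include <cstdio>

enum class DeviceType { CPU, CUDA };

void pick_allocator(DeviceType device_type) {
  (void)device_type;  // unused when built without USE_CUDA
#ifdef USE_CUDA
  if (device_type == DeviceType::CUDA) {
    std::printf("use the CUDA caching allocator\n");
  } else
#endif
  {
    std::printf("use the generic allocator for this device\n");
  }
}

int main() {
  pick_allocator(DeviceType::CPU);
  return 0;
}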
16 changes: 10 additions & 6 deletions c10/test/core/impl/cow_test.cpp
@@ -45,7 +45,8 @@ class ContextTest : public testing::Test {
};

TEST_F(ContextTest, Basic) {
auto& context = *new cow::COWDeleterContext(new_delete_tracker());
Device device(c10::kCPU);
auto& context = *new cow::COWDeleterContext(new_delete_tracker(), device);
ASSERT_THAT(delete_count(), testing::Eq(0));

context.increment_refcount();
@@ -76,7 +77,8 @@

TEST_F(ContextTest, cow_deleter) {
// This is effectively the same thing as decrement_refcount() above.
auto& context = *new cow::COWDeleterContext(new_delete_tracker());
Device device(c10::kCPU);
auto& context = *new cow::COWDeleterContext(new_delete_tracker(), device);
ASSERT_THAT(delete_count(), testing::Eq(0));

cow::cow_deleter(&context);
@@ -150,14 +152,15 @@ TEST(lazy_clone_storage_test, already_copy_on_write) {
new std::byte[5],
+[](void* bytes) { delete[] static_cast<std::byte*>(bytes); });
void* data_ptr = data.get();
Device device(c10::kCPU);
StorageImpl original_storage(
{},
/*size_bytes=*/5,
at::DataPtr(
/*data=*/data_ptr,
/*ctx=*/new cow::COWDeleterContext(std::move(data)),
/*ctx=*/new cow::COWDeleterContext(std::move(data), device),
cow::cow_deleter,
Device(Device::Type::CPU)),
device),
/*allocator=*/nullptr,
/*resizable=*/false);

@@ -193,14 +196,15 @@ TEST(materialize_test, copy_on_write_single_reference) {
new std::byte[4],
+[](void* bytes) { delete[] static_cast<std::byte*>(bytes); });
void* data_ptr = data.get();
Device device(c10::kCPU);
StorageImpl storage(
{},
/*size_bytes=*/4,
at::DataPtr(
/*data=*/data_ptr,
/*ctx=*/new cow::COWDeleterContext(std::move(data)),
/*ctx=*/new cow::COWDeleterContext(std::move(data), device),
cow::cow_deleter,
Device(Device::Type::CPU)),
device),
/*allocator=*/nullptr,
/*resizable=*/false);

17 changes: 16 additions & 1 deletion test/test_torch.py
@@ -5342,8 +5342,23 @@ def test_lazy_clone_to_device(self, device):
('cpu', 'cuda:1'),
('cuda:1', 'cuda:0'),
('cuda:0', 'cuda:1'),
# TODO: Figure out why CUDA to CPU segfaults
# ('cuda', 'cpu'),
# NOTE: CUDA -> CPU doesn't work at the moment. Traceback shows that
# apparently the CPU Allocator's `clone` method doesn't know how to
# deal with a CUDA data pointer. May not need to solve this problem
# at the moment, because the whole point is MPS-CPU, not CUDA-CPU.
#
# Thread 1 "python" received signal SIGSEGV, Segmentation fault.
# __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:220
# warning: 220 ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such file or directory
# (gdb) bt
# #0 __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:220
# #1 0x00007ffff3d22e40 in c10::Allocator::clone (this=0x7ffff3df8010 <c10::g_cpu_alloc>,
# data=0x7ffc89000200, n=40)
# at /home/kurtamohler/develop/pytorch-0/c10/util/UniqueVoidPtr.h:61
# #2 0x00007ffff3d751cf in c10::impl::cow::materialize_cow_storage (storage=...)
# at /home/kurtamohler/develop/pytorch-0/c10/util/UniqueVoidPtr.h:61
#
]
for from_device, to_device in device_pairs:
from_device_check = torch.empty(0, device=from_device).device
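On the disabled ('cuda', 'cpu') pair: the NOTE and backtrace in the hunk above point at the CPU allocator's `clone`, which ultimately performs a host-side memcpy of the source bytes. A minimal standalone illustration of why that fails for device memory (hypothetical function name, not the c10 implementation):

// Sketch only: a host-only clone is fine for CPU storage, but if `src` is a
// CUDA device pointer, std::memcpy dereferences an address the host cannot
// read, matching the __memcpy_avx_unaligned segfault in the backtrace above.
// A cross-device materialization would need a device-aware copy (for example
// cudaMemcpy) plus synchronization instead of a plain memcpy.
#include <cstdlib>
#include <cstring>

void* naive_host_clone(const void* src, std::size_t nbytes) {
  void* dst = std::malloc(nbytes);
  if (dst != nullptr) {
    std::memcpy(dst, src, nbytes);  // assumes src is host-accessible memory
  }
  return dst;
}

int main() {
  int host_data[10] = {0};
  void* copy = naive_host_clone(host_data, sizeof(host_data));  // host memory: works
  std::free(copy);
  return 0;
}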