Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead · pytorch/pytorch@1fc010a

Commit 1fc010a

guangyey authored and pytorchmergebot committed
Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165)
Pull Request resolved: #156165
Approved by: https://github.com/albanD
ghstack dependencies: #149601, #157908, #150312
1 parent dfacf11 commit 1fc010a

File tree

5 files changed: +44 -32 lines changed

  aten/src/ATen/cuda/CachingHostAllocator.cpp
  c10/cuda/CUDAAllocatorConfig.h
  c10/cuda/CUDACachingAllocator.cpp
  c10/xpu/XPUCachingAllocator.cpp
  torch/csrc/cuda/Module.cpp

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 1 deletion

@@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
   }

   bool pinned_use_background_threads() override {
-    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
         pinned_use_background_threads();
   }
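For downstream callers making the same migration, a minimal sketch of how the relocated accessor is reached through the device-generic header; the wrapper function below is illustrative and not part of this diff:

#include <c10/core/AllocatorConfig.h>

// Illustrative helper: query whether pinned host memory should be freed on
// background threads via the device-generic config, mirroring the change in
// CachingHostAllocator.cpp above.
static bool use_background_threads_for_pinned_memory() {
  return c10::CachingAllocator::AcceleratorAllocatorConfig::
      pinned_use_background_threads();
}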

c10/cuda/CUDAAllocatorConfig.h

Lines changed: 15 additions & 4 deletions

@@ -3,6 +3,7 @@
 #include <c10/core/AllocatorConfig.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAMacros.h>
+#include <c10/util/Deprecated.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>

@@ -17,9 +18,13 @@ enum class Expandable_Segments_Handle_Type : int {
 // Environment config parser
 class C10_CUDA_API CUDAAllocatorConfig {
  public:
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
   static size_t max_split_size() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
   }
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
   static double garbage_collection_threshold() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         garbage_collection_threshold();

@@ -60,6 +65,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_pinned_num_register_threads;
   }

+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
   static bool pinned_use_background_threads() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         pinned_use_background_threads();

@@ -72,25 +79,29 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return 128;
   }

-  // This is used to round-up allocation size to nearest power of 2 divisions.
-  // More description below in function roundup_power2_next_division
-  // As an example, if we want 4 divisions between 2's power, this can be done
-  // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
   static size_t roundup_power2_divisions(size_t size) {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         roundup_power2_divisions(size);
   }

+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
   static std::vector<size_t> roundup_power2_divisions() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         roundup_power2_divisions();
   }

+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.")
   static size_t max_non_split_rounding_size() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         max_non_split_rounding_size();
   }

+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
   static std::string last_allocator_settings() {
     return c10::CachingAllocator::getAllocatorSettings();
   }
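Every shim added in this header has the same shape: the old static accessor stays callable but is annotated so the compiler warns, and its body simply forwards to the device-generic class. A minimal self-contained sketch of that pattern, with illustrative names that are not part of the PyTorch API:

#include <cstddef>

// NewConfig stands in for AcceleratorAllocatorConfig; OldConfig stands in for
// CUDAAllocatorConfig. Both names are illustrative only.
struct NewConfig {
  static size_t max_split_size() { return 4096; }
};

struct OldConfig {
  // Callers keep compiling but get a deprecation warning that points at the
  // replacement, which is what C10_DEPRECATED_MESSAGE does above.
  [[deprecated("OldConfig::max_split_size() is deprecated. Please use NewConfig::max_split_size() instead.")]]
  static size_t max_split_size() { return NewConfig::max_split_size(); }
};

Calling OldConfig::max_split_size() still returns the forwarded value, so behavior is unchanged; the only visible effect is a deprecation diagnostic (for example -Wdeprecated-declarations on GCC/Clang).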

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 25 additions & 22 deletions

@@ -1225,7 +1225,7 @@ class DeviceCachingAllocator {
   DeviceCachingAllocator()
       : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
     stats.max_split_size =
-        static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
+        static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
     context_recorder_.store(nullptr);
   }

@@ -1350,7 +1350,8 @@ class DeviceCachingAllocator {
     // Do garbage collection if the flag is set.
     if (C10_UNLIKELY(
             set_fraction &&
-            CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+            AcceleratorAllocatorConfig::garbage_collection_threshold() >
+            0.0)) {
       garbage_collect_cached_blocks(context);
     }
     // Attempt allocate

@@ -1602,7 +1603,7 @@ class DeviceCachingAllocator {
       stats.active_bytes[stat_type].increase(block->size);
       stats.requested_bytes[stat_type].increase(block->requested_size);
     });
-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_allocations.increase(1);

     auto allocated_bytes_gauge =

@@ -1653,7 +1654,7 @@ class DeviceCachingAllocator {
         block->pool->owner_MempoolId(),
         context ? context : block->context_when_allocated);

-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_allocations.decrease(1);

     if (!block->stream_uses.empty()) {

@@ -2202,7 +2203,8 @@ class DeviceCachingAllocator {
     if (size < kMinBlockSize) {
       return kMinBlockSize;
     } else {
-      auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
+      auto divisions =
+          AcceleratorAllocatorConfig::roundup_power2_divisions(size);
       if (divisions > 1 && size > (kMinBlockSize * divisions)) {
         return roundup_power2_next_division(size, divisions);
       } else {
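The hunk above passes the configured division count to roundup_power2_next_division, whose intent is described by the comment removed from CUDAAllocatorConfig.h: with PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4, request sizes are rounded up to one of four equally spaced points between neighboring powers of two. A small illustrative sketch of that rounding rule (not the actual PyTorch implementation):

#include <cstddef>

// Illustrative only: round `size` up to the next of `divisions` equally
// spaced points between the surrounding powers of two.
static size_t roundup_power2_division_sketch(size_t size, size_t divisions) {
  if (size <= 1 || divisions <= 1) {
    return size;
  }
  size_t floor_pow2 = 1;
  while (floor_pow2 <= size / 2) {
    floor_pow2 *= 2; // largest power of two <= size
  }
  if (floor_pow2 == size) {
    return size; // already a power of two
  }
  size_t step = floor_pow2 / divisions; // width of one division
  if (step == 0) {
    return floor_pow2 * 2; // interval too narrow, jump to the next power of two
  }
  return ((size + step - 1) / step) * step; // round up to the next division point
}

// Example: with divisions = 4, a request of 1363149 bytes (~1.3 MiB) falls
// between 1 MiB and 2 MiB and rounds up to 1572864 bytes (1.5 MiB).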
@@ -2692,7 +2694,7 @@ class DeviceCachingAllocator {
     if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
       return remaining >= kMinBlockSize;
     } else {
-      return (size < CUDAAllocatorConfig::max_split_size()) &&
+      return (size < AcceleratorAllocatorConfig::max_split_size()) &&
           (remaining > kSmallSize);
     }
   }

@@ -2712,7 +2714,7 @@ class DeviceCachingAllocator {

     if (C10_UNLIKELY(
             set_fraction &&
-            CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+            AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
       // Track block reuse interval only when garbage collection is enabled.
       ++pool.get_free_blocks_call_count;
     }

@@ -2754,13 +2756,13 @@ class DeviceCachingAllocator {
     }

     // Do not return an oversized block for a large request
-    if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
-        ((*it)->size >= CUDAAllocatorConfig::max_split_size()))
+    if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
+        ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
       return false;
     // Allow oversized block size to be rounded up but within a limit
-    if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
+    if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
         ((*it)->size >=
-         p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
+         p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
       return false;
     p.block = *it;
     pool.blocks.erase(it);
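The two checks above decide when a cached block may serve a request now that max_split_size comes from the device-generic config. Restated as a standalone predicate with illustrative names but the same logic as the diff:

#include <cstddef>

// Illustrative restatement of the block-reuse checks above.
static bool cached_block_is_acceptable(
    size_t request_size,
    size_t block_size,
    size_t max_split_size,
    size_t max_non_split_rounding) {
  if (request_size < max_split_size && block_size >= max_split_size) {
    return false; // a non-oversize request must not consume an oversize block
  }
  if (request_size >= max_split_size &&
      block_size >= request_size + max_non_split_rounding) {
    return false; // oversize rounding is allowed, but only within a limit
  }
  return true;
}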
@@ -2783,7 +2785,7 @@ class DeviceCachingAllocator {
     // therefore should be of less overheads.

     size_t gc_threshold = static_cast<size_t>(
-        CUDAAllocatorConfig::garbage_collection_threshold() *
+        AcceleratorAllocatorConfig::garbage_collection_threshold() *
         static_cast<double>(allowed_memory_maximum));
     // No need to trigger GC yet
     if (total_allocated_memory <= gc_threshold) {
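The threshold computed in this hunk is a fraction of the allowed memory maximum; garbage collection of cached blocks only starts once total allocated memory crosses it. A worked sketch with illustrative numbers:

#include <cstddef>

// Illustrative only: mirrors the gc_threshold arithmetic above.
static bool needs_garbage_collection(
    size_t total_allocated_memory,
    size_t allowed_memory_maximum,
    double garbage_collection_threshold) {
  auto gc_threshold = static_cast<size_t>(
      garbage_collection_threshold *
      static_cast<double>(allowed_memory_maximum));
  return total_allocated_memory > gc_threshold;
}

// Example: with garbage_collection_threshold = 0.8 and a 40 GiB cap,
// gc_threshold is ~32 GiB, so cached blocks are reclaimed only once more
// than ~32 GiB has been allocated.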
@@ -2931,7 +2933,7 @@ class DeviceCachingAllocator {
       stats.segment[stat_type].increase(1);
       stats.reserved_bytes[stat_type].increase(size);
     });
-    if (size >= CUDAAllocatorConfig::max_split_size())
+    if (size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_segments.increase(1);
     auto reserved_bytes_gauge =
         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);

@@ -2960,16 +2962,16 @@ class DeviceCachingAllocator {
   bool release_available_cached_blocks(
       const AllocParams& p,
       const std::shared_ptr<GatheredContext>& context) {
-    if (CUDAAllocatorConfig::max_split_size() ==
+    if (AcceleratorAllocatorConfig::max_split_size() ==
         std::numeric_limits<size_t>::max())
      return false;
    BlockPool& pool = *p.pool;

    // because of std::unique_ptr, block cannot be trivially copied
    // Use constructor for search key.
    Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
-    key.size = (key.size < CUDAAllocatorConfig::max_split_size())
-        ? CUDAAllocatorConfig::max_split_size()
+    key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
+        ? AcceleratorAllocatorConfig::max_split_size()
        : key.size;
    auto it = pool.blocks.lower_bound(&key);
    if (it == pool.blocks.end() || (*it)->stream != p.stream() ||

@@ -2982,7 +2984,7 @@ class DeviceCachingAllocator {
      --it; // Back up one item. Now on the largest block for the correct
            // stream
      while ((totalReleased < key.size) &&
-             ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
+             ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
             ((*it)->stream == p.stream())) {
        auto cur = it;
        bool is_first = cur == pool.blocks.begin();

@@ -3107,7 +3109,7 @@ class DeviceCachingAllocator {
        stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
            .current);

-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
      stats.oversize_segments.decrease(1);
    pool->blocks.erase(block);
    delete block;

@@ -3734,18 +3736,19 @@ class NativeCachingAllocator : public CUDAAllocator {

    auto& md = result.config_metadata;
    md.garbage_collection_threshold =
-        CUDAAllocatorConfig::garbage_collection_threshold();
-    md.max_split_size = CUDAAllocatorConfig::max_split_size();
+        AcceleratorAllocatorConfig::garbage_collection_threshold();
+    md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
    md.pinned_num_register_threads =
        CUDAAllocatorConfig::pinned_num_register_threads();
    md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
    md.release_lock_on_malloc =
        CUDAAllocatorConfig::release_lock_on_cudamalloc();
    md.pinned_use_host_register =
        CUDAAllocatorConfig::pinned_use_cuda_host_register();
-    md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
+    md.last_allocator_settings =
+        AcceleratorAllocatorConfig::last_allocator_settings();
    md.roundup_power2_divisions =
-        CUDAAllocatorConfig::roundup_power2_divisions();
+        AcceleratorAllocatorConfig::roundup_power2_divisions();

    return result;
  }
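The final hunk makes the ownership split visible in one place: device-generic knobs are read from AcceleratorAllocatorConfig, while CUDA-specific knobs stay on CUDAAllocatorConfig. A hedged sketch of that mapping, using only accessors that appear in this diff (the surrounding function is illustrative):

#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAAllocatorConfig.h>

// Illustrative summary of which config class owns which knob after this commit.
static void summarize_allocator_config_split() {
  using c10::CachingAllocator::AcceleratorAllocatorConfig;
  using c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig;

  // Device-generic settings (migrated by this commit):
  auto gc_threshold = AcceleratorAllocatorConfig::garbage_collection_threshold();
  auto max_split = AcceleratorAllocatorConfig::max_split_size();
  auto settings = AcceleratorAllocatorConfig::last_allocator_settings();

  // CUDA-specific settings (left untouched by this commit):
  auto register_threads = CUDAAllocatorConfig::pinned_num_register_threads();
  auto expandable = CUDAAllocatorConfig::expandable_segments();

  (void)gc_threshold; (void)max_split; (void)settings;
  (void)register_threads; (void)expandable;
}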

c10/xpu/XPUCachingAllocator.cpp

Lines changed: 1 addition & 2 deletions

@@ -1,3 +1,4 @@
+#include <c10/core/AllocatorConfig.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/irange.h>
 #include <c10/xpu/XPUCachingAllocator.h>

@@ -20,8 +21,6 @@ constexpr size_t kMinBlockSize = 512;
 constexpr size_t kSmallSize = 1048576;
 // "small" allocations are packed in 2 MiB blocks
 constexpr size_t kSmallBuffer = 2097152;
-// "large" allocations may be packed in 20 MiB blocks
-constexpr size_t kLargeBuffer = 20971520;
 // allocations between 1 and 10 MiB may use kLargeBuffer
 constexpr size_t kMinLargeAlloc = 10485760;
 // round up large allocations to 2 MiB
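With the local constant removed, the XPU allocator presumably picks up kLargeBuffer from the shared c10/core/AllocatorConfig.h header that this hunk now includes; the namespace below is an assumption based on the other call sites in this commit, not something shown in the diff:

#include <c10/core/AllocatorConfig.h>
#include <cstddef>
#include <cstdio>

// Assumption: kLargeBuffer (20 MiB) is now defined once in
// c10/core/AllocatorConfig.h under c10::CachingAllocator and shared by the
// CUDA and XPU caching allocators.
static void print_shared_large_buffer_size() {
  std::printf("kLargeBuffer = %zu bytes\n",
              static_cast<size_t>(c10::CachingAllocator::kLargeBuffer));
}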

torch/csrc/cuda/Module.cpp

Lines changed: 2 additions & 3 deletions

@@ -20,8 +20,8 @@
 #include <ATen/cuda/detail/CUDAHooks.h>
 #include <ATen/cuda/jiterator.h>
 #include <ATen/cuda/tunable/Tunable.h>
+#include <c10/core/AllocatorConfig.h>
 #include <c10/core/StorageImpl.h>
-#include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAFunctions.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>

@@ -426,8 +426,7 @@ PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
     PyObject* _unused,
     PyObject* env) {
   HANDLE_TH_ERRORS
-  c10::cuda::CUDACachingAllocator::setAllocatorSettings(
-      THPUtils_unpackString(env));
+  c10::CachingAllocator::setAllocatorSettings(THPUtils_unpackString(env));
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }
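For completeness, the device-generic entry point the binding now calls can also be invoked directly from C++. The sketch below assumes the declaration is reachable via the c10/core/AllocatorConfig.h include added above, and the settings string is only an illustrative value using keys from the PYTORCH_CUDA_ALLOC_CONF format:

#include <c10/core/AllocatorConfig.h>
#include <string>

// Illustrative: route allocator settings through the same device-generic API
// that the Python binding above now uses.
static void apply_example_allocator_settings() {
  const std::string settings =
      "max_split_size_mb:256,garbage_collection_threshold:0.8";
  c10::CachingAllocator::setAllocatorSettings(settings);
}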

0 commit comments
