diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index d1734b5d0..d4375fe9d 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -220,6 +220,39 @@ UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform)
 
 #endif
 
+// stacked pool benchmarks
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
+                              disjoint_pool_stack_fix, fixed_alloc_size,
+                              pool_stacked_allocator);
+
+UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark,
+                         disjoint_pool_stack_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
+                              disjoint_pool_stack_uniform, uniform_alloc_size,
+                              pool_stacked_allocator);
+UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark,
+                         disjoint_pool_stack_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_stack_fix,
+                              fixed_alloc_size,
+                              pool_stacked_allocator);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_stack_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_stack_uniform,
+                              uniform_alloc_size,
+                              pool_stacked_allocator);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_stack_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
 //BENCHMARK_MAIN();
 int main(int argc, char **argv) {
     if (initAffinityMask()) {
diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp
index 1d08f5d72..201c9e382 100644
--- a/benchmark/benchmark.hpp
+++ b/benchmark/benchmark.hpp
@@ -73,6 +73,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -209,32 +210,71 @@ class provider_allocator : public allocator_interface {
 // TODO: assert Pool to be a pool_interface.
 template <typename Pool> class pool_allocator : public allocator_interface {
   public:
-    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
+    virtual unsigned SetUp(::benchmark::State &state,
+                           unsigned argPos) override {
         pool.SetUp(state);
         return argPos;
     }
 
-    void preBench(::benchmark::State &state) override { pool.preBench(state); }
-    void postBench(::benchmark::State &state) override {
+    virtual void preBench(::benchmark::State &state) override {
+        pool.preBench(state);
+    }
+    virtual void postBench(::benchmark::State &state) override {
         pool.postBench(state);
     }
 
-    void TearDown(::benchmark::State &state) override { pool.TearDown(state); }
+    virtual void TearDown(::benchmark::State &state) override {
+        pool.TearDown(state);
+    }
 
-    void *benchAlloc(size_t size) override {
+    virtual void *benchAlloc(size_t size) override {
         return umfPoolMalloc(pool.pool, size);
     }
 
-    void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
+    virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
         umfPoolFree(pool.pool, ptr);
     }
 
     static std::string name() { return Pool::name(); }
 
-  private:
+  protected:
     Pool pool;
 };
 
+template <typename Provider>
+class pool_stacked_allocator
+    : public pool_allocator<disjoint_pool_stack<Provider>> {
+    using base = pool_allocator<disjoint_pool_stack<Provider>>;
+
+  public:
+    virtual void preBench([[maybe_unused]] ::benchmark::State &state) override {
+        // we do not measure fragmentation for stacked pools
+    }
+    virtual void
+    postBench([[maybe_unused]] ::benchmark::State &state) override {
+        // we do not measure fragmentation for stacked pools
+    }
+    void *benchAlloc(size_t size) override {
+        static thread_local int counter = 0;
+        static auto pool_number = base::pool.pools.size();
+        // stacked pools have limited space, so we might need a few
+        // tries to find one with free space
+        auto retry = pool_number;
+        while (retry--) {
+            void *ptr = umfPoolMalloc(
+                base::pool.pools[(++counter % pool_number)], size);
+            if (ptr != NULL) {
+                return ptr;
+            }
+        }
+        return NULL;
+    }
+
+    void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
+        umfFree(ptr);
+    }
+};
+
 template
 struct benchmark_interface : public benchmark::Fixture {
     int parseArgs(::benchmark::State &state, int argPos) {
diff --git a/benchmark/benchmark_umf.hpp b/benchmark/benchmark_umf.hpp
index c32cc51cb..74551b20e 100644
--- a/benchmark/benchmark_umf.hpp
+++ b/benchmark/benchmark_umf.hpp
@@ -317,6 +317,142 @@ struct disjoint_pool : public pool_interface {
     }
 };
 
+// benchmark the tracking provider by creating a large number (2^7) of stacked pools
+template <typename Provider>
+struct disjoint_pool_stack : public disjoint_pool<Provider> {
+    using base = disjoint_pool<Provider>;
+
+    std::vector<umf_memory_provider_handle_t> providers;
+    std::vector<umf_memory_pool_handle_t> pools;
+    std::vector<void *> pool_ptrs;
+
+    static constexpr size_t firstPoolSize = 2ull * 1024 * 1024 * 1024; // 2GB
+    static constexpr size_t levels = 7;
+
+    void SetUp(::benchmark::State &state) {
+        base::provider.SetUp(state);
+        if (state.thread_index() != 0) {
+            return;
+        }
+
+        providers.push_back(base::provider.provider);
+        base::provider.provider = nullptr;
+
+        auto params = base::getParams(state);
+        umf_memory_pool_handle_t rootPool = nullptr;
+        auto umf_result = umfPoolCreate(base::getOps(state), providers[0],
+                                        params.get(), 0, &rootPool);
+        if (umf_result != UMF_RESULT_SUCCESS) {
+            state.SkipWithError("umfPoolCreate() failed");
+            return;
+        }
+
+        pools.push_back(rootPool); // root pool
+
+        umf_fixed_memory_provider_params_handle_t params_fixed = nullptr;
+        umf_result = umfFixedMemoryProviderParamsCreate(
+            &params_fixed, (void *)0x1, 0x1); // dummy
+
+        size_t poolSize = firstPoolSize;
+        size_t level_start = 0;
+        size_t level_pools = 1;
+
+        for (size_t level = 1; level < levels; ++level) {
+            // split each pool into 3 parts - two for the children, and a third for other allocations from this pool
+            poolSize /= 3;
+            size_t new_level_pools = level_pools * 2;
+
+            for (size_t parent_idx = 0; parent_idx < level_pools;
+                 ++parent_idx) {
+                umf_memory_pool_handle_t parent_pool =
+                    pools[level_start + parent_idx];
+
+                for (int child = 0; child < 2; ++child) {
+                    void *ptr = umfPoolMalloc(parent_pool, poolSize);
+                    if (!ptr) {
+                        state.SkipWithError("umfPoolMalloc() failed");
+                        return;
+                    }
+                    pool_ptrs.push_back(ptr);
+
+                    umf_result = umfFixedMemoryProviderParamsSetMemory(
+                        params_fixed, ptr, poolSize);
+                    umf_memory_provider_handle_t prov;
+                    umf_result = umfMemoryProviderCreate(
+                        umfFixedMemoryProviderOps(), params_fixed, &prov);
+                    if (umf_result != UMF_RESULT_SUCCESS) {
+                        state.SkipWithError("umfMemoryProviderCreate() failed");
+                        return;
+                    }
+                    providers.push_back(prov);
+
+                    umf_memory_pool_handle_t newPool;
+                    umf_result = umfPoolCreate(base::getOps(state), prov,
+                                               params.get(), 0, &newPool);
+                    if (umf_result != UMF_RESULT_SUCCESS) {
+                        state.SkipWithError("umfPoolCreate() failed");
+                        return;
+                    }
+
+                    pools.push_back(newPool);
+                }
+            }
+
+            level_start += level_pools;
+            level_pools = new_level_pools;
+        }
+
+        umfFixedMemoryProviderParamsDestroy(params_fixed);
+    }
+
+    void TearDown(::benchmark::State &state) {
+        if (state.thread_index() != 0) {
+            return;
+        }
+
+        size_t pool_index = pools.size();
+        size_t provider_index = providers.size();
+        size_t ptr_index = pool_ptrs.size();
+
+        // Go from last level to first (excluding level 0, root)
+        for (int level = levels - 1; level > 0; --level) {
+            size_t level_pools = 1ull << level; // 2^level pools
+
+            // Destroy pools
+            for (size_t i = 0; i < level_pools; ++i) {
+                --pool_index;
+                umfPoolDestroy(pools[pool_index]);
+            }
+
+            // Destroy providers and free pointers
+            for (size_t i = 0; i < level_pools; ++i) {
+                --provider_index;
+                umfMemoryProviderDestroy(providers[provider_index]);
+
+                --ptr_index;
+                void *ptr = pool_ptrs[ptr_index];
+                if (ptr) {
+                    umfFree(ptr);
+                }
+            }
+        }
+
+        // Root pool and provider
+        umfPoolDestroy(pools[0]);
+        umfMemoryProviderDestroy(providers[0]);
+
+        pools.clear();
+        providers.clear();
+        pool_ptrs.clear();
+
+        base::TearDown(state);
+    }
+
+    static std::string name() {
+        return "disjoint_pool_stacked<" + Provider::name() + ">";
+    }
+};
+
 #ifdef UMF_POOL_JEMALLOC_ENABLED
 template
 struct jemalloc_pool : public pool_interface {
diff --git a/src/libumf.def b/src/libumf.def
index aaf430277..10b0326b8 100644
--- a/src/libumf.def
+++ b/src/libumf.def
@@ -133,6 +133,7 @@ EXPORTS
     umfFixedMemoryProviderOps
     umfFixedMemoryProviderParamsCreate
     umfFixedMemoryProviderParamsDestroy
+    umfFixedMemoryProviderParamsSetMemory
     umfLevelZeroMemoryProviderParamsSetFreePolicy
     umfLevelZeroMemoryProviderParamsSetDeviceOrdinal
 ; Added in UMF_0.12
diff --git a/src/libumf.map b/src/libumf.map
index 41aa48e9c..e283c3853 100644
--- a/src/libumf.map
+++ b/src/libumf.map
@@ -131,6 +131,7 @@ UMF_0.11 {
     umfFixedMemoryProviderOps;
     umfFixedMemoryProviderParamsCreate;
     umfFixedMemoryProviderParamsDestroy;
+    umfFixedMemoryProviderParamsSetMemory;
     umfLevelZeroMemoryProviderParamsSetFreePolicy;
     umfLevelZeroMemoryProviderParamsSetDeviceOrdinal;
 } UMF_0.10;
diff --git a/src/provider/provider_tracking.c b/src/provider/provider_tracking.c
index 774e103af..39e6eadb2 100644
--- a/src/provider/provider_tracking.c
+++ b/src/provider/provider_tracking.c
@@ -81,6 +81,7 @@ static tracker_alloc_info_t *get_most_nested_alloc_segment(
     uintptr_t parent_key = 0;
     uintptr_t rkey = 0;
     uint64_t rsize = 0;
+    size_t n_children = 0;
     int level = 0;
     int found = 0;
 
@@ -113,8 +114,8 @@ static tracker_alloc_info_t *get_most_nested_alloc_segment(
         }
 
         utils_atomic_load_acquire_u64((uint64_t *)&rvalue->size, &rsize);
-
-        if (found && ((uintptr_t)ptr < rkey + rsize) && rvalue->n_children) {
+        utils_atomic_load_acquire_size_t(&rvalue->n_children, &n_children);
+        if (found && ((uintptr_t)ptr < rkey + rsize) && n_children) {
             if (level == MAX_LEVELS_OF_ALLOC_SEGMENT_MAP - 1) {
                 break;
             }
@@ -146,13 +147,13 @@ static tracker_alloc_info_t *get_most_nested_alloc_segment(
                 ref_value = NULL;
             }
         }
-    } while (found && ((uintptr_t)ptr < rkey + rsize) && rvalue->n_children);
+    } while (found && ((uintptr_t)ptr < rkey + rsize) && n_children);
 
     if (!rvalue || rkey != (uintptr_t)ptr) {
         return NULL;
     }
 
-    if (no_children && (rvalue->n_children > 0)) {
+    if (no_children && (n_children > 0)) {
         return NULL;
     }
 
@@ -207,11 +208,12 @@ umfMemoryTrackerAddAtLevel(umf_memory_tracker_handle_t hTracker, int level,
               (void *)hTracker, level, (void *)pool, ptr, size);
 
     if (parent_value) {
-        parent_value->n_children++;
+        size_t n_children =
+            utils_atomic_increment_size_t(&parent_value->n_children) + 1;
         LOG_DEBUG(
             "child #%zu added to memory region: tracker=%p, level=%i, "
             "pool=%p, ptr=%p, size=%zu",
-            parent_value->n_children, (void *)hTracker, level - 1,
+            n_children, (void *)hTracker, level - 1,
             (void *)parent_value->pool, (void *)parent_key, parent_value->size);
 
         assert(ref_parent_value);
@@ -243,6 +245,7 @@ static umf_result_t umfMemoryTrackerAdd(umf_memory_tracker_handle_t hTracker,
     uintptr_t parent_key = 0;
     uintptr_t rkey = 0;
     uint64_t rsize = 0;
+    size_t n_children = 0;
     int level = 0;
     int found = 0;
 
@@ -311,7 +314,8 @@ static umf_result_t umfMemoryTrackerAdd(umf_memory_tracker_handle_t hTracker,
             ref_parent_value = ref_value;
             level++;
         }
-    } while (found && ((uintptr_t)ptr < rkey + rsize) && rvalue->n_children);
+        utils_atomic_load_acquire_size_t(&rvalue->n_children, &n_children);
+    } while (found && ((uintptr_t)ptr < rkey + rsize) && n_children);
 
     if (ref_value && ref_value != ref_parent_value) {
         critnib_release(hTracker->alloc_segments_map[level], ref_value);
@@ -366,12 +370,14 @@ static umf_result_t umfMemoryTrackerRemove(umf_memory_tracker_handle_t hTracker,
     critnib_release(hTracker->alloc_segments_map[level], ref_value);
 
     if (parent_value) {
+        size_t n_children =
+            utils_atomic_decrement_size_t(&parent_value->n_children);
         LOG_DEBUG(
             "child #%zu removed from memory region: tracker=%p, level=%i, "
             "pool=%p, ptr=%p, size=%zu",
-            parent_value->n_children, (void *)hTracker, level - 1,
-            (void *)parent_value->pool, (void *)parent_key, parent_value->size);
-        parent_value->n_children--;
+            n_children, (void *)hTracker, level - 1, (void *)parent_value->pool,
+            (void *)parent_key, parent_value->size);
+
         assert(ref_parent_value);
         assert(level >= 1);
         // release the ref_parent_value got from get_most_nested_alloc_segment()
@@ -485,6 +491,7 @@ umf_result_t umfMemoryTrackerGetAllocInfo(const void *ptr,
     uintptr_t top_most_key = 0;
     uintptr_t rkey = 0;
     uint64_t rsize = 0;
+    size_t n_children = 0;
     int level = 0;
     int found = 0;
 
@@ -519,7 +526,7 @@ umf_result_t umfMemoryTrackerGetAllocInfo(const void *ptr,
         }
 
         utils_atomic_load_acquire_u64((uint64_t *)&rvalue->size, &rsize);
-
+        utils_atomic_load_acquire_size_t(&rvalue->n_children, &n_children);
         if (found && (uintptr_t)ptr < rkey + rsize) {
             top_most_key = rkey;
             top_most_value = rvalue;
@@ -530,13 +537,13 @@ umf_result_t umfMemoryTrackerGetAllocInfo(const void *ptr,
             }
             ref_top_most_value = ref_value;
             ref_level = level;
-            if (rvalue->n_children == 0 ||
+            if (n_children == 0 ||
                 level == MAX_LEVELS_OF_ALLOC_SEGMENT_MAP - 1) {
                 break;
             }
             level++;
         }
-    } while (found && (uintptr_t)ptr < rkey + rsize && rvalue->n_children);
+    } while (found && (uintptr_t)ptr < rkey + rsize && n_children);
 
     if (!top_most_value) {
         if (ref_value) {
diff --git a/src/utils/utils_concurrency.h b/src/utils/utils_concurrency.h
index 540ad6095..14d7fe54e 100644
--- a/src/utils/utils_concurrency.h
+++ b/src/utils/utils_concurrency.h
@@ -270,6 +270,16 @@ static inline bool utils_compare_exchange_size_t(size_t *ptr, size_t *expected,
                                       (uint64_t *)desired);
 }
 
+static inline size_t utils_atomic_increment_size_t(size_t *val) {
+    COMPILE_ERROR_ON(sizeof(size_t) != sizeof(uint64_t));
+    return utils_atomic_increment_u64((uint64_t *)val);
+}
+
+static inline size_t utils_atomic_decrement_size_t(size_t *val) {
+    COMPILE_ERROR_ON(sizeof(size_t) != sizeof(uint64_t));
+    return utils_atomic_decrement_u64((uint64_t *)val);
+}
+
 #ifdef __cplusplus
 }
 #endif
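
For context, the newly exported umfFixedMemoryProviderParamsSetMemory() lets a single fixed-memory-provider params handle be retargeted at successive memory regions; that is how disjoint_pool_stack::SetUp() above creates one provider per sub-region carved out of its parent pool. Below is a minimal standalone sketch of that pattern, not part of the patch: the header paths, REGION_SIZE, and the malloc'ed backing buffer are assumptions made purely for illustration.

/*
 * Sketch: reuse one fixed-provider params handle for several providers.
 * All UMF calls mirror the usage in disjoint_pool_stack::SetUp().
 */
#include <stdio.h>
#include <stdlib.h>

#include <umf/memory_provider.h>
#include <umf/providers/provider_fixed_memory.h>

int main(void) {
    enum { REGION_SIZE = 1 << 20 };         /* 1 MiB per sub-region (arbitrary) */
    char *buffer = malloc(2 * REGION_SIZE); /* backing memory for two providers */
    if (!buffer) {
        return 1;
    }

    /* Create the params object once, with a dummy region, as the benchmark does. */
    umf_fixed_memory_provider_params_handle_t params = NULL;
    if (umfFixedMemoryProviderParamsCreate(&params, (void *)0x1, 0x1) !=
        UMF_RESULT_SUCCESS) {
        return 1;
    }

    umf_memory_provider_handle_t providers[2] = {NULL, NULL};
    for (int i = 0; i < 2; i++) {
        /* Retarget the same params handle at the i-th sub-region... */
        umfFixedMemoryProviderParamsSetMemory(params, buffer + i * REGION_SIZE,
                                              REGION_SIZE);
        /* ...and create a fixed provider serving exactly that region. */
        if (umfMemoryProviderCreate(umfFixedMemoryProviderOps(), params,
                                    &providers[i]) != UMF_RESULT_SUCCESS) {
            return 1;
        }
    }

    /* The params handle is no longer needed once the providers exist. */
    umfFixedMemoryProviderParamsDestroy(params);

    for (int i = 0; i < 2; i++) {
        umfMemoryProviderDestroy(providers[i]);
    }
    free(buffer);
    printf("created two fixed providers from a single params handle\n");
    return 0;
}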