E5E4 Implement the new tuning API for `Dispatch[Streaming]ReduceByKey` by bernhardmgruber · Pull Request #7667 · NVIDIA/cccl · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 41 additions & 96 deletions cub/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,72 +15,29 @@
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
# else // TUNE_TRANSPOSE == 1
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
# endif // TUNE_TRANSPOSE

# if TUNE_LOAD == 0
# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
# else // TUNE_LOAD == 1
# define TUNE_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_LOAD

struct reduce_by_key_policy_hub
struct bench_reduce_by_key_policy_selector
{
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
[[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
-> cub::detail::reduce_by_key::reduce_by_key_policy
{
using ReduceByKeyPolicyT =
cub::AgentReduceByKeyPolicy<TUNE_THREADS,
TUNE_ITEMS,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_t>;
};

using MaxPolicy = Policy500;
return {
TUNE_THREADS,
TUNE_ITEMS,
TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_policy,
};
}
};
#endif // !TUNE_BASE

template <class KeyT, class ValueT, class OffsetT>
static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
using keys_input_it_t = const KeyT*;
using unique_output_it_t = KeyT*;
using vals_input_it_t = const ValueT*;
using aggregate_output_it_t = ValueT*;
using num_runs_output_iterator_t = OffsetT*;
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using accum_t = ValueT;
using offset_t = OffsetT;

#if !TUNE_BASE
using dispatch_t = cub::DispatchReduceByKey<
keys_input_it_t,
unique_output_it_t,
vals_input_it_t,
aggregate_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
reduction_op_t,
offset_t,
accum_t,
reduce_by_key_policy_hub>;
#else
using dispatch_t = cub::DispatchReduceByKey<
keys_input_it_t,
unique_output_it_t,
vals_input_it_t,
aggregate_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
reduction_op_t,
offset_t,
accum_t>;
#endif
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using offset_t = OffsetT;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
constexpr std::size_t min_segment_size = 1;
Expand All @@ -92,43 +49,42 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
thrust::device_vector<KeyT> out_keys(elements);
thrust::device_vector<KeyT> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);

KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
const KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
KeyT* d_out_keys = thrust::raw_pointer_cast(out_keys.data());
ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data());
const ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data());
ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data());
OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());

std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
const offset_t num_items = static_cast<offset_t>(elements);

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
0);
auto dispatch_on_stream = [&](cudaStream_t stream) {
return cub::detail::reduce_by_key::dispatch</* OverrideAccumT */ ValueT>(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
num_items,
stream
#if !TUNE_BASE
,
bench_reduce_by_key_policy_selector{}
#endif
);
};

dispatch_on_stream(cudaStream_t{0});

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
0);
dispatch_on_stream(cudaStream_t{0});
cudaDeviceSynchronize();
const OffsetT num_runs = num_runs_out[0];

Expand All @@ -140,18 +96,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
state.add_global_memory_writes<OffsetT>(1);

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
launch.get_stream());
dispatch_on_stream(launch.get_stream());
});
}

Expand Down
138 changes: 42 additions & 96 deletions cub/benchmarks/bench/run_length_encode/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,20 @@
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
# else // TUNE_TRANSPOSE == 1
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
# endif // TUNE_TRANSPOSE

# if TUNE_LOAD == 0
# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
# else // TUNE_LOAD == 1
# define TUNE_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_LOAD

struct reduce_by_key_policy_hub
struct bench_encode_policy_selector
{
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
[[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
-> cub::detail::reduce_by_key::reduce_by_key_policy
{
using ReduceByKeyPolicyT =
cub::AgentReduceByKeyPolicy<TUNE_THREADS,
TUNE_ITEMS,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_t>;
};

using MaxPolicy = Policy500;
return {
TUNE_THREADS,
TUNE_ITEMS,
TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_policy,
};
}
};
#endif // !TUNE_BASE

Expand All @@ -56,41 +44,10 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
// Offset type large enough to represent the total number of runs in the sequence
using num_runs_t = offset_t;

using keys_input_it_t = const T*;
using unique_output_it_t = T*;
using run_length_input_it_t = thrust::constant_iterator<run_length_t, offset_t>;
using run_length_output_it_t = run_length_t*;
using num_runs_output_iterator_t = num_runs_t*;
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using accum_t = run_length_t;

#if !TUNE_BASE
using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
keys_input_it_t,
unique_output_it_t,
run_length_input_it_t,
run_length_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
reduction_op_t,
offset_t,
accum_t,
reduce_by_key_policy_hub>;
#else
using policy_t = cub::detail::rle::encode::policy_hub<accum_t, T>;
using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
keys_input_it_t,
unique_output_it_t,
run_length_input_it_t,
run_length_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
reduction_op_t,
offset_t,
accum_t,
policy_t>;
#endif
using run_length_input_it_t = thrust::constant_iterator<run_length_t, offset_t>;
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using accum_t = run_length_t;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
constexpr std::size_t min_segment_size = 1;
Expand All @@ -101,43 +58,43 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
thrust::device_vector<T> out_keys(elements);
thrust::device_vector<T> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);

T* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
T* d_out_keys = thrust::raw_pointer_cast(out_keys.data());
auto d_out_vals = thrust::raw_pointer_cast(out_vals.data());
auto d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
run_length_input_it_t d_in_vals(run_length_t{1});

std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
const offset_t num_items = static_cast<offset_t>(elements);

auto dispatch_on_stream = [&](cudaStream_t stream) {
return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ accum_t>(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
num_items,
stream,
#if TUNE_BASE
cub::detail::rle::encode::policy_selector_from_types<accum_t, T> {}
#else // TUNE_BASE
bench_encode_policy_selector{}
#endif // TUNE_BASE
);
};

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
0);
dispatch_on_stream(cudaStream_t{0});

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
0);
dispatch_on_stream(cudaStream_t{0});
cudaDeviceSynchronize();
const num_runs_t num_runs = num_runs_out[0];

Expand All @@ -148,18 +105,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
state.add_global_memory_writes<num_runs_t>(1);

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_keys,
d_in_vals,
d_out_vals,
d_num_runs_out,
equality_op_t{},
reduction_op_t{},
elements,
launch.get_stream());
dispatch_on_stream(launch.get_stream());
});
}

Expand Down
11 changes: 5 additions & 6 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

/**
* @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread
* blocks for participating in device-wide reduce-value-by-key.
*/
//! @file
//! cub::detail::reduce_by_key::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for
//! participating in device-wide reduce-value-by-key.

#pragma once

Expand Down Expand Up @@ -92,7 +91,7 @@ struct AgentReduceByKeyPolicy
* Thread block abstractions
******************************************************************************/

namespace detail::reduce
namespace detail::reduce_by_key
{
/**
* @brief AgentReduceByKey implements a stateful abstraction of CUDA thread
Expand Down Expand Up @@ -771,6 +770,6 @@ struct AgentReduceByKey
}
}
};
} // namespace detail::reduce
} // namespace detail::reduce_by_key

CUB_NAMESPACE_END
Loading
Loading
0