diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index f9598bbcd8..06f4fb3af9 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -79,3 +79,4 @@ #include #include #include +#include diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 8851208198..0704e71741 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -18,7 +18,53 @@ namespace dr::mp { -/// Collective for_each on distributed range +// the concept doesn't work yet... for some reason +template +concept dual_vector_range = + dr::distributed_range && requires(R &r) { dr::ranges::segments(r)[0].is_compute(); }; + +void partial_for_each(dual_vector_range auto &&dr, auto op) { + dr::drlog.debug(dr::logger::for_each, "partial_for_each: parallel execution\n"); + if (rng::empty(dr)) { + return; + } + + auto is_local = [](const auto &segment) { + return dr::ranges::rank(segment) == default_comm().rank(); + }; + + for (auto &seg : dr::ranges::segments(dr) | rng::views::filter(is_local)) { + if (!seg.is_compute()) { + seg.swap_state(); + continue; + } + + auto b = dr::ranges::local(rng::begin(seg)); + auto s = rng::subrange(b, b + rng::distance(seg)); + + if (mp::use_sycl()) { + dr::drlog.debug(" using sycl\n"); + + assert(rng::distance(s) > 0); +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), sycl::range<1>(rng::distance(s)), + [first = rng::begin(s), op](auto idx) { op(first[idx]); }) + .wait(); +#else + assert(false); +#endif + } else { + dr::drlog.debug(" using cpu\n"); + rng::for_each(s, op); + } + + seg.swap_state(); + } + barrier(); +} + +// Collective for_each on distributed range void for_each(dr::distributed_range auto &&dr, auto op) { dr::drlog.debug(dr::logger::for_each, "for_each: parallel execution\n"); if (rng::empty(dr)) { diff --git a/include/dr/mp/containers/dual_distributed_vector.hpp b/include/dr/mp/containers/dual_distributed_vector.hpp new file mode 100644 index 0000000000..ab70a03109 --- /dev/null +++ b/include/dr/mp/containers/dual_distributed_vector.hpp @@ -0,0 +1,338 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +namespace dr::mp { + +static constexpr std::size_t DUAL_SEGMENTS_PER_PROC = 2; + +class DualMpiBackend { + dr::rma_window win_; + +public: + void *allocate(std::size_t data_size) { + assert(data_size > 0); + void *data = __detail::allocator().allocate(data_size); + DRLOG("called MPI allocate({}) -> got:{}", data_size, data); + win_.create(default_comm(), data, data_size); + active_wins().insert(win_.mpi_win()); + return data; + } + + void deallocate(void *data, std::size_t data_size) { + assert(data_size > 0); + DRLOG("calling MPI deallocate ({}, data_size:{})", data, data_size); + active_wins().erase(win_.mpi_win()); + win_.free(); + __detail::allocator().deallocate(static_cast(data), + data_size); + } + + void getmem(void *dst, std::size_t offset, std::size_t datalen, + int segment_index) { + const std::size_t peer = get_peer(segment_index); + + DRLOG("calling MPI get(dst:{}, " + "segm_offset:{}, size:{}, peer:{})", + dst, offset, datalen, peer); + +#if (MPI_VERSION >= 4) || \ + (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000)) + // 64-bit API inside + win_.get(dst, datalen, peer, offset); +#else + for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { + std::size_t s = std::min(remainder, (std::size_t)INT_MAX); + DRLOG("{}:{} win_.get total {} now {} bytes at off {}, dst offset {}", + default_comm().rank(), __LINE__, datalen, s, off, offset + off); + win_.get((uint8_t *)dst + off, s, peer, offset + off); + off += s; + remainder -= s; + } +#endif + } + + void putmem(void const *src, std::size_t offset, std::size_t datalen, + int segment_index) { + const std::size_t peer = get_peer(segment_index); + + DRLOG("calling MPI put(segm_offset:{}, " + "src:{}, size:{}, peer:{})", + offset, src, datalen, peer); + +#if (MPI_VERSION >= 4) || \ + (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000)) + // 64-bit API inside + win_.put(src, datalen, peer, offset); +#else + for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { + std::size_t s = std::min(remainder, (std::size_t)INT_MAX); + DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}", + default_comm().rank(), __LINE__, s, off, offset + off); + win_.put((uint8_t *)src + off, s, peer, offset + off); + off += s; + remainder -= s; + } +#endif + } + + std::size_t getrank() { return win_.communicator().rank(); } + + void fence() { win_.fence(); } + +private: + std::size_t get_peer(const std::size_t segment_index) { + const auto size = win_.communicator().size(); + return segment_index < size ? segment_index : 2 * size - segment_index - 1; + } +}; + +/// distributed vector +template +class dual_distributed_vector { + +public: + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using backend_type = BackendT; + + class iterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = typename dual_distributed_vector::value_type; + using difference_type = typename dual_distributed_vector::difference_type; + + iterator() {} + iterator(const dual_distributed_vector *parent, difference_type offset) + : parent_(parent), offset_(offset) {} + + auto operator+(difference_type n) const { + return iterator(parent_, offset_ + n); + } + friend auto operator+(difference_type n, const iterator &other) { + return other + n; + } + auto operator-(difference_type n) const { + return iterator(parent_, offset_ - n); + } + auto operator-(iterator other) const { return offset_ - other.offset_; } + + auto &operator+=(difference_type n) { + offset_ += n; + return *this; + } + auto &operator-=(difference_type n) { + offset_ -= n; + return *this; + } + auto &operator++() { + offset_++; + return *this; + } + auto operator++(int) { + auto old = *this; + offset_++; + return old; + } + auto &operator--() { + offset_--; + return *this; + } + auto operator--(int) { + auto old = *this; + offset_--; + return old; + } + + bool operator==(iterator other) const { + if (parent_ == nullptr || other.parent_ == nullptr) { + return false; + } else { + return offset_ == other.offset_; + } + } + auto operator<=>(iterator other) const { + assert(parent_ == other.parent_); + return offset_ <=> other.offset_; + } + + auto operator*() const { + auto segment_size = parent_->segment_size_; + return parent_ + ->segments()[offset_ / segment_size][offset_ % segment_size]; + } + auto operator[](difference_type n) const { return *(*this + n); } + + auto local() { + auto segment_size = parent_->segment_size_; + return (parent_->segments()[offset_ / segment_size].begin() + + offset_ % segment_size) + .local(); + } + + // + // Support for distributed ranges + // + // distributed iterator provides segments + // remote iterator provides local + // + auto segments() { + return dr::__detail::drop_segments(parent_->segments(), offset_); + } + + private: + const dual_distributed_vector *parent_ = nullptr; + difference_type offset_; + }; + + // Do not copy + // We need a move constructor for the implementation of reduce algorithm + dual_distributed_vector(const dual_distributed_vector &) = delete; + dual_distributed_vector &operator=(const dual_distributed_vector &) = delete; + dual_distributed_vector(dual_distributed_vector &&) { assert(false); } + + /// Constructor + dual_distributed_vector(std::size_t size = 0, + distribution dist = distribution()) { + init(size, dist); + } + + /// Constructor + dual_distributed_vector(std::size_t size, value_type fill_value, + distribution dist = distribution()) { + init(size, dist); + mp::fill(*this, fill_value); + } + + ~dual_distributed_vector() { + if (finalized()) return; + + for (size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) { + fence(i); + + if (datas_[i] != nullptr) { + backends_[i].deallocate(datas_[i], data_size_ * sizeof(value_type)); + } + + delete halos_[i]; + } + + delete halo_; + } + + /// Returns iterator to beginning + auto begin() const { return iterator(this, 0); } + /// Returns iterator to end + auto end() const { return begin() + size_; } + + /// Returns size + auto size() const { return size_; } + /// Returns reference using index + auto operator[](difference_type n) const { return *(begin() + n); } + + auto &halo() const { return *halo_; } + + auto segments() const { return rng::views::all(segments_); } + auto segments() { return rng::views::all(segments_); } + + __attribute__((unused)) + void fence(const std::size_t i) { backends_[i].fence(); } + + auto res_idx(const std::size_t segment_index) const { + return segment_index < default_comm().size() ? 0 : 1; + } + + backend_type& backend(const std::size_t segment_index) { + return backends_[res_idx(segment_index)]; + } + const backend_type& backend(const std::size_t segment_index) const { + return backends_[res_idx(segment_index)]; + } + + T *data(const std::size_t segment_index) { + return datas_[res_idx(segment_index)]; + } + + std::size_t data_size() const { return data_size_; } + +private: + void init(auto size, auto dist) { + size_ = size; + distribution_ = dist; + + // determine the distribution of data + auto comm_size = default_comm().size(); // dr-style ignore + auto hb = dist.halo(); + std::size_t gran = dist.granularity(); + // TODO: make this an error that is reported back to user + assert(size % gran == 0 && "size must be a multiple of the granularity"); + assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); + assert(hb.next % gran == 0 && "size must be a multiple of the granularity"); + + std::size_t segment_count = comm_size * DUAL_SEGMENTS_PER_PROC; + auto proc_segments_size = gran * std::max({ + (size / gran + segment_count - 1) / segment_count, + hb.prev / gran, + hb.next / gran}); + segment_size_ = proc_segments_size; + + std::size_t actual_segment_count_ = + size_ / segment_size_ + (size_ % segment_size_ == 0 ? 0 : 1); + assert(actual_segment_count_ <= segment_count + && "there must be at most 2 segments per process"); + + data_size_ = segment_size_ + hb.prev + hb.next; + + for (std::size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) { + if (size_ > 0) { + datas_.push_back(static_cast(backends_[i].allocate(data_size_ * sizeof(value_type)))); + std::memset(datas_[i], 69, data_size_ * sizeof(value_type)); + halos_.push_back(new dual_span_halo(default_comm(), datas_[i], data_size_, hb, i == 1)); + } + } + + halo_ = new cyclic_span_halo(halos_); + + std::size_t segment_index = 0; + for (std::size_t i = 0; i < size; i += segment_size_) { + segments_.emplace_back(this, segment_index++, + std::min(segment_size_, size - i), data_size_); + } + + for (size_t i = 0; i < default_comm().size(); i++) { + segments_[default_comm().size() + i].swap_state(); + } + + for (size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) { + fence(i); + } + } + + friend dual_dv_segment_iterator; + + std::size_t segment_size_ = 0; + std::size_t data_size_ = 0; // size + halo + + std::vector *> halos_; + std::vector datas_; + cyclic_span_halo *halo_; + + distribution distribution_; + std::size_t size_; + std::vector> segments_; + std::vector backends_{DUAL_SEGMENTS_PER_PROC}; +}; + +template +auto &halo(const dual_distributed_vector &dv) { + return dv.halo(); +} + +} // namespace dr::mp diff --git a/include/dr/mp/containers/dual_segment.hpp b/include/dr/mp/containers/dual_segment.hpp new file mode 100644 index 0000000000..c45919d9fd --- /dev/null +++ b/include/dr/mp/containers/dual_segment.hpp @@ -0,0 +1,278 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template class dual_dv_segment_iterator; + +template class dual_dv_segment_reference { + using iterator = dual_dv_segment_iterator; + +public: + using value_type = typename DV::value_type; + + dual_dv_segment_reference(const iterator it) : iterator_(it) {} + + operator value_type() const { return iterator_.get(); } + auto operator=(const value_type &value) const { + iterator_.put(value); + return *this; + } + auto operator=(const dual_dv_segment_reference &other) const { + *this = value_type(other); + return *this; + } + auto operator&() const { return iterator_; } + +private: + const iterator iterator_; +}; // dual_dv_segment_reference + +template class dual_dv_segment_iterator { +public: + using value_type = typename DV::value_type; + using size_type = typename DV::size_type; + using difference_type = typename DV::difference_type; + + dual_dv_segment_iterator() = default; + dual_dv_segment_iterator(DV *dv, std::size_t segment_index, std::size_t index) { + dv_ = dv; + segment_index_ = segment_index; + index_ = index; + } + + auto operator<=>(const dual_dv_segment_iterator &other) const noexcept { + // assertion below checks against compare dereferenceable iterator to a + // singular iterator and against attempt to compare iterators from different + // sequences like _Safe_iterator does + assert(dv_ == other.dv_); + return segment_index_ == other.segment_index_ + ? index_ <=> other.index_ + : segment_index_ <=> other.segment_index_; + } + + // Comparison + bool operator==(const dual_dv_segment_iterator &other) const noexcept { + return (*this <=> other) == 0; + } + + // Only this arithmetic manipulate internal state + auto &operator+=(difference_type n) { + assert(dv_ != nullptr); + assert(n >= 0 || static_cast(index_) >= -n); + index_ += n; + return *this; + } + + auto &operator-=(difference_type n) { return *this += (-n); } + + difference_type operator-(const dual_dv_segment_iterator &other) const noexcept { + assert(dv_ != nullptr && dv_ == other.dv_); + assert(index_ >= other.index_); + return index_ - other.index_; + } + + // prefix + auto &operator++() { + *this += 1; + return *this; + } + auto &operator--() { + *this -= 1; + return *this; + } + + // postfix + auto operator++(int) { + auto prev = *this; + *this += 1; + return prev; + } + auto operator--(int) { + auto prev = *this; + *this -= 1; + return prev; + } + + auto operator+(difference_type n) const { + auto p = *this; + p += n; + return p; + } + auto operator-(difference_type n) const { + auto p = *this; + p -= n; + return p; + } + + // When *this is not first in the expression + friend auto operator+(difference_type n, const dual_dv_segment_iterator &other) { + return other + n; + } + + // dereference + auto operator*() const { + assert(dv_ != nullptr); + return dual_dv_segment_reference{*this}; + } + auto operator[](difference_type n) const { + assert(dv_ != nullptr); + return *(*this + n); + } + + void get(value_type *dst, std::size_t size) const { + assert(dv_ != nullptr); + assert(segment_index_ * dv_->segment_size_ + index_ < dv_->size()); + auto segment_offset = index_ + dv_->distribution_.halo().prev; + dv_->backend(segment_index_).getmem(dst, segment_offset * sizeof(value_type), + size * sizeof(value_type), segment_index_); + } + + value_type get() const { + value_type val; + get(&val, 1); + return val; + } + + void put(const value_type *dst, std::size_t size) const { + assert(dv_ != nullptr); + assert(segment_index_ * dv_->segment_size_ + index_ < dv_->size()); + auto segment_offset = index_ + dv_->distribution_.halo().prev; + dr::drlog.debug("dv put:: ({}:{}:{})\n", segment_index_, segment_offset, + size); + dv_->backend(segment_index_).putmem(dst, segment_offset * sizeof(value_type), + size * sizeof(value_type), segment_index_); + } + + void put(const value_type &value) const { put(&value, 1); } + + auto rank() const { + assert(dv_ != nullptr); + + if (segment_index_ < default_comm().size()) { + return segment_index_; + } + + return 2 * default_comm().size() - segment_index_ - 1; + } + + auto local() const { +#ifndef SYCL_LANGUAGE_VERSION + assert(dv_ != nullptr); +#endif + const auto my_process_rank = dv_->backend(segment_index_).getrank(); + const bool is_left_segment = segment_index_ < default_comm().size(); + const auto normalized_segment_index = is_left_segment + ? segment_index_ + : 2 * default_comm().size() - segment_index_ - 1; + const auto data = dv_->data(segment_index_); + + const bool is_in_bounds = my_process_rank == normalized_segment_index; + const bool is_in_halo_prev = is_left_segment + ? my_process_rank + 1 == normalized_segment_index + : my_process_rank == normalized_segment_index + 1; + const bool is_in_halo_next = is_left_segment + ? my_process_rank == normalized_segment_index + 1 + : my_process_rank + 1 == normalized_segment_index; + + if (is_in_bounds) { + assert(!is_in_halo_prev && !is_in_halo_next); + return data + index_ + dv_->distribution_.halo().prev; + } +#ifndef SYCL_LANGUAGE_VERSION + assert(!dv_->distribution_.halo().periodic); // not implemented +#endif + // sliding view needs local iterators that point to the halo + if (is_in_halo_prev) { + assert(!is_in_bounds && !is_in_halo_next); +#ifndef SYCL_LANGUAGE_VERSION + assert(index_ <= dv_->distribution_.halo().next); // <= instead of < to cover end() case +#endif + return data + dv_->distribution_.halo().prev + index_ + dv_->segment_size_; + } + + if (is_in_halo_next) { + assert(!is_in_bounds && !is_in_halo_prev); +#ifndef SYCL_LANGUAGE_VERSION + assert(dv_->segment_size_ - index_ <= dv_->distribution_.halo().prev); +#endif + return data + dv_->distribution_.halo().prev + index_ - dv_->segment_size_; + } + +#ifndef SYCL_LANGUAGE_VERSION + assert(false); // trying to read non-owned memory +#endif + return static_castdata(segment_index_))>(nullptr); + } + + auto segments() const { + assert(dv_ != nullptr); + return dr::__detail::drop_segments(dv_->segments(), segment_index_, index_); + } + + auto &halo() const { + assert(dv_ != nullptr); + return dv_->halo(); + } + auto halo_bounds() const { + assert(dv_ != nullptr); + return dv_->distribution_.halo(); + } + +private: + // all fields need to be initialized by default ctor so every default + // constructed iter is equal to any other default constructed iter + DV *dv_ = nullptr; + std::size_t segment_index_ = 0; + std::size_t index_ = 0; +}; // dual_dv_segment_iterator + +template class dual_dv_segment { +private: + using iterator = dual_dv_segment_iterator; + +public: + using difference_type = std::ptrdiff_t; + dual_dv_segment() = default; + dual_dv_segment(DV *dv, std::size_t segment_index, std::size_t size, + std::size_t reserved) { + dv_ = dv; + segment_index_ = segment_index; + size_ = size; + reserved_ = reserved; + assert(dv_ != nullptr); + } + + auto size() const { + assert(dv_ != nullptr); + return size_; + } + + auto begin() const { return iterator(dv_, segment_index_, 0); } + auto end() const { return begin() + size(); } + auto reserved() const { return reserved_; } + + auto operator[](difference_type n) const { return *(begin() + n); } + + bool is_local() const { + auto rank = default_comm().rank(); + return segment_index_ == rank + || segment_index_ == 2 * default_comm().size() - rank - 1; + } + + bool is_compute() const { return _is_compute; } + + void swap_state() { _is_compute = !_is_compute; } + +private: + bool _is_compute = true; + DV *dv_ = nullptr; + std::size_t segment_index_; + std::size_t size_; + std::size_t reserved_; +}; // dual_dv_segment + +} // namespace dr::mp diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp index 7f7b7dbdb1..283d9d4a68 100644 --- a/include/dr/mp/halo.hpp +++ b/include/dr/mp/halo.hpp @@ -394,13 +394,13 @@ class span_halo : public span_halo_impl { std::vector owned; DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); if (hb.next > 0 && (hb.periodic || !comm.first())) { - owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), - halo_tag::reverse); + owned.emplace_back(span.subspan(hb.prev, hb.next), + comm.prev(), halo_tag::reverse); } if (hb.prev > 0 && (hb.periodic || !comm.last())) { owned.emplace_back( span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), - comm.next(), halo_tag::forward); + comm.next(), halo_tag::forward); } return owned; } @@ -409,15 +409,155 @@ class span_halo : public span_halo_impl { halo_groups(communicator comm, std::span span, halo_bounds hb) { std::vector halo; if (hb.prev > 0 && (hb.periodic || !comm.first())) { - halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); + halo.emplace_back(span.first(hb.prev), + comm.prev(), halo_tag::forward); } if (hb.next > 0 && (hb.periodic || !comm.last())) { - halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse); + halo.emplace_back(span.last(hb.next), + comm.next(), halo_tag::reverse); } return halo; } }; +template > +class dual_span_halo : public span_halo_impl { +public: + using group_type = span_group; + + dual_span_halo() : span_halo_impl(communicator(), {}, {}) {} + + dual_span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb, bool rev = false) + : span_halo_impl(comm, owned_groups(comm, {data, size}, hb, rev), + halo_groups(comm, {data, size}, hb, rev)) { + check(size, hb); + } + + dual_span_halo(communicator comm, std::span span, halo_bounds hb, bool rev = false) + : span_halo_impl(comm, owned_groups(comm, span, hb, rev), + halo_groups(comm, span, hb, rev)) {} + +private: + void check(auto size, auto hb) { + assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); + } + + static std::vector + owned_groups(communicator comm, std::span span, halo_bounds hb, bool rev) { + std::vector owned; + + bool should_make_left = hb.next > 0 && (hb.periodic || !(rev ? comm.last() : comm.first())); + bool should_make_right = hb.prev > 0 && (hb.periodic || !(rev ? comm.first() : comm.last())); + + DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); + if (should_make_left) { + owned.emplace_back(span.subspan(hb.prev, hb.next), + rev ? comm.next() : comm.prev(), + rev ? halo_tag::forward : halo_tag::reverse); + } + if (should_make_right) { + owned.emplace_back( + span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), + rev ? comm.prev() : comm.next(), + rev ? halo_tag::reverse : halo_tag::forward); + } + return owned; + } + + static std::vector + halo_groups(communicator comm, std::span span, halo_bounds hb, bool rev) { + std::vector halo; + + bool should_make_left = hb.prev > 0 && (hb.periodic || !(rev ? comm.last() : comm.first())); + bool should_make_right = hb.next > 0 && (hb.periodic || !(rev ? comm.first() : comm.last())); + + if (should_make_left) { + halo.emplace_back(span.first(hb.prev), + rev ? comm.next() : comm.prev(), + rev ? halo_tag::reverse : halo_tag::forward); + } + if (should_make_right) { + halo.emplace_back(span.last(hb.next), + rev ? comm.prev() : comm.next(), + rev ? halo_tag::forward : halo_tag::reverse); + } + return halo; + } +}; + +template > +class cyclic_span_halo { +public: + using group_type = span_group; + using halo_type = dual_span_halo; + + cyclic_span_halo(const std::vector& halos) + : halos_(halos) { + for (const auto& halo : halos_) { + assert(halo != nullptr); + } + } + + void partial_exchange_begin() { + halos_[next_comm_index_]->exchange_begin(); + // increment_index(); + } + + void partial_exchange_finalize() { + halos_[next_comm_index_]->exchange_finalize(); + increment_index(); + } + + void partial_exchange() { + halos_[next_comm_index_]->exchange(); + increment_index(); + } + + void exchange() { + for (const auto &halo: halos_) { + halo->exchange(); + } + } + + void exchange_begin() { + for (const auto &halo: halos_) { + halo->exchange_begin(); + } + } + + void exchange_finalize() { + for (const auto &halo: halos_) { + halo->exchange_finalize(); + } + } + + void reduce_begin() { + halos_[next_comm_index_]->reduce_begin(); + } + + void reduce_finalize(const auto &op) { + halos_[next_comm_index_]->reduce_finalize(op); + //increment_index(); + } + + void reduce_finalize() { + halos_[next_comm_index_]->reduce_finalize(); + //increment_index(); + } + + void swap() { + increment_index(); + } + +private: + void increment_index() { + next_comm_index_ = (next_comm_index_ + 1) % halos_.size(); + } + + std::vector halos_; + std::size_t next_comm_index_ = 0; +}; + } // namespace dr::mp #ifdef DR_FORMAT diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 32f26d120a..e2e7a37511 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -36,7 +36,9 @@ add_executable( communicator.cpp copy.cpp distributed_vector.cpp + dual_distributed_vector.cpp halo.cpp + halo-dual.cpp mdstar.cpp mpsort.cpp reduce.cpp @@ -58,11 +60,13 @@ add_executable( add_executable(mp-quick-test mp-tests.cpp - ../common/count.cpp + ../common/iota.cpp + # ../common/for_each.cpp + # dual_distributed_vector.cpp ) # cmake-format: on -target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST) +target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST DR_FORMAT) foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test) if(ENABLE_ISHMEM) @@ -78,6 +82,8 @@ endforeach() if(NOT ENABLE_ISHMEM) add_mp_ctest(NAME mp-quick-test NPROC 1) add_mp_ctest(NAME mp-quick-test NPROC 2) + add_mp_ctest(NAME mp-quick-test NPROC 3) + add_mp_ctest(NAME mp-quick-test NPROC 4) cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) diff --git a/test/gtest/mp/dual_distributed_vector.cpp b/test/gtest/mp/dual_distributed_vector.cpp new file mode 100644 index 0000000000..d31ab5c07a --- /dev/null +++ b/test/gtest/mp/dual_distributed_vector.cpp @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +using T = int; +using DV = dr::mp::dual_distributed_vector; +using DVI = typename DV::iterator; + +TEST(MpTests, DualDistributedVectorQuery) { + const int n = 10; + DV a(n); + + EXPECT_EQ(a.size(), n); +} + +TEST(MpTests, DualDistributedVectorIndex) { + const std::size_t n = 10; + DV dv(n); + + if (comm_rank == 0) { + for (std::size_t i = 0; i < n; i++) { + dv[i] = i + 10; + } + } + dr::mp::fence(); + + for (std::size_t i = 0; i < n; i++) { + EXPECT_EQ(dv[i], i + 10); + } + + DV dv2(n); + + if (comm_rank == 0) { + dv2[3] = 1000; + dv2[3] = dv[3]; + } + dr::mp::fence(); + EXPECT_EQ(dv2[3], dv[3]); +} + +TEST(MpTests, DualDistributedVectorAlgorithms) { + const std::size_t n = 10; + const int root = 0; + DV dv(n); + + if (comm_rank == root) { + std::vector ref(n); + std::iota(ref.begin(), ref.end(), 1); + + std::iota(dv.begin(), dv.end(), 1); + + EXPECT_TRUE(equal_gtest(dv, ref)); + + std::iota(ref.begin(), ref.end(), 11); + std::copy(ref.begin(), ref.end(), dv.begin()); + EXPECT_TRUE(equal_gtest(dv, ref)); + + std::iota(ref.begin(), ref.end(), 21); + rng::copy(ref, dv.begin()); + EXPECT_TRUE(equal_gtest(dv, ref)); + + std::iota(dv.begin(), dv.end(), 31); + rng::copy(dv, ref.begin()); + EXPECT_TRUE(equal_gtest(dv, ref)); + } +} + +int aa; + +// Operations on a const distributed_vector +void common_operations(auto &dv) { + aa = dv[0]; + EXPECT_EQ(dv[0], 100); + EXPECT_EQ(*(&(dv[0])), 100); + + auto p = &dv[0]; + EXPECT_EQ(*(p + 1), 101); +} + +TEST(MpTests, DualDistributedVectorReference) { + std::size_t n = 10; + DV dv(n); + if (comm_rank == 0) { + rng::iota(dv, 100); + } + dr::mp::fence(); + + const DV &cdv = dv; + if (comm_rank == 0) { + common_operations(cdv); + common_operations(dv); + } + MPI_Barrier(comm); + + if (comm_rank == 0) { + dv[2] = 2; + } + dr::mp::fence(); + EXPECT_EQ(dv[2], 2); +} + +TEST(MpTests, DualDistributedVectorGranularity) { + std::size_t gran = 3; + std::size_t n = gran * 6; + auto dist = dr::mp::distribution().granularity(gran); + DV dv(n, dist); + + std::size_t previous_size = gran; + for (auto &segment : dr::ranges::segments(dv)) { + EXPECT_EQ(previous_size % gran, 0); + previous_size = segment.size(); + } +} diff --git a/test/gtest/mp/halo-dual.cpp b/test/gtest/mp/halo-dual.cpp new file mode 100644 index 0000000000..7617c4cdd1 --- /dev/null +++ b/test/gtest/mp/halo-dual.cpp @@ -0,0 +1,351 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +#include +#include +#include +#include +#include +#include + +template class HaloDual : public testing::Test {}; + +TYPED_TEST_SUITE(HaloDual, ::testing::Types>); + +template +void check_matching(DV &dv, int idx, int expected_value) { + typename DV::value_type *local_ptr = (dv.begin() + idx).local(); + EXPECT_TRUE(local_ptr != nullptr); + typename DV::value_type value_on_host; + + if (dr::mp::use_sycl()) + dr::mp::__detail::sycl_copy(local_ptr, &value_on_host); + else + value_on_host = *local_ptr; + + DRLOG("checking idx:{} expected:{}", idx, expected_value); + EXPECT_EQ(value_on_host, expected_value); +} + +template +void local_is_accessible_in_halo_region(const int halo_prev, + const int halo_next) { + + DV dv(12, dr::mp::distribution().halo(halo_prev, halo_next)); + DRLOG("local_is_accessible_in_halo_region TEST START, prev:{}, next:{}", + halo_prev, halo_next); + iota(dv, 0); + DRLOG("exchange start"); + + dv.halo().exchange(); + + // arrays below is function depending on size of communicator-1 + std::array first_local_index____; + std::array first_nonlocal_index_; + std::array second_local_index___; + std::array second_nonlocal_index; + const int X = 10000; // to mark unused value + + switch (dr::mp::default_comm().rank()) { + case 0: + first_local_index____ = {+0, +0, +0, 0}; + first_nonlocal_index_ = {12, +3, +2, 2}; + second_local_index___ = {+X, +9, 10, X}; + second_nonlocal_index = {+X, 12, 12, X}; + break; + case 1: + first_local_index____ = {X, 3, +2, 2}; + first_nonlocal_index_ = {X, 9, +4, 4}; + second_local_index___ = {X, X, +8, X}; + second_nonlocal_index = {X, X, 10, X}; + break; + case 2: + first_local_index____ = {X, X, 4, +4}; + first_nonlocal_index_ = {X, X, 8, +6}; + second_local_index___ = {X, X, X, 10}; + second_nonlocal_index = {X, X, X, 12}; + break; + case 3: + first_local_index____ = {X, X, X, +6}; + first_nonlocal_index_ = {X, X, X, 10}; + second_local_index___ = {X, X, X, +X}; + second_nonlocal_index = {X, X, X, +X}; + break; + default: + first_local_index____ = {X, X, X, X}; + first_nonlocal_index_ = {X, X, X, X}; + second_local_index___ = {X, X, X, X}; + second_nonlocal_index = {X, X, X, X}; + } + + const auto c = dr::mp::default_comm().size() - 1; + auto first_legal_idx = std::max(0, first_local_index____[c] - halo_prev); + auto first_illegal_idx = std::min(12, first_nonlocal_index_[c] + halo_next); + auto second_legal_idx = std::max(0, second_local_index___[c] - halo_prev); + auto second_illegal_idx = std::min(12, second_nonlocal_index[c] + halo_next); + + DRLOG("checking access to idx between first legal {} and first illegal {}, " + "c:{}", + first_legal_idx, first_illegal_idx, c); + + for (int idx = first_legal_idx; idx < first_illegal_idx; ++idx) { + check_matching(dv, idx, idx); + } + + DRLOG("checking access to idx between second legal {} and second illegal {}, " + "c:{}", + second_legal_idx, second_illegal_idx, c); + + for (int idx = second_legal_idx; idx < second_illegal_idx; ++idx) { + check_matching(dv, idx, idx); + } + + DRLOG("checks ok"); +} + +TYPED_TEST(HaloDual, local_is_accessible_in_halo_region_halo_11) { + local_is_accessible_in_halo_region(1, 1); +} + +TYPED_TEST(HaloDual, local_is_accessible_in_halo_region_halo_10) { + local_is_accessible_in_halo_region(1, 0); +} + +TYPED_TEST(HaloDual, local_is_accessible_in_halo_region_halo_01) { + local_is_accessible_in_halo_region(0, 1); +} + +template +void local_is_accessible_in_halo_region__partial(const int halo_prev, + const int halo_next) { + + DV dv(12, dr::mp::distribution().halo(halo_prev, halo_next)); + DRLOG("local_is_accessible_in_halo_region TEST START, prev:{}, next:{}", + halo_prev, halo_next); + iota(dv, 0); + DRLOG("exchange start"); + + dv.halo().exchange(); + + // arrays below is function depending on size of communicator-1 + std::array first_segment_begin_; + std::array first_segment_end___; + std::array second_segment_begin; + std::array second_segment_end__; + const int X = 10000; // to mark unused value + + switch (dr::mp::default_comm().rank()) { + case 0: + first_segment_begin_ = {+0, +0, +0, 0}; + first_segment_end___ = {+6, +3, +2, 2}; + second_segment_begin = {+6, +9, 10, X}; + second_segment_end__ = {12, 12, 12, X}; + break; + case 1: + first_segment_begin_ = {X, 3, +2, 2}; + first_segment_end___ = {X, 6, +4, 4}; + second_segment_begin = {X, 6, +8, X}; + second_segment_end__ = {X, 9, 10, X}; + break; + case 2: + first_segment_begin_ = {X, X, 4, +4}; + first_segment_end___ = {X, X, 6, +6}; + second_segment_begin = {X, X, 6, 10}; + second_segment_end__ = {X, X, 8, 12}; + break; + case 3: + first_segment_begin_ = {X, X, X, +6}; + first_segment_end___ = {X, X, X, +8}; + second_segment_begin = {X, X, X, +8}; + second_segment_end__ = {X, X, X, 10}; + break; + default: + first_segment_begin_ = {X, X, X, X}; + first_segment_end___ = {X, X, X, X}; + second_segment_begin = {X, X, X, X}; + second_segment_end__ = {X, X, X, X}; + } + + const auto c = dr::mp::default_comm().size() - 1; + auto first_legal_idx = std::max(0, first_segment_begin_[c] - halo_prev); + auto first_illegal_idx = std::min(12, first_segment_end___[c] + halo_next); + auto second_legal_idx = std::max(0, second_segment_begin[c] - halo_prev); + auto second_illegal_idx = std::min(12, second_segment_end__[c] + halo_next); + + if (first_segment_end___[c] == second_segment_begin[c]) { + // we own the middle segment + first_illegal_idx = std::min(12, first_segment_end___[c]); + second_legal_idx = std::max(0, second_segment_begin[c]); + } + + constexpr size_t N_STEPS = 5; + auto foreach_fn = [](auto&& elem) { elem *= 10; }; + int expected_multiplier = 1; + + for (size_t i = 0; i < N_STEPS; i++) { + expected_multiplier *= 10; + + partial_for_each(dv, foreach_fn); + dv.halo().partial_exchange(); + + for (int idx = first_legal_idx; idx < first_illegal_idx; ++idx) { + check_matching(dv, idx, idx * expected_multiplier); + } + + partial_for_each(dv, foreach_fn); + dv.halo().partial_exchange(); + + for (int idx = second_legal_idx; idx < second_illegal_idx; ++idx) { + check_matching(dv, idx, idx * expected_multiplier); + } + } + + DRLOG("checks ok"); +} + +TYPED_TEST(HaloDual, local_is_accessible_in_halo_region_halo_11__partial) { + local_is_accessible_in_halo_region__partial(0, 1); +} + +// perf test! + +[[maybe_unused]] +static constexpr size_t DISTRIBUTED_VECTOR_SIZE = 1000000; + +[[maybe_unused]] +static constexpr size_t N_STEPS = 1; // 100000; + +[[maybe_unused]] +auto stencil1d_subrange_op = [](auto ¢er) { + auto win = ¢er; + center = win[-1] + win[0] + win[1]; +}; + +[[maybe_unused]] +auto stencil1d_subrange_op__heavy = [](auto ¢er) { + auto win = ¢er; + auto result = win[-1] + win[0] + win[1]; + + for (int i = 1; i < 1000; i++) { + if (i % 2 == 0) { + result *= i; + } else { + result /= i; + } + } + + center = result; + return result; +}; + +void perf_test_dual(const auto& op) { + dr::mp::dual_distributed_vector dv(DISTRIBUTED_VECTOR_SIZE, dr::mp::distribution().halo(1, 1)); + DRLOG("perf_test_dual TEST START"); + iota(dv, 0); + DRLOG("exchange start"); + + auto start = std::chrono::high_resolution_clock::now(); + + dv.halo().exchange(); + + // auto dv_subrange = rng::subrange(dv.begin() + 1, dv.end() - 1); + + for (size_t i = 0; i < N_STEPS; i++) { + dv.halo().partial_exchange_begin(); + partial_for_each(dv, op); + dv.halo().partial_exchange_finalize(); + + dv.halo().partial_exchange_begin(); + partial_for_each(dv, op); + dv.halo().partial_exchange_finalize(); + } + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(end - start); + std::cout << "perf_test_dual results: \n\ttime: " << duration.count() << "us" << std::endl; +} + +void perf_test_classic(const auto& op) { + dr::mp::distributed_vector dv(DISTRIBUTED_VECTOR_SIZE, dr::mp::distribution().halo(1, 1)); + DRLOG("perf_test TEST START"); + iota(dv, 0); + DRLOG("exchange start"); + + auto start = std::chrono::high_resolution_clock::now(); + + dv.halo().exchange(); + + // auto dv_subrange = rng::subrange(dv.begin() + 1, dv.end() - 1); + + for (size_t i = 0; i < N_STEPS; i++) { + for_each(dv, op); + dv.halo().exchange(); + } + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(end - start); + std::cout << "perf_test results: \n\ttime: " << duration.count() << "us" << std::endl; +} + +TYPED_TEST(HaloDual, perf_test_dual_dv) { + perf_test_dual(stencil1d_subrange_op__heavy); +} + +TYPED_TEST(HaloDual, perf_test_classic_dv) { + perf_test_classic(stencil1d_subrange_op__heavy); +} + +// auto is_local = [](const auto &segment) { +// return dr::ranges::rank(segment) == dr::mp::default_comm().rank(); +// }; + +// auto perf_test_segment_lambda = [](auto ¢er) { center = center + 1; }; + +// void perf_test_dual_segment() { +// dr::mp::dual_distributed_vector dv(DISTRIBUTED_VECTOR_SIZE, dr::mp::distribution().halo(1, 1)); + +// auto start = std::chrono::high_resolution_clock::now(); + +// for (auto &seg : dr::ranges::segments(dv) | rng::views::filter(is_local)) { +// auto b = dr::ranges::local(rng::begin(seg)); +// auto s = rng::subrange(b, b + rng::distance(seg)); + +// for (size_t i = 0; i < N_STEPS; i++) { +// rng::for_each(s, perf_test_segment_lambda); +// } +// } + +// auto end = std::chrono::high_resolution_clock::now(); +// auto duration = duration_cast(end - start); +// std::cout << "perf_test_dual_segment results: \n\ttime: " << duration.count() << "us" << std::endl; +// } + +// void perf_test_classic_segment() { +// dr::mp::distributed_vector dv(DISTRIBUTED_VECTOR_SIZE, dr::mp::distribution().halo(1, 1)); + +// auto start = std::chrono::high_resolution_clock::now(); + +// for (auto &seg : dr::ranges::segments(dv) | rng::views::filter(is_local)) { +// auto b = dr::ranges::local(rng::begin(seg)); +// auto s = rng::subrange(b, b + rng::distance(seg)); + +// for (size_t i = 0; i < N_STEPS; i++) { +// rng::for_each(s, perf_test_segment_lambda); +// } +// } + +// auto end = std::chrono::high_resolution_clock::now(); +// auto duration = duration_cast(end - start); +// std::cout << "perf_test_classic_segment results: \n\ttime: " << duration.count() << "us" << std::endl; +// } + +// TYPED_TEST(HaloDual, perf_test_classic_dv_segment) { +// perf_test_classic_segment(); +// } + +// TYPED_TEST(HaloDual, perf_test_dual_dv_segment) { +// perf_test_dual_segment(); +// } diff --git a/test/gtest/mp/halo.cpp b/test/gtest/mp/halo.cpp index 255832a02a..12f9268133 100644 --- a/test/gtest/mp/halo.cpp +++ b/test/gtest/mp/halo.cpp @@ -17,6 +17,7 @@ void local_is_accessible_in_halo_region(const int halo_prev, halo_prev, halo_next); iota(dv, 0); DRLOG("exchange start"); + dv.halo().exchange(); // arrays below is function depending on size of communicator-1 diff --git a/test/gtest/mp/xp-tests.hpp b/test/gtest/mp/xp-tests.hpp index d4c14fc0e5..e2a1206a88 100644 --- a/test/gtest/mp/xp-tests.hpp +++ b/test/gtest/mp/xp-tests.hpp @@ -44,11 +44,15 @@ using AllTypes = using IshmemTypes = ::testing::Types>; #else -using AllTypes = ::testing::Types>; -using IshmemTypes = ::testing::Types>; +using AllTypes = ::testing::Types>; // , + //dr::mp::dual_distributed_vector>; +using IshmemTypes = ::testing::Types>; //, + // dr::mp::dual_distributed_vector>; #endif -using AllTypesWithoutIshmem = ::testing::Types>; +using AllTypesWithoutIshmem = + ::testing::Types>; //, + //dr::mp::dual_distributed_vector>; namespace dr::mp {