8000 APM-786 heap sort with stored values by Dronplane · Pull Request #19551 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

APM-786 heap sort with stored values #19551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
07b591f
wip
Dronplane Apr 25, 2023
899331c
wip
Dronplane Apr 26, 2023
a03e71e
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Jul 18, 2023
a1ea685
wip
Dronplane Jul 18, 2023
b1ba920
wip
Dronplane Aug 1, 2023
8e392ce
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 1, 2023
4584827
wip
Dronplane Aug 1, 2023
13d90f3
wip
Dronplane Aug 1, 2023
f62a7d5
wip
Dronplane Aug 3, 2023
dc44497
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 3, 2023
48442bc
wip
Dronplane Aug 7, 2023
c2a0b23
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 7, 2023
6995111
wip
Dronplane Aug 8, 2023
3227d21
wip
Dronplane Aug 8, 2023
de064a2
wip
Dronplane Aug 9, 2023
022d9da
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 9, 2023
9599662
wip
Dronplane Aug 9, 2023
8f8acff
8000 fixex for postfix case
Dronplane Aug 9, 2023
e53f502
cleanup
Dronplane Aug 9, 2023
e7589a9
fix build
Dronplane Aug 10, 2023
7227454
fix
Dronplane Aug 10, 2023
0b91f37
fix typename
Dronplane Aug 10, 2023
0cdf93b
clang-format
Dronplane Aug 10, 2023
a47eb93
add explicit inits
Dronplane Aug 10, 2023
b56a4ec
fix
Dronplane Aug 10, 2023
e4fc95a
review comments
Dronplane Aug 10, 2023
34b8946
get rid of values mapping
Dronplane Aug 11, 2023
3504ef2
remove redundant inits
Dronplane Aug 11, 2023
9b8f84f
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 11, 2023
9db2616
fix empty field case
Dronplane Aug 11, 2023
ec8b7b1
fix tests
Dronplane Aug 11, 2023
40dbd9e
Merge remote-tracking branch 'origin/devel' into feature/APM-786-heap…
Dronplane Aug 11, 2023
b1c9e09
fix asan build + fix slice iteration
Dronplane Aug 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 87 additions & 19 deletions arangod/Aql/IResearchViewExecutor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "Aql/AqlFunctionsInternalCache.h"
#include "Aql/RegisterInfos.h"
#include "Aql/VarInfoMap.h"
#include "Containers/FlatHashSet.h"
#include "IResearch/ExpressionFilter.h"
#include "IResearch/IResearchFilterFactory.h"
#include "IResearch/IResearchExpressionContext.h"
Expand Down Expand Up @@ -88,8 +89,8 @@ class IResearchViewExecutorInfos {
iresearch::IResearchViewNode::ViewValuesRegisters&&
outNonMaterializedViewRegs,
iresearch::CountApproximate, iresearch::FilterOptimization,
std::vector<std::pair<size_t, bool>> scorersSort, size_t scorersSortLimit,
iresearch::SearchMeta const* meta);
std::vector<iresearch::HeapSortElement> const& heapSort,
size_t heapSortLimit, iresearch::SearchMeta const* meta);

auto getDocumentRegister() const noexcept -> RegisterId;

Expand Down Expand Up @@ -132,9 +133,9 @@ class IResearchViewExecutorInfos {

iresearch::IResearchViewStoredValues const& storedValues() const noexcept;

size_t scoresSortLimit() const noexcept { return _scorersSortLimit; }
size_t heapSortLimit() const noexcept { return _heapSortLimit; }

auto scoresSort() const noexcept { return std::span{_scorersSort}; }
auto heapSort() const noexcept { return std::span{_heapSort}; }

size_t scoreRegistersCount() const noexcept { return _scoreRegistersCount; }

Expand All @@ -160,8 +161,8 @@ class IResearchViewExecutorInfos {
iresearch::IResearchViewNode::ViewValuesRegisters _outNonMaterializedViewRegs;
iresearch::CountApproximate _countApproximate;
iresearch::FilterOptimization _filterOptimization;
std::vector<std::pair<size_t, bool>> _scorersSort;
size_t _scorersSortLimit;
std::vector<iresearch::HeapSortElement> const& _heapSort;
size_t _heapSortLimit;
iresearch::SearchMeta const* _meta;
int const _depth;
bool _filterConditionIsEmpty;
Expand Down Expand Up @@ -246,6 +247,11 @@ class ScoreIterator {
size_t _numScores;
};

union HeapSortValue {
irs::score_t score;
velocypack::Slice slice;
};

// Holds and encapsulates the data read from the iresearch index.
template<typename ValueType, bool copyStored>
class IndexReadBuffer {
Expand Down Expand Up @@ -274,8 +280,25 @@ class IndexReadBuffer {

ScoreIterator getScores(IndexReadBufferEntry bufferEntry) noexcept;

void setScoresSort(std::span<std::pair<size_t, bool> const> s) noexcept {
_scoresSort = s;
void setHeapSort(std::span<iresearch::HeapSortElement const> s,
iresearch::IResearchViewNode::ViewValuesRegisters const&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make it "flat_map"/just sorted vector of pairs?

Copy link
Contributor Author
@Dronplane Dronplane Aug 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can think about this. But let's do it in a separate PR as this one is already big enough.

storedValues) noexcept {
_heapSort = s;
TRI_ASSERT(_heapUsedStoredColumns.empty());
TRI_ASSERT(_heapOnlyColumnsCount == 0);
for (auto const& c : _heapSort) {
if (c.isScore()) {
continue;
}
auto it = storedValues.find(c.source);
if (it != storedValues.end()) {
_heapUsedStoredColumns.emplace(
c.source,
static_cast<size_t>(std::distance(storedValues.begin(), it)));
} else {
++_heapOnlyColumnsCount;
}
}
}

irs::score_t* pushNoneScores(size_t count);
Expand All @@ -288,7 +311,9 @@ class IndexReadBuffer {
_searchDocs.emplace_back(segment, docId);
}

void pushSortedValue(StorageSnapshot const& snapshot, ValueType&& value,
template<typename ColumnReaderProvider>
void pushSortedValue(ColumnReaderProvider& columnReader,
StorageSnapshot const& snapshot, ValueType&& value,
std::span<float_t const> scores, irs::score& score,
irs::score_t& threshold);

Expand All @@ -315,14 +340,22 @@ class IndexReadBuffer {
// before and after.
void assertSizeCoherence() const noexcept;

size_t memoryUsage(size_t maxSize) const noexcept {
size_t memoryUsage(size_t maxSize, size_t scores,
size_t stored) const noexcept {
auto res =
maxSize * sizeof(typename decltype(_keyBuffer)::value_type) +
maxSize * sizeof(typename decltype(_scoreBuffer)::value_type) +
maxSize * sizeof(typename decltype(_searchDocs)::value_type) +
maxSize * sizeof(typename decltype(_storedValuesBuffer)::value_type);
if (!_scoresSort.empty()) {
scores * maxSize * sizeof(typename decltype(_scoreBuffer)::value_type) +
maxSize * sizeof(typename decltype(_searchDocs)::value_type);
if (!_heapSort.empty()) {
res += stored * maxSize *
sizeof(typename decltype(_storedValuesBuffer)::value_type);
res += maxSize * sizeof(typename decltype(_rows)::value_type);
res += _heapOnlyColumnsCount * maxSize *
sizeof(typename decltype(_heapOnlyStoredValuesBuffer)::value_type);
res += _heapOnlyColumnsCount *
sizeof(typename decltype(_currentDocumentBuffer)::value_type);
res += (maxSize * _heapSort.size()) *
sizeof(typename decltype(_heapSortValues)::value_type);
}
return res;
}
Expand All @@ -331,17 +364,21 @@ class IndexReadBuffer {
size_t stored) {
TRI_ASSERT(_storedValuesBuffer.empty());
if (_keyBuffer.capacity() < atMost) {
auto newMemoryUsage = memoryUsage(atMost);
auto newMemoryUsage = memoryUsage(atMost, scores, stored);
auto tracked = _memoryTracker.tracked();
if (newMemoryUsage > tracked) {
_memoryTracker.increase(newMemoryUsage - tracked);
}
_keyBuffer.reserve(atMost);
_searchDocs.reserve(atMost);
_scoreBuffer.reserve(atMost * scores);
_storedValuesBuffer.reserve(atMost * stored);
if (!_scoresSort.empty()) {
if (!_heapSort.empty()) {
_rows.reserve(atMost);
_currentDocumentBuffer.reserve(_heapSort.size());
// resize is important here as we want
// indexed access for setting values
_storedValuesBuffer.resize(atMost * stored);
_heapOnlyStoredValuesBuffer.resize(_heapOnlyColumnsCount * atMost);
}
}
_maxSize = atMost;
Expand Down Expand Up @@ -390,14 +427,44 @@ class IndexReadBuffer {
StorageSnapshot const* snapshot;
ValueType value;
};

template<typename ColumnReaderProvider>
velocypack::Slice readHeapSortColumn(iresearch::HeapSortElement const& cmp,
irs::doc_id_t doc,
ColumnReaderProvider& readerProvider,
size_t readerSlot);
template<bool fullHeap, typename ColumnReaderProvider>
void finalizeHeapSortDocument(size_t idx, irs::doc_id_t,
std::span<float_t const> scores,
ColumnReaderProvider& readerProvider);

std::vector<BufferValueType> _keyBuffer;
// FIXME(gnusi): compile time
std::vector<iresearch::SearchDoc> _searchDocs;
std::vector<irs::score_t> _scoreBuffer;
StoredValuesContainer _storedValuesBuffer;
StoredValuesContainer _storedValuesBuffer; // FIXME: ensure atMost rows are
// allocated from the beginning!

// Heap Sort facilities
// maps stored column to index in _storedValuesBuffer
containers::FlatHashMap<ptrdiff_t, size_t> _heapUsedStoredColumns;
// atMost * <num heap only values>
StoredValuesContainer _heapOnlyStoredValuesBuffer;
// <num heap only values>
std::vector<ColumnIterator> _heapOnlyStoredValuesReaders;
// <num heap only values>
StoredValuesContainer _currentDocumentBuffer;
using DocumentSlicesContainer =
typename std::conditional<copyStored, bool,
std::vector<velocypack::Slice>>::type;
DocumentSlicesContainer _currentDocumentSlices;
std::vector<HeapSortValue> _heapSortValues;
std::span<iresearch::HeapSortElement const> _heapSort;
size_t _heapOnlyColumnsCount{0};
size_t _currentReaderOffset{std::numeric_limits<size_t>::max()};

size_t _numScoreRegisters;
size_t _keyBaseIdx;
std::span<std::pair<size_t, bool> const> _scoresSort;
std::vector<size_t> _rows;
size_t _maxSize;
size_t _heapSizeLeft;
Expand Down Expand Up @@ -585,6 +652,7 @@ class IResearchViewExecutorBase {
irs::Scorers _scorers;
irs::WandContext _wand;
std::vector<ColumnIterator> _storedValuesReaders;
containers::FlatHashSet<ptrdiff_t> _storedColumnsMask;
std::array<char, arangodb::iresearch::kSearchDocBufSize> _buf;
bool _isInitialized;
// new mangling only:
Expand Down
Loading
0