8000 Fix selectivity estimate based optimizer rule in cluster (#21678) · solisoft/arangodb@a15a339 · GitHub
[go: up one dir, main page]

Skip to content

Commit a15a339

Browse files
authored
Fix selectivity estimate based optimizer rule in cluster (arangodb#21678)
1 parent a4f8f3b commit a15a339

File tree

3 files changed

+72
-5
lines changed

3 files changed

+72
-5
lines changed

arangod/Aql/Optimizer/Rule/OptimizerRuleCollectWithIndex.cpp

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
///
1919
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
2020
////////////////////////////////////////////////////////////////////////////////
21+
#include "Cluster/ServerState.h"
2122
#include "Aql/Ast.h"
2223
#include "Aql/Collection.h"
2324
#include "Aql/Condition.h"
@@ -36,6 +37,7 @@
3637
#include "Aql/Query.h"
3738
#include "Indexes/Index.h"
3839
#include "Logger/LogMacros.h"
40+
#include "VocBase/LogicalCollection.h"
3941

4042
using namespace arangodb;
4143
using namespace arangodb::aql;
@@ -152,16 +154,35 @@ bool selectivityIsLowEnough(IndexNode const& in) {
152154
// assume there are n documents in the collection and we have
153155
// k distinct features. A linear search is in O(n) while a distinct scan
154156
// requires O(k log n). So checking for k log n < n, it follows
155-
// k / n < 1 / log n, where k / n is precisely the selectivity estimate.
156-
double selectivity = index->selectivityEstimate();
157+
// k / n < 1 / log n.
158+
// We cannot easily know the number of distinct features k, but we know the
159+
// index selectivity estimate k_index / n, which is an upper bound for k / n
160+
// because the distinct fields for the collect need to be a prefix of the
161+
// index fields.
162+
double index_selectivity = index->selectivityEstimate();
157163
auto numberOfItems = in.estimateCost().estimatedNrItems;
158164

159-
double const requiredSelectivity = 1. / log(numberOfItems);
165+
double requiredSelectivity;
166+
if (ServerState::instance()->isSingleServer()) {
167+
requiredSelectivity = 1. / log(numberOfItems);
168+
} else {
169+
// in cluster mode, we use the same equation as an approximation, although
170+
// actually
171+
// 1. each shard has its own selectivity estimate (the selectivity estimate
172+
// used here is an average over all shards)
173+
// 2. each shard includes a different number of documents (depending on the
174+
// sharding strategy used)
175+
// TODO compare the total costs of executing the optimized vs. the
176+
// non-optimized execution node, which involves getting the selectivity
177+
// estimate of each shard separately
178+
requiredSelectivity =
179+
1. / log(numberOfItems / index->collection().numberOfShards());
180+
}
160181

161-
if (selectivity > requiredSelectivity) {
182+
if (index_selectivity > requiredSelectivity) {
162183
LOG_RULE << "IndexNode " << in.id()
163184
<< " not eligible - selectivity is too high, actual = "
164-
<< selectivity << " max allowed = " << requiredSelectivity;
185+
<< index_selectivity << " max allowed = " << requiredSelectivity;
165186
return false;
166187
}
167188

arangod/RocksDBEngine/RocksDBVPackIndex.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3298,6 +3298,7 @@ RocksDBVPackIndex::distinctScanFor(
32983298
TRI_ASSERT(supportsDistinctScan(scanOptions));
32993299

33003300
std::vector<std::size_t> inverseFieldMapping;
3301+
// distinct fields can only include a prefix of the index fields
33013302
inverseFieldMapping.resize(scanOptions.distinctFields.size());
33023303
for (size_t k = 0; k < scanOptions.distinctFields.size(); k++) {
33033304
inverseFieldMapping[scanOptions.distinctFields[k]] = k;

tests/js/client/aql/aql-index-collect.js

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
const jsunity = require("jsunity");
2828
const { db } = require("@arangodb");
29+
const { waitForEstimatorSync } = require('@arangodb/test-helper');
2930

3031
const database = "IndexCollectDatabase";
3132
const collection = "c";
@@ -87,6 +88,50 @@ function IndexCollectOptimizerTestSuite() {
8788
}
8889
},
8990

91+
testOptimizerRuleAppliesWhenSelectivityIsLowEnough: function () {
92+
const c_new = db._create(collection + "1", { numberOfShards: 3 });
93+
94+
let docs = [];
95+
// number of documents for single server n := 6000
96+
// number of documents per shard for cluster n := 6000 / 3 = 2000
97+
for (let l = 0; l < 6000; l++) {
98+
docs.push({ a: l % 100 }); // number of distinct values k = 100
99+
}
100+
c_new.save(docs);
101+
c_new.ensureIndex({ type: "persistent", fields: ["a"] });
102+
waitForEstimatorSync();
103+
104+
// for optimizer rule to be applied k < n / log n
105+
// single server k < 263
106+
// cluster k < 689
107+
108+
// k := 100 distinct values, optimizer rule applies
109+
let explain = db._createStatement(`FOR doc IN ${c_new.name()} COLLECT a = doc.a RETURN a`).explain();
110+
assertTrue(explain.plan.rules.indexOf(indexCollectOptimizerRule) !== -1);
111+
},
112+
113+
testOptimerRuleDoesNotApplyWhenSelectivityIsTooHigh: function () {
114+
const c_new = db._create(collection + "2", { numberOfShards: 2 });
115+
116+
let docs = [];
117+
// number of documents for single server n := 6000
118+
// number of documents per shard for cluster n := 6000 / 3 = 2000
119+
for (let l = 0; l < 6000; l++) {
120+
docs.push({ a: l % 1000 }); // number of different values k = 1000
121+
}
122+
c_new.save(docs);
123+
c_new.ensureIndex({ type: "persistent", fields: ["a"] });
124+
waitForEstimatorSync();
125+
126+
// for optimizer rule to be applied k < n / log n
127+
// single server k < 263
128+
// cluster k < 689
129+
130+
// k := 1000 distinct values, optimizer rule does not apply
131+
let explain = db._createStatement(`FOR doc IN ${c_new.name()} COLLECT a = doc.a RETURN a`).explain();
132+
assertFalse(explain.plan.rules.indexOf(indexCollectOptimizerRule) !== -1);
133+
},
134+
90135
testCollectOptions: function () {
91136
const index = db[collection].ensureIndex({ type: "persistent", fields: ["a", "b", "c.d"], name: "foobar" });
92137
const queries = [

0 commit comments

Comments
 (0)
0