|
18 | 18 | ///
|
19 | 19 | /// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
20 | 20 | ////////////////////////////////////////////////////////////////////////////////
|
| 21 | +#include "Cluster/ServerState.h" |
21 | 22 | #include "Aql/Ast.h"
|
22 | 23 | #include "Aql/Collection.h"
|
23 | 24 | #include "Aql/Condition.h"
|
|
36 | 37 | #include "Aql/Query.h"
|
37 | 38 | #include "Indexes/Index.h"
|
38 | 39 | #include "Logger/LogMacros.h"
|
| 40 | +#include "VocBase/LogicalCollection.h" |
39 | 41 |
|
40 | 42 | using namespace arangodb;
|
41 | 43 | using namespace arangodb::aql;
|
@@ -152,16 +154,35 @@ bool selectivityIsLowEnough(IndexNode const& in) {
|
152 | 154 | // assume there are n documents in the collection and we have
|
153 | 155 | // k distinct features. A linear search is in O(n) while a distinct scan
|
154 | 156 | // requires O(k log n). So checking for k log n < n, it follows
|
155 |
| - // k / n < 1 / log n, where k / n is precisely the selectivity estimate. |
156 |
| - double selectivity = index->selectivityEstimate(); |
| 157 | + // k / n < 1 / log n. |
| 158 | + // We cannot easily know the number of distinct features k, but we know the |
| 159 | + // index selectivity estimate k_index / n, which is an upper bound for k / n |
| 160 | + // because the distinct fields for the collect need to be a prefix of the |
| 161 | + // index fields. |
| 162 | + double index_selectivity = index->selectivityEstimate(); |
157 | 163 | auto numberOfItems = in.estimateCost().estimatedNrItems;
|
158 | 164 |
|
159 |
| - double const requiredSelectivity = 1. / log(numberOfItems); |
| 165 | + double requiredSelectivity; |
| 166 | + if (ServerState::instance()->isSingleServer()) { |
| 167 | + requiredSelectivity = 1. / log(numberOfItems); |
| 168 | + } else { |
| 169 | + // in cluster mode, we use the same equation as an approximation, although |
| 170 | + // actually |
| 171 | + // 1. each shard has its own selectivity estimate (the selectivity estimate |
| 172 | + // used here is an average over all shards) |
| 173 | + // 2. each shard includes a different number of documents (depending on the |
| 174 | + // sharding strategy used) |
| 175 | + // TODO compare the total costs of executing the optimized vs. the |
| 176 | + // non-optimized execution node, which involves getting the selectivity |
| 177 | + // estimate of each shard separately |
| 178 | + requiredSelectivity = |
| 179 | + 1. / log(numberOfItems / index->collection().numberOfShards()); |
| 180 | + } |
160 | 181 |
|
161 |
| - if (selectivity > requiredSelectivity) { |
| 182 | + if (index_selectivity > requiredSelectivity) { |
162 | 183 | LOG_RULE << "IndexNode " << in.id()
|
163 | 184 | << " not eligible - selectivity is too high, actual = "
|
164 |
| - << selectivity << " max allowed = " << requiredSelectivity; |
| 185 | + << index_selectivity << " max allowed = " << requiredSelectivity; |
165 | 186 | return false;
|
166 | 187 | }
|
167 | 188 |
|
|
0 commit comments