8000 fix index selection with prefix indexes by jsteemann · Pull Request #14424 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

fix index selection with prefix indexes #14424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 7, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG
8000
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
devel
-----

* Fixed an issue in index selection, when the selectivity estimate of another
prefix index was used without checking if the other index covered the
FILTER condition.

For example, given the following indexes:

- index 1: ["e", "a", "b", "c"]
- index 2: ["e", "a", "b"]
- index 3: ["d", "e", "f", "g"]

and the FILTER condition `d == 1 && e == 2 && f == 3`, then the best index
to pick would be index 3. However, the optimizer may have picked index 1
here.
All indexes are valid candidates for this FILTER condition, but none of the
indexes covered all attributes of the FILTER condition. So the index
selectivity estimates were (correctly) not used directly to determine the
best index.
The actual bug happened when comparing the usefulness of the candidate
indexes: although the selectivity estimate for index 1 could not be used
directly, the optimizer noticed that a prefix index of index 1 existed
(index 2). The selectivity estimate of that prefix index was then taken
_without_ checking that the prefix index actually satisfied the FILTER
condition fully.
The prefix index's selectivity estimate must only be used if it fully
satisfies the FILTER condition, which was not the case here.

* Added garbage collection for finished and failed Pregel conductors.
Previously, Pregel executions that finished successfully or unsuccessfully
remained in memory until being explicitly canceled. This prevented a
Expand Down
26 changes: 16 additions & 10 deletions arangod/Aql/OptimizerUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,17 +372,17 @@ std::pair<bool, bool> findIndexHandleForAndNode(
return;
}

double totalCost = filterCost;
if (!sortCondition.isEmpty()) {
// only take into account the costs for sorting if there is actually
// something to sort
if (supportsSort) {
totalCost += sortCost;
} else {
totalCost +=
Index::SortCosts::defaultCosts(itemsInIndex).estimatedCosts;
if (!supportsSort) {
sortCost = Index::SortCosts::defaultCosts(itemsInIndex).estimatedCosts;
}
} else {
sortCost = 0.0;
}

double totalCost = filterCost + sortCost;

// the more attributes an index contains, the more useful it will be for projections.
double projectionsFactor = 1.0 - ((idx->fields().size() - 1) * 0.02);
Expand All @@ -398,13 +398,13 @@ std::pair<bool, bool> findIndexHandleForAndNode(
<< ", supportsFilter: " << supportsFilter
<< ", supportsSort: " << supportsSort
<< ", projectionsFactor: " << projectionsFactor
<< ", filterCost: " << filterCost
<< ", sortCost: " << sortCost
<< ", totalCost: " << totalCost
<< ", isOnlyAttributeAccess: " << isOnlyAttributeAccess
<< ", isUnidirectional: " << sortCondition.isUnidirectional()
<< ", isOnlyEqualityMatch: " << node->isOnlyEqualityMatch()
<< ", itemsInIndex/estimatedItems: " << itemsInIndex;
<< ", itemsInIndex/estimatedItems: " << itemsInIndex
<< ", filterCost: " << filterCost
<< ", sortCost: " << sortCost
<< ", totalCost: " << totalCost;

if (bestIndex == nullptr || totalCost < bestCost) {
bestIndex = idx;
Expand Down Expand Up @@ -452,6 +452,12 @@ std::pair<bool, bool> findIndexHandleForAndNode(
return std::make_pair(false, false);
}

LOG_TOPIC("1d732", TRACE, Logger::FIXME)
<< "selected index: " << bestIndex.get()
<< ", isSorted: " << bestIndex->isSorted()
<< ", isSparse: " << bestIndex->sparse()
<< ", fields: " << bestIndex->fields().size();

// intentionally commented out here. can be enabled during development
// LOG_TOPIC("4b655", TRACE, Logger::FIXME) << "- picked: " << bestIndex.get();

Expand Down
50 changes: 39 additions & 11 deletions arangod/Indexes/SortedIndexAttributeMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,38 +333,66 @@ Index::FilterCosts SortedIndexAttributeMatcher::supportsFilterCondition(
// condition we have will make it even more selectivity. so in this case
// we will re-use the selectivity estimate from the other index, and are
// happy.
double otherEstimate = -1.0;

for (auto const& otherIdx : allIndexes) {
auto const* other = otherIdx.get();
if (other == idx || !other->hasSelectivityEstimate()) {
continue;
}
size_t matches = 0;
auto const& otherFields = other->fields();
if (otherFields.size() > idx->fields().size()) {
// filter out too long other indexes
continue;
}

size_t matches = 0;
for (size_t i = 0; i < otherFields.size(); ++i) {
if (otherFields[i] != idx->fields()[i]) {
break;
}
++matches;

if (matches > found.size()) {
break;
}
}

if (matches == otherFields.size()) {
double estimate = other->selectivityEstimate();
if (estimate > 0.0) {
// reuse the estimate from the other index
estimatedItems = static_cast<double>(1.0 / estimate * values);
} else {
// use a guesstimate
estimatedItems /= equalityReductionFactor;
// the other index is a full prefix of our own index.
// now check if the other index actually satisfies the filter condition
std::unordered_map<size_t, std::vector<arangodb::aql::AstNode const*>> foundOther;
[[maybe_unused]] size_t valuesOther = 0; // ignored here
matchAttributes(otherIdx.get(), node, reference, foundOther, valuesOther, nonNullAttributes, false);

auto [attributesCoveredOther, attributesCoveredByEqualityOther, equalityReductionFactorOther, nonEqualityReductionFactorOther] =
::analyzeConditions(otherIdx.get(), foundOther);

// all attributes from the other index must be covered with equality lookups,
// otherwise we cannot use the other index' selectivity estimate
if (foundOther.size() == matches && attributesCoveredByEqualityOther == matches) {
double estimate = other->selectivityEstimate();
if (estimate > 0.0 && estimate > otherEstimate) {
otherEstimate = estimate;
}
}
break;
}
}
}

if (otherEstimate > 0.0) {
// reuse the estimate from the other index
estimatedItems = static_cast<double>(1.0 / otherEstimate * values);
} else {
// use a guesstimate
estimatedItems /= equalityReductionFactor;
}

estimatedItems /= nonEqualityReductionFactor;
}

// costs.estimatedItems is always set here, make it at least 1
costs.estimatedItems = std::max(size_t(1), static_cast<size_t>(estimatedItems));

// seek cost is O(log(n))
costs.estimatedCosts = std::max(double(1.0),
std::log2(double(itemsInIndex)) * values);
Expand Down
58 changes: 57 additions & 1 deletion tests/js/server/aql/aql-index-choice.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ let setupCollection = function(cn, n) {
let c = db._create(cn);
let docs = [];
for (let i = 0; i < n; ++i) {
docs.push({ _key: "test" + i, uid: i, pid: (i % 100), dt: Date.now() - ((i * 10) % 10000) });
docs.push({ _key: "test" + i, uid: i, pid: (i % 100), dt: Date.now() - ((i * 10) % 10000), other: i });
if (docs.length === 5000) {
c.insert(docs);
docs = [];
Expand Down Expand Up @@ -67,6 +67,62 @@ function BaseTestConfig () {
tearDown: function() {
resetIndexes(cn);
},

testManyIndexesWithPrefixes: function() {
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other2"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other3", "other4", "other5"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other3", "other4"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other6", "other4", "other5"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other6", "other4"] });
db[cn].ensureIndex({ type: "persistent", fields: ["other5", "uid"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other6", "pid", "other5"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other6", "pid"] });
db[cn].ensureIndex({ type: "persistent", fields: ["pid", "dt", "uid", "other4"] });

[
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 FILTER doc.dt == 1234 FILTER doc.uid == 1234 RETURN doc`, ["pid", "dt", "uid", "other4"] ],
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 FILTER doc.dt == 1234 FILTER doc.uid == 1234 SORT doc.other5 RETURN doc`, ["pid", "dt", "uid", "other4"] ],
].forEach((q) => {
let nodes = AQL_EXPLAIN(q[0]).plan.nodes;
assertEqual(1, nodes.filter((n) => n.type === 'IndexNode').length);
let indexNode = nodes.filter((n) => n.type === 'IndexNode')[0];
assertFalse(indexNode.indexCoversProjections);
assertEqual(1, indexNode.indexes.length);
let index = indexNode.indexes[0];
assertEqual("persistent", index.type);
assertEqual(q[1], index.fields, q);
});
},

testIndexPrefixes: function() {
db[cn].ensureIndex({ type: "persistent", fields: ["pid"] });
db[cn].ensureIndex({ type: "persistent", fields: ["pid", "dt"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "dt"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "pid"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "pid", "dt"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "pid", "dt"] });
db[cn].ensureIndex({ type: "persistent", fields: ["dt", "other"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other"] });
db[cn].ensureIndex({ type: "persistent", fields: ["uid", "other", "pid"] });

[
[ `FOR doc IN ${cn} FILTER doc.uid == 1234 RETURN doc`, ["uid", "pid", "dt"] ],
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 RETURN doc`, ["pid", "dt"] ],
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 && doc.uid == 1234 RETURN doc`, ["uid", "pid", "dt"] ],
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 && doc.dt == 1234 RETURN doc`, ["pid", "dt"] ],
[ `FOR doc IN ${cn} FILTER doc.pid == 1234 && doc.dt == 1234 && doc.uid == 1234 RETURN doc`, ["uid", "pid", "dt"] ],
].forEach((q) => {
let nodes = AQL_EXPLAIN(q[0]).plan.nodes;
assertEqual(1, nodes.filter((n) => n.type === 'IndexNode').length);
let indexNode = nodes.filter((n) => n.type === 'IndexNode')[0];
assertFalse(indexNode.indexCoversProjections);
assertEqual(1, indexNode.indexes.length);
let index = indexNode.indexes[0];
assertEqual("persistent", index.type);
assertEqual(q[1], index.fields, q);
});
},

testSingleIndexSingleFilter: function() {
db[cn].ensureIndex({ type: "persistent", fields: ["uid"] });
Expand Down
0