solisoft
diff --git a/arangod/Aql/Optimizer/Rule/OptimizerRuleCollectWithIndex.cpp b/arangod/Aql/Optimizer/Rule/OptimizerRuleCollectWithIndex.cpp
Lines changed: 26 additions & 5 deletions
diff --git a/arangod/RocksDBEngine/RocksDBVPackIndex.cpp b/arangod/RocksDBEngine/RocksDBVPackIndex.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/tests/js/client/aql/aql-index-collect.js b/tests/js/client/aql/aql-index-collect.js
Lines changed: 45 additions & 0 deletions
@@ -18,6 +18,7 @@
 ///
 /// Copyright holder is ArangoDB GmbH, Cologne, Germany
 ////////////////////////////////////////////////////////////////////////////////
+#include "Cluster/ServerState.h"
 #include "Aql/Ast.h"
 #include "Aql/Collection.h"
 #include "Aql/Condition.h"
@@ -36,6 +37,7 @@
 #include "Aql/Query.h"
 #include "Indexes/Index.h"
 #include "Logger/LogMacros.h"
+#include "VocBase/LogicalCollection.h"
 
 using namespace arangodb;
 using namespace arangodb::aql;
@@ -152,16 +154,35 @@ bool selectivityIsLowEnough(IndexNode const& in) {
   // assume there are n documents in the collection and we have
   // k distinct features. A linear search is in O(n) while a distinct scan
   // requires O(k log n). So checking for k log n < n, it follows
-  // k / n < 1 / log n, where k / n is precisely the selectivity estimate.
-  double selectivity = index->selectivityEstimate();
+  // k / n < 1 / log n.
+  // We cannot easily know the number of distinct features k, but we know the
+  // index selectivity estimate k_index / n, which is an upper bound for k / n
+  // because the distinct fields for the collect need to be a prefix of the
+  // index fields.
+  double index_selectivity = index->selectivityEstimate();
   auto numberOfItems = in.estimateCost().estimatedNrItems;
 
-  double const requiredSelectivity = 1. / log(numberOfItems);
+  double requiredSelectivity;
+  if (ServerState::instance()->isSingleServer()) {
+    requiredSelectivity = 1. / log(numberOfItems);
+  } else {
+    // in cluster mode, we use the same equation as an approximation, although
+    // actually
+    // 1. each shard has its own selectivity estimate (the selectivity estimate
+    // used here is an average over all shards)
+    // 2. each shard includes a different number of documents (depending on the
+    // sharding strategy used)
+    // TODO compare the total costs of executing the optimized vs. the
+    // non-optimized execution node, which involves getting the selectivity
+    // estimate of each shard separately
+    requiredSelectivity =
+        1. / log(numberOfItems / index->collection().numberOfShards());
+  }
 
-  if (selectivity > requiredSelectivity) {
+  if (index_selectivity > requiredSelectivity) {
     LOG_RULE << "IndexNode " << in.id()
              << " not eligible - selectivity is too high, actual = "
-             << selectivity << " max allowed = " << requiredSelectivity;
+             << index_selectivity << " max allowed = " << requiredSelectivity;
     return false;
   }
 
 
@@ -3298,6 +3298,7 @@ RocksDBVPackIndex::distinctScanFor(
   TRI_ASSERT(supportsDistinctScan(scanOptions));
 
   std::vector<std::size_t> inverseFieldMapping;
+  // distinct fields can only include a prefix of the index fields
   inverseFieldMapping.resize(scanOptions.distinctFields.size());
   for (size_t k = 0; k < scanOptions.distinctFields.size(); k++) {
     inverseFieldMapping[scanOptions.distinctFields[k]] = k;
 
@@ -26,6 +26,7 @@
 
 const jsunity = require("jsunity");
 const { db } = require("@arangodb");
+const { waitForEstimatorSync } = require('@arangodb/test-helper');
 
 const database = "IndexCollectDatabase";
 const collection = "c";
@@ -87,6 +88,50 @@ function IndexCollectOptimizerTestSuite() {
       }
     },
 
+    testOptimizerRuleAppliesWhenSelectivityIsLowEnough: function () {
+      const c_new = db._create(collection + "1", { numberOfShards: 3 });
+
+      let docs = [];
+      // single server: all n := 6000 documents are in one place
+      // cluster: about n := 6000 / 3 = 2000 documents per shard (assuming an even distribution over the 3 shards)
+      for (let l = 0; l < 6000; l++) {
+        docs.push({ a: l % 100 }); // number of distinct values k = 100
+      }
+      c_new.save(docs);
+      c_new.ensureIndex({ type: "persistent", fields: ["a"] });
+      waitForEstimatorSync();
+
+      // for the optimizer rule to be applied, k < n / log n must hold (natural logarithm)
+      // single server (n = 6000): k < 689
+      // cluster (n = 2000 per shard): k < 263
+
+      // k := 100 distinct values is below both bounds, so the explain output must list the optimizer rule
+      let explain = db._createStatement(`FOR doc IN ${c_new.name()} COLLECT a = doc.a RETURN a`).explain();
+      assertTrue(explain.plan.rules.indexOf(indexCollectOptimizerRule) !== -1);
+    },
+
+    testOptimerRuleDoesNotApplyWhenSelectivityIsTooHigh: function () {
+      const c_new = db._create(collection + "2", { numberOfShards: 2 });
+
+      let docs = [];
+      // single server: all n := 6000 documents are in one place
+      // cluster: about n := 6000 / 2 = 3000 documents per shard (assuming an even distribution over the 2 shards)
+      for (let l = 0; l < 6000; l++) {
+        docs.push({ a: l % 1000 }); // number of different values k = 1000
+      }
+      c_new.save(docs);
+      c_new.ensureIndex({ type: "persistent", fields: ["a"] });
+      waitForEstimatorSync();
+
+      // for the optimizer rule to be applied, k < n / log n must hold (natural logarithm)
+      // single server (n = 6000): k < 689
+      // cluster (n = 3000 per shard): k < 374
+
+      // k := 1000 distinct values exceeds both bounds, so the explain output must not list the optimizer rule
+      let explain = db._createStatement(`FOR doc IN ${c_new.name()} COLLECT a = doc.a RETURN a`).explain();
+      assertFalse(explain.plan.rules.indexOf(indexCollectOptimizerRule) !== -1);
+    },
+
     testCollectOptions: function () {
       const index = db[collection].ensureIndex({ type: "persistent", fields: ["a", "b", "c.d"], name: "foobar" });
       const queries = [