From 3f6967cfa985faace07a2c2ae83d3836dc9d3448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Mon, 3 Dec 2018 21:03:59 +0100 Subject: [PATCH 01/31] Initial commit --- arangod/Agency/Supervision.cpp | 4 +- arangod/Aql/OptimizerRules.cpp | 16 +- .../Aql/OptimizerRulesReplaceFunctions.cpp | 1 - arangod/Cluster/ClusterInfo.cpp | 6 +- arangod/ClusterEngine/ClusterCollection.cpp | 40 +- arangod/ClusterEngine/ClusterCollection.h | 3 - arangod/ClusterEngine/ClusterEngine.cpp | 6 +- arangod/ClusterEngine/ClusterEngine.h | 2 - arangod/ClusterEngine/ClusterIndex.cpp | 13 - arangod/ClusterEngine/ClusterIndex.h | 7 +- arangod/ClusterEngine/ClusterIndexFactory.cpp | 2 +- arangod/IResearch/IResearchLink.cpp | 15 +- arangod/IResearch/IResearchLink.h | 5 +- arangod/IResearch/IResearchLinkCoordinator.h | 6 +- arangod/IResearch/IResearchMMFilesLink.cpp | 18 +- arangod/IResearch/IResearchMMFilesLink.h | 14 +- arangod/IResearch/IResearchRocksDBLink.h | 6 +- .../IResearchRocksDBRecoveryHelper.cpp | 4 +- arangod/Indexes/Index.cpp | 40 +- arangod/Indexes/Index.h | 14 +- arangod/MMFiles/MMFilesCollection.cpp | 50 +-- arangod/MMFiles/MMFilesCollection.h | 3 - arangod/MMFiles/MMFilesEngine.cpp | 6 +- arangod/MMFiles/MMFilesEngine.h | 2 - arangod/MMFiles/MMFilesIndex.h | 8 + arangod/RestHandler/RestIndexHandler.cpp | 4 +- arangod/RocksDBEngine/RocksDBCollection.cpp | 343 ++++++------------ arangod/RocksDBEngine/RocksDBCollection.h | 6 - arangod/RocksDBEngine/RocksDBEngine.cpp | 18 +- arangod/RocksDBEngine/RocksDBEngine.h | 2 - arangod/RocksDBEngine/RocksDBIndex.cpp | 204 ++++++++++- arangod/RocksDBEngine/RocksDBIndex.h | 27 +- arangod/RocksDBEngine/RocksDBIndexFactory.cpp | 6 +- arangod/RocksDBEngine/RocksDBLogValue.cpp | 4 + arangod/RocksDBEngine/RocksDBLogValue.h | 4 + .../RocksDBEngine/RocksDBTransactionState.cpp | 5 - .../RocksDBEngine/RocksDBTransactionState.h | 5 +- arangod/StorageEngine/PhysicalCollection.cpp | 35 ++ arangod/StorageEngine/PhysicalCollection.h | 16 +- arangod/StorageEngine/StorageEngine.h | 2 - arangod/Transaction/Methods.cpp | 11 +- arangod/V8Server/v8-vocindex.cpp | 7 +- arangod/VocBase/LogicalCollection.cpp | 16 +- arangod/VocBase/Methods/Indexes.cpp | 16 +- arangod/VocBase/Methods/Indexes.h | 3 +- arangod/VocBase/vocbase.cpp | 7 +- .../modules/@arangodb/arango-collection.js | 6 +- lib/Basics/StaticStrings.cpp | 1 + lib/Basics/StaticStrings.h | 1 + tests/IResearch/IResearchLink-test.cpp | 2 - .../IResearchLinkCoordinator-test.cpp | 2 - .../IResearchViewCoordinator-test.cpp | 18 - tests/IResearch/StorageEngineMock.cpp | 10 +- tests/IResearch/StorageEngineMock.h | 7 +- 54 files changed, 558 insertions(+), 521 deletions(-) diff --git a/arangod/Agency/Supervision.cpp b/arangod/Agency/Supervision.cpp index 5a56fd170343..f6ae4c9e69ca 100644 --- a/arangod/Agency/Supervision.cpp +++ b/arangod/Agency/Supervision.cpp @@ -1066,7 +1066,7 @@ void Supervision::readyOrphanedIndexCreations() { indexes = collection("indexes").getArray(); if (indexes.length() > 0) { for (auto const& planIndex : VPackArrayIterator(indexes)) { - if (planIndex.hasKey("isBuilding") && collection.has("shards")) { + if (planIndex.hasKey(StaticStrings::IndexIsBuilding) && collection.has("shards")) { auto const& planId = planIndex.get("id"); auto const& shards = collection("shards"); if (collection.has("numberOfShards") && @@ -1121,7 +1121,7 @@ void Supervision::readyOrphanedIndexCreations() { { VPackObjectBuilder props(envelope.get()); for (auto const& prop : VPackObjectIterator(planIndex)) { auto const& key = 
prop.key.copyString(); - if (key != "isBuilding") { + if (key != StaticStrings::IndexIsBuilding) { envelope->add(key, prop.value); } }} diff --git a/arangod/Aql/OptimizerRules.cpp index fd3792c3b586..94d5e280cf4a 100644 --- a/arangod/Aql/OptimizerRules.cpp +++ b/arangod/Aql/OptimizerRules.cpp @@ -6581,15 +6581,15 @@ static bool geoFuncArgCheck(ExecutionPlan* plan, AstNode const* args, info.collectionNodeToReplace = collNode; info.collectionNodeOutVar = collNode->outVariable(); info.collection = collNode->collection(); - std::shared_ptr coll = - collNode->collection()->getCollection(); - - // check for suitable indexes - for (std::shared_ptr idx : coll->getIndexes()) { + + // we should not access the LogicalCollection directly + Query* query = plan->getAst()->query(); + auto indexes = query->trx()->indexesForCollection(info.collection->name()); + // check for suitable indexes + for (std::shared_ptr idx : indexes) { // check if current index is a geo-index - bool isGeo = - idx->type() == arangodb::Index::IndexType::TRI_IDX_TYPE_GEO_INDEX; - if (isGeo && idx->fields().size() == 1) { // individual fields + bool isGeo = idx->type() == arangodb::Index::IndexType::TRI_IDX_TYPE_GEO_INDEX; + if (isGeo && idx->fields().size() == 1) { // individual fields // check access paths of attributes in ast and those in index match if (idx->fields()[0] == attributeAccess.second) { if (info.index != nullptr && info.index != idx) { diff --git a/arangod/Aql/OptimizerRulesReplaceFunctions.cpp index a7996aff9175..0e7d412e5c2d 100644 --- a/arangod/Aql/OptimizerRulesReplaceFunctions.cpp +++ b/arangod/Aql/OptimizerRulesReplaceFunctions.cpp @@ -193,7 +193,6 @@ std::pair getAttributeAccessFromIndex(Ast* ast, AstNode* doc for(auto& idx : indexes){ if(Index::isGeoIndex(idx->type())) { // we take the first index that is found - bool isGeo1 = idx->type() == Index::IndexType::TRI_IDX_TYPE_GEO1_INDEX; bool isGeo2 = idx->type() == Index::IndexType::TRI_IDX_TYPE_GEO2_INDEX; bool isGeo = idx->type() == Index::IndexType::TRI_IDX_TYPE_GEO_INDEX; diff --git a/arangod/Cluster/ClusterInfo.cpp index d830a5c40877..031a72ae5b8f 100644 --- a/arangod/Cluster/ClusterInfo.cpp +++ b/arangod/Cluster/ClusterInfo.cpp @@ -2629,14 +2629,14 @@ int ClusterInfo::ensureIndexCoordinatorInner( for (auto const& e : VPackObjectIterator(slice)) { TRI_ASSERT(e.key.isString()); std::string const& key = e.key.copyString(); - if (key != StaticStrings::IndexId && key != "isBuilding") { + if (key != StaticStrings::IndexId && key != StaticStrings::IndexIsBuilding) { ob->add(e.key); ob->add(e.value); } } if (numberOfShards > 0 && !slice.get(StaticStrings::IndexType).isEqualString("arangosearch")) { - ob->add("isBuilding", VPackValue(true)); + ob->add(StaticStrings::IndexIsBuilding, VPackValue(true)); } ob->add(StaticStrings::IndexId, VPackValue(idString)); } @@ -2709,7 +2709,7 @@ int ClusterInfo::ensureIndexCoordinatorInner( { VPackObjectBuilder o(&finishedPlanIndex); for (auto const& entry : VPackObjectIterator(newIndexBuilder.slice())) { auto const key = entry.key.copyString(); - if (key != "isBuilding" && key != "isNewlyCreated") { + if (key != StaticStrings::IndexIsBuilding && key != "isNewlyCreated") { finishedPlanIndex.add(entry.key.copyString(), entry.value); } } diff --git a/arangod/ClusterEngine/ClusterCollection.cpp index aeb6992975f7..098d41f203cc 100644 --- 
a/arangod/ClusterEngine/ClusterCollection.cpp +++ b/arangod/ClusterEngine/ClusterCollection.cpp @@ -360,42 +360,6 @@ void ClusterCollection::prepareIndexes( TRI_ASSERT(!_indexes.empty()); } -static std::shared_ptr findIndex( - velocypack::Slice const& info, - std::vector> const& indexes) { - TRI_ASSERT(info.isObject()); - - // extract type - VPackSlice value = info.get("type"); - - if (!value.isString()) { - // Compatibility with old v8-vocindex. - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, - "invalid index type definition"); - } - - std::string tmp = value.copyString(); - arangodb::Index::IndexType const type = arangodb::Index::type(tmp.c_str()); - - for (auto const& idx : indexes) { - if (idx->type() == type) { - // Only check relevant indexes - if (idx->matchesDefinition(info)) { - // We found an index for this definition. - return idx; - } - } - } - return nullptr; -} - -/// @brief Find index by definition -std::shared_ptr ClusterCollection::lookupIndex( - velocypack::Slice const& info) const { - READ_LOCKER(guard, _indexesLock); - return findIndex(info, _indexes); -} - std::shared_ptr ClusterCollection::createIndex( arangodb::velocypack::Slice const& info, bool restore, bool& created) { @@ -418,9 +382,7 @@ std::shared_ptr ClusterCollection::createIndex( TRI_ASSERT(engine != nullptr); // We are sure that we do not have an index of this type. - // We also hold the lock. - // Create it - + // We also hold the lock. Create it idx = engine->indexFactory().prepareIndexFromSlice( info, true, _logicalCollection, false ); diff --git a/arangod/ClusterEngine/ClusterCollection.h b/arangod/ClusterEngine/ClusterCollection.h index 9100202fab91..6746edf179fd 100644 --- a/arangod/ClusterEngine/ClusterCollection.h +++ b/arangod/ClusterEngine/ClusterCollection.h @@ -104,9 +104,6 @@ class ClusterCollection final : public PhysicalCollection { void prepareIndexes(arangodb::velocypack::Slice indexesSlice) override; - /// @brief Find index by definition - std::shared_ptr lookupIndex(velocypack::Slice const&) const override; - std::shared_ptr createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) override; diff --git a/arangod/ClusterEngine/ClusterEngine.cpp b/arangod/ClusterEngine/ClusterEngine.cpp index eaaccb73ffdd..e42dbdbfee22 100644 --- a/arangod/ClusterEngine/ClusterEngine.cpp +++ b/arangod/ClusterEngine/ClusterEngine.cpp @@ -289,11 +289,10 @@ void ClusterEngine::recoveryDone(TRI_vocbase_t& vocbase) { std::string ClusterEngine::createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t cid, LogicalCollection const& collection ) { - TRI_ASSERT(cid != 0); - TRI_UpdateTickServer(static_cast(cid)); + TRI_ASSERT(collection.id() != 0); + TRI_UpdateTickServer(static_cast(collection.id())); return std::string(); // no need to return a path } @@ -320,7 +319,6 @@ void ClusterEngine::destroyCollection( void ClusterEngine::changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) { diff --git a/arangod/ClusterEngine/ClusterEngine.h b/arangod/ClusterEngine/ClusterEngine.h index 8c2e3807722a..7979e88f8cc7 100644 --- a/arangod/ClusterEngine/ClusterEngine.h +++ b/arangod/ClusterEngine/ClusterEngine.h @@ -239,7 +239,6 @@ class ClusterEngine final : public StorageEngine { public: std::string createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection ) override; @@ -260,7 +259,6 @@ class ClusterEngine final : public StorageEngine { void changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t 
id, LogicalCollection const& collection, bool doSync ) override; diff --git a/arangod/ClusterEngine/ClusterIndex.cpp b/arangod/ClusterEngine/ClusterIndex.cpp index 3cc7a66b642b..71ec0999c814 100644 --- a/arangod/ClusterEngine/ClusterIndex.cpp +++ b/arangod/ClusterEngine/ClusterIndex.cpp @@ -135,19 +135,6 @@ void ClusterIndex::updateClusterSelectivityEstimate(double estimate) { _clusterSelectivity = estimate; } -bool ClusterIndex::isPersistent() const { - if (_engineType == ClusterEngineType::MMFilesEngine) { - return _indexType == Index::TRI_IDX_TYPE_PERSISTENT_INDEX; - } else if (_engineType == ClusterEngineType::RocksDBEngine) { - return true; - } else if (_engineType == ClusterEngineType::MockEngine) { - return false; - } - TRI_ASSERT(false); - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, - "unsupported cluster storage engine"); -} - bool ClusterIndex::isSorted() const { if (_engineType == ClusterEngineType::MMFilesEngine) { return _indexType == Index::TRI_IDX_TYPE_SKIPLIST_INDEX || diff --git a/arangod/ClusterEngine/ClusterIndex.h b/arangod/ClusterEngine/ClusterIndex.h index 012594e25609..0e1c16e9186f 100644 --- a/arangod/ClusterEngine/ClusterIndex.h +++ b/arangod/ClusterEngine/ClusterIndex.h @@ -49,6 +49,11 @@ class ClusterIndex : public Index { /// @brief return a VelocyPack representation of the index void toVelocyPack(velocypack::Builder& builder, std::underlying_type::type) const override; + + /// @brief if true this index should not be shown externally + bool isHidden() const override { + return false; // do not generally hide indexes + } IndexType type() const override { return _indexType; } @@ -56,8 +61,6 @@ class ClusterIndex : public Index { return Index::oldtypeName(_indexType); } - bool isPersistent() const override; - bool canBeDropped() const override { return _indexType != Index::TRI_IDX_TYPE_PRIMARY_INDEX && _indexType != Index::TRI_IDX_TYPE_EDGE_INDEX; diff --git a/arangod/ClusterEngine/ClusterIndexFactory.cpp b/arangod/ClusterEngine/ClusterIndexFactory.cpp index b606ad9f61be..e204ab7b071f 100644 --- a/arangod/ClusterEngine/ClusterIndexFactory.cpp +++ b/arangod/ClusterEngine/ClusterIndexFactory.cpp @@ -217,7 +217,7 @@ void ClusterIndexFactory::prepareIndexes( continue; } - if (basics::VelocyPackHelper::getBooleanValue(v, "isBuilding", false)) { + if (basics::VelocyPackHelper::getBooleanValue(v, StaticStrings::IndexIsBuilding, false)) { // This index is still being built. Do not add. continue; } diff --git a/arangod/IResearch/IResearchLink.cpp b/arangod/IResearch/IResearchLink.cpp index cbde1deae5b2..2cb11c0fa5a9 100644 --- a/arangod/IResearch/IResearchLink.cpp +++ b/arangod/IResearch/IResearchLink.cpp @@ -348,19 +348,6 @@ Result IResearchLink::insert( return _view->insert(*trx, _collection.id(), documentId, doc, _meta); } -bool IResearchLink::isPersistent() const { - auto* engine = arangodb::EngineSelectorFeature::ENGINE; - - // FIXME TODO remove once MMFilesEngine will fillIndex(...) 
during recovery - // currently the index is created but fill is deffered untill the end of recovery - // at the end of recovery only non-persistent indexes are filled - if (engine && engine->inRecovery()) { - return false; - } - - return true; // records persisted into the iResearch view -} - bool IResearchLink::isSorted() const { return false; // iResearch does not provide a fixed default sort order } @@ -555,4 +542,4 @@ NS_END // arangodb // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- \ No newline at end of file +// ----------------------------------------------------------------------------- diff --git a/arangod/IResearch/IResearchLink.h b/arangod/IResearch/IResearchLink.h index 16ca3e8b26e8..1f3e14e8f1cd 100644 --- a/arangod/IResearch/IResearchLink.h +++ b/arangod/IResearch/IResearchLink.h @@ -96,8 +96,11 @@ class IResearchLink { Index::OperationMode mode ); // arangodb::Index override - bool isPersistent() const; // arangodb::Index override bool isSorted() const; // arangodb::Index override + + bool isHidden() const { // arangodb::Index override + return true; // always hide links + } //////////////////////////////////////////////////////////////////////////////// /// @brief the identifier for this link diff --git a/arangod/IResearch/IResearchLinkCoordinator.h b/arangod/IResearch/IResearchLinkCoordinator.h index 829d2ad13671..7febe2200cdc 100644 --- a/arangod/IResearch/IResearchLinkCoordinator.h +++ b/arangod/IResearch/IResearchLinkCoordinator.h @@ -100,10 +100,12 @@ class IResearchLinkCoordinator final: public arangodb::ClusterIndex { return arangodb::Result(TRI_ERROR_NOT_IMPLEMENTED); } - virtual bool isPersistent() const override { return true; } - // IResearch does not provide a fixed default sort order virtual bool isSorted() const override { return false; } + + bool isHidden() const override { + return true; + } virtual void load() override { /* NOOP */ } diff --git a/arangod/IResearch/IResearchMMFilesLink.cpp b/arangod/IResearch/IResearchMMFilesLink.cpp index e91bc41e9a28..9fcea02346bc 100644 --- a/arangod/IResearch/IResearchMMFilesLink.cpp +++ b/arangod/IResearch/IResearchMMFilesLink.cpp @@ -25,6 +25,8 @@ #include "Logger/Logger.h" #include "Logger/LogMacros.h" #include "MMFiles/MMFilesCollection.h" +#include "StorageEngine/EngineSelectorFeature.h" +#include "StorageEngine/StorageEngine.h" #include "VocBase/LogicalCollection.h" #include "IResearchCommon.h" @@ -37,7 +39,7 @@ NS_BEGIN(iresearch) IResearchMMFilesLink::IResearchMMFilesLink( TRI_idx_iid_t iid, arangodb::LogicalCollection& collection -): Index(iid, collection, IResearchLinkHelper::emptyIndexSlice()), +): MMFilesIndex(iid, collection, IResearchLinkHelper::emptyIndexSlice()), IResearchLink(iid, collection) { TRI_ASSERT(!ServerState::instance()->isCoordinator()); _unique = false; // cannot be unique since multiple fields are indexed @@ -133,6 +135,20 @@ void IResearchMMFilesLink::toVelocyPack( builder.close(); } +bool IResearchMMFilesLink::isPersistent() const { + auto* engine = arangodb::EngineSelectorFeature::ENGINE; + + // FIXME TODO remove once MMFilesEngine will fillIndex(...) 
during recovery + // currently the index is created but fill is deferred until the end of recovery + // at the end of recovery only non-persistent indexes are filled + if (engine && engine->inRecovery()) { + return false; + } + + return true; // records persisted into the iResearch view +} + + NS_END // iresearch NS_END // arangodb diff --git a/arangod/IResearch/IResearchMMFilesLink.h index bb102a99615c..e1fbe385584c 100644 --- a/arangod/IResearch/IResearchMMFilesLink.h +++ b/arangod/IResearch/IResearchMMFilesLink.h @@ -26,13 +26,13 @@ #include "IResearchLink.h" -#include "Indexes/Index.h" +#include "MMFiles/MMFilesIndex.h" NS_BEGIN(arangodb) NS_BEGIN(iresearch) class IResearchMMFilesLink final - : public arangodb::Index, public IResearchLink { + : public arangodb::MMFilesIndex, public IResearchLink { public: DECLARE_SHARED_PTR(Index); @@ -75,13 +75,15 @@ class IResearchMMFilesLink final return IResearchLink::insert(trx, documentId, doc, mode); } - virtual bool isPersistent() const override { - return IResearchLink::isPersistent(); - } + bool isPersistent() const override; virtual bool isSorted() const override { return IResearchLink::isSorted(); } + + bool isHidden() const override { + return IResearchLink::isHidden(); + } virtual void load() override { IResearchLink::load(); @@ -153,4 +155,4 @@ class IResearchMMFilesLink final NS_END // iresearch NS_END // arangodb -#endif \ No newline at end of file +#endif diff --git a/arangod/IResearch/IResearchRocksDBLink.h index 961dde92fd72..e7691c6d9bfc 100644 --- a/arangod/IResearch/IResearchRocksDBLink.h +++ b/arangod/IResearch/IResearchRocksDBLink.h @@ -81,6 +81,10 @@ class IResearchRocksDBLink final virtual bool isSorted() const override { return IResearchLink::isSorted(); } + + bool isHidden() const override { + return IResearchLink::isHidden(); + } virtual void load() override { IResearchLink::load(); @@ -154,4 +158,4 @@ class IResearchRocksDBLink final NS_END // iresearch NS_END // arangodb -#endif \ No newline at end of file +#endif diff --git a/arangod/IResearch/IResearchRocksDBRecoveryHelper.cpp index 74894ad817c9..c3fb4d34f1bf 100644 --- a/arangod/IResearch/IResearchRocksDBRecoveryHelper.cpp +++ b/arangod/IResearch/IResearchRocksDBRecoveryHelper.cpp @@ -119,7 +119,7 @@ void ensureLink( auto const indexTypeSlice = indexSlice.get(arangodb::StaticStrings::IndexType); auto const indexTypeStr = indexTypeSlice.copyString(); - auto const indexType = arangodb::Index::type(indexTypeStr.c_str()); + auto const indexType = arangodb::Index::type(indexTypeStr); if (arangodb::Index::IndexType::TRI_IDX_TYPE_IRESEARCH_LINK != indexType) { // skip non iresearch link indexes @@ -465,4 +465,4 @@ void IResearchRocksDBRecoveryHelper::LogData(const rocksdb::Slice& blob) { // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- \ No newline at end of file +// ----------------------------------------------------------------------------- diff --git a/arangod/Indexes/Index.cpp index 0814e7f324e3..b6f112177e2c 100644 --- a/arangod/Indexes/Index.cpp +++ b/arangod/Indexes/Index.cpp @@ -195,12 +195,11 @@ size_t Index::sortWeight(arangodb::aql::AstNode const* node) { /// @brief validate fields from slice void Index::validateFields(VPackSlice const& slice) { - auto 
allowExpansion = Index::allowExpansion( - Index::type(slice.get(arangodb::StaticStrings::IndexType).copyString()) - ); + VPackValueLength len; + const char *idxStr = slice.get(arangodb::StaticStrings::IndexType).getString(len); + auto allowExpansion = Index::allowExpansion(Index::type(idxStr, len)); auto fields = slice.get(arangodb::StaticStrings::IndexFields); - if (!fields.isArray()) { return; } @@ -218,34 +217,35 @@ void Index::validateFields(VPackSlice const& slice) { } /// @brief return the index type based on a type name -Index::IndexType Index::type(char const* type) { - if (::strcmp(type, "primary") == 0) { +Index::IndexType Index::type(char const* type, size_t len) { + if (::strncmp(type, "primary", len) == 0) { return TRI_IDX_TYPE_PRIMARY_INDEX; } - if (::strcmp(type, "edge") == 0) { + if (::strncmp(type, "edge", len) == 0) { return TRI_IDX_TYPE_EDGE_INDEX; } - if (::strcmp(type, "hash") == 0) { + if (::strncmp(type, "hash", len) == 0) { return TRI_IDX_TYPE_HASH_INDEX; } - if (::strcmp(type, "skiplist") == 0) { + if (::strncmp(type, "skiplist", len) == 0) { return TRI_IDX_TYPE_SKIPLIST_INDEX; } - if (::strcmp(type, "persistent") == 0 || ::strcmp(type, "rocksdb") == 0) { + if (::strncmp(type, "persistent", len) == 0 || + ::strncmp(type, "rocksdb", len) == 0) { return TRI_IDX_TYPE_PERSISTENT_INDEX; } - if (::strcmp(type, "fulltext") == 0) { + if (::strncmp(type, "fulltext", len) == 0) { return TRI_IDX_TYPE_FULLTEXT_INDEX; } - if (::strcmp(type, "geo1") == 0) { + if (::strncmp(type, "geo", len) == 0) { + return TRI_IDX_TYPE_GEO_INDEX; + } + if (::strncmp(type, "geo1", len) == 0) { return TRI_IDX_TYPE_GEO1_INDEX; } - if (::strcmp(type, "geo2") == 0) { + if (::strncmp(type, "geo2", len) == 0) { return TRI_IDX_TYPE_GEO2_INDEX; } - if (::strcmp(type, "geo") == 0) { - return TRI_IDX_TYPE_GEO_INDEX; - } #ifdef USE_IRESEARCH if (arangodb::iresearch::DATA_SOURCE_TYPE.name() == type) { return TRI_IDX_TYPE_IRESEARCH_LINK; @@ -259,7 +259,7 @@ Index::IndexType Index::type(char const* type) { } Index::IndexType Index::type(std::string const& type) { - return Index::type(type.c_str()); + return Index::type(type.c_str(), type.size()); } /// @brief return the name of an index type @@ -372,11 +372,11 @@ bool Index::Compare(VPackSlice const& lhs, VPackSlice const& rhs) { return false; } - auto type = Index::type(lhsType.copyString()); - + VPackValueLength len; + const char* tmp = lhsType.getString(len); + auto type = Index::type(tmp, len); // unique must be identical if present auto value = lhs.get(arangodb::StaticStrings::IndexUnique); - if (value.isBoolean()) { if (arangodb::basics::VelocyPackHelper::compare( value, rhs.get(arangodb::StaticStrings::IndexUnique), false diff --git a/arangod/Indexes/Index.h b/arangod/Indexes/Index.h index 04c7d106ef6c..f74012ed2033 100644 --- a/arangod/Indexes/Index.h +++ b/arangod/Indexes/Index.h @@ -190,7 +190,7 @@ class Index { char const* oldtypeName() const { return oldtypeName(type()); } /// @brief return the index type based on a type name - static IndexType type(char const* type); + static IndexType type(char const* type, size_t len); static IndexType type(std::string const& type); @@ -227,7 +227,6 @@ class Index { static bool Compare(velocypack::Slice const& lhs, velocypack::Slice const& rhs); - virtual bool isPersistent() const { return false; } virtual bool canBeDropped() const = 0; /// @brief whether or not the index provides an iterator that can extract @@ -242,10 +241,13 @@ class Index { /// @brief whether or not the index is sorted virtual bool 
isSorted() const = 0; + + /// @brief if true this index should not be shown externally + virtual bool isHidden() const = 0; /// @brief whether or not the index has a selectivity estimate virtual bool hasSelectivityEstimate() const = 0; - + /// @brief return the selectivity estimate of the index /// must only be called if hasSelectivityEstimate() returns true /// @@ -273,10 +275,10 @@ class Index { Basics = 0, /// @brief serialize figures for index Figures = 2, - /// @brief serialize object ids for persistence - ObjectId = 4, /// @brief serialize selectivity estimates - Estimates = 8 + Estimates = 4, + /// @brief serialize object ids for persistence + ObjectId = 8, }; /// @brief helper for building flags diff --git a/arangod/MMFiles/MMFilesCollection.cpp b/arangod/MMFiles/MMFilesCollection.cpp index 760848d0c972..109b01b3bd73 100644 --- a/arangod/MMFiles/MMFilesCollection.cpp +++ b/arangod/MMFiles/MMFilesCollection.cpp @@ -723,7 +723,6 @@ int MMFilesCollection::close() { engine->changeCollection( _logicalCollection.vocbase(), - _logicalCollection.id(), _logicalCollection, doSync ); @@ -1663,7 +1662,8 @@ void MMFilesCollection::fillIndex( return; } - if (idx->isPersistent() && skipPersistent) { + MMFilesIndex* midx = static_cast(idx); + if (midx->isPersistent() && skipPersistent) { return; } @@ -1707,7 +1707,8 @@ int MMFilesCollection::fillIndexes( if (idx->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) { continue; } - if (idx->isPersistent()) { + MMFilesIndex* midx = static_cast(idx); + if (midx->isPersistent()) { continue; } idx->unload(); // TODO: check is this safe? truncate not necessarily @@ -2202,35 +2203,6 @@ void MMFilesCollection::prepareIndexes(VPackSlice indexesSlice) { TRI_ASSERT(!_indexes.empty()); } -std::shared_ptr MMFilesCollection::lookupIndex( - VPackSlice const& info) const { - TRI_ASSERT(info.isObject()); - - // extract type - auto value = info.get(arangodb::StaticStrings::IndexType); - - if (!value.isString()) { - // Compatibility with old v8-vocindex. - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid index definition"); - } - - std::string tmp = value.copyString(); - arangodb::Index::IndexType const type = arangodb::Index::type(tmp.c_str()); - - {READ_LOCKER(guard, _indexesLock); - for (auto const& idx : _indexes) { - if (idx->type() == type) { - // Only check relevant indices - if (idx->matchesDefinition(info)) { - // We found an index for this definition. 
- return idx; - } - } - } - } - return nullptr; -} - std::shared_ptr MMFilesCollection::createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) { @@ -2418,7 +2390,8 @@ void MMFilesCollection::addIndexLocal(std::shared_ptr idx) { } // update statistics - if (idx->isPersistent()) { + MMFilesIndex* midx = static_cast(idx.get()); + if (midx->isPersistent()) { ++_persistentIndexes; } } @@ -2493,7 +2466,8 @@ bool MMFilesCollection::removeIndex(TRI_idx_iid_t iid) { _indexes.erase(_indexes.begin() + i); // update statistics - if (idx->isPersistent()) { + MMFilesIndex* midx = static_cast(idx.get()); + if (midx->isPersistent()) { --_persistentIndexes; } @@ -3252,7 +3226,7 @@ void MMFilesCollection::setCurrentVersion() { StorageEngine* engine = EngineSelectorFeature::ENGINE; engine->changeCollection(_logicalCollection.vocbase(), - _logicalCollection.id(), _logicalCollection, doSync); + _logicalCollection, doSync); } /// @brief creates a new entry in the primary index @@ -3299,7 +3273,8 @@ Result MMFilesCollection::insertSecondaryIndexes( auto idx = indexes[i]; TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); - if (!useSecondary && !idx->isPersistent()) { + MMFilesIndex* midx = static_cast(idx.get()); + if (!useSecondary && !midx->isPersistent()) { continue; } @@ -3345,7 +3320,8 @@ Result MMFilesCollection::deleteSecondaryIndexes( auto idx = indexes[i]; TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); - if (!useSecondary && !idx->isPersistent()) { + MMFilesIndex* midx = static_cast(idx.get()); + if (!useSecondary && !midx->isPersistent()) { continue; } diff --git a/arangod/MMFiles/MMFilesCollection.h b/arangod/MMFiles/MMFilesCollection.h index 831cdea46b68..d47d84082d09 100644 --- a/arangod/MMFiles/MMFilesCollection.h +++ b/arangod/MMFiles/MMFilesCollection.h @@ -247,9 +247,6 @@ class MMFilesCollection final : public PhysicalCollection { void prepareIndexes(arangodb::velocypack::Slice indexesSlice) override; - /// @brief Find index by definition - std::shared_ptr lookupIndex(velocypack::Slice const&) const override; - std::unique_ptr getAllIterator(transaction::Methods* trx) const override; std::unique_ptr getAnyIterator(transaction::Methods* trx) const override; void invokeOnAllElements( diff --git a/arangod/MMFiles/MMFilesEngine.cpp b/arangod/MMFiles/MMFilesEngine.cpp index 8686a57af66e..d82042e125da 100644 --- a/arangod/MMFiles/MMFilesEngine.cpp +++ b/arangod/MMFiles/MMFilesEngine.cpp @@ -923,11 +923,12 @@ void MMFilesEngine::waitUntilDeletion(TRI_voc_tick_t id, bool force, // to "createCollection" returns std::string MMFilesEngine::createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection ) { auto path = databasePath(&vocbase); TRI_ASSERT(!path.empty()); + const TRI_voc_cid_t id = collection.id(); + TRI_ASSERT(id != 0); // sanity check if (sizeof(MMFilesDatafileHeaderMarker) + sizeof(MMFilesDatafileFooterMarker) > @@ -1245,11 +1246,10 @@ void MMFilesEngine::destroyCollection( // to "changeCollection" returns void MMFilesEngine::changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) { - saveCollectionInfo(&vocbase, id, &collection, doSync); + saveCollectionInfo(&vocbase, collection.id(), &collection, doSync); } // asks the storage engine to persist renaming of a collection diff --git a/arangod/MMFiles/MMFilesEngine.h b/arangod/MMFiles/MMFilesEngine.h index fbf91675d2b7..5f6c7e6017ec 100644 --- a/arangod/MMFiles/MMFilesEngine.h 
+++ b/arangod/MMFiles/MMFilesEngine.h @@ -262,7 +262,6 @@ class MMFilesEngine final : public StorageEngine { // to "createCollection" returns std::string createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection ) override; @@ -302,7 +301,6 @@ class MMFilesEngine final : public StorageEngine { // to "changeCollection" returns void changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) override; diff --git a/arangod/MMFiles/MMFilesIndex.h b/arangod/MMFiles/MMFilesIndex.h index a4cb6e5834c3..356cda83d915 100644 --- a/arangod/MMFiles/MMFilesIndex.h +++ b/arangod/MMFiles/MMFilesIndex.h @@ -52,11 +52,19 @@ class MMFilesIndex : public Index { ) : Index(id, collection, info) {} + /// @brief if true this index should not be shown externally + virtual bool isHidden() const override { + return false; // do not generally hide MMFiles indexes + } void afterTruncate(TRI_voc_tick_t) override { // for mmfiles, truncating the index just unloads it unload(); } + + virtual bool isPersistent() const { + return false; + }; }; } diff --git a/arangod/RestHandler/RestIndexHandler.cpp b/arangod/RestHandler/RestIndexHandler.cpp index 327f37f4b845..b9fa51ebadea 100644 --- a/arangod/RestHandler/RestIndexHandler.cpp +++ b/arangod/RestHandler/RestIndexHandler.cpp @@ -93,10 +93,10 @@ RestStatus RestIndexHandler::getIndexes() { if (_request->parsedValue("withStats", false)) { flags = Index::makeFlags(Index::Serialize::Estimates, Index::Serialize::Figures); } - bool withLinks = _request->parsedValue("withLinks", false); + bool withHidden = _request->parsedValue("withHidden", false); VPackBuilder indexes; - Result res = methods::Indexes::getAll(coll.get(), flags, withLinks, indexes); + Result res = methods::Indexes::getAll(coll.get(), flags, withHidden, indexes); if (!res.ok()) { generateError(rest::ResponseCode::BAD, res.errorNumber(), res.errorMessage()); diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index 786a8fb4ff7d..b6da94fb5a76 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -67,7 +67,6 @@ #include #include -#include #include #include @@ -280,7 +279,15 @@ void RocksDBCollection::prepareIndexes( ); } + bool droppedIndex = false; for (std::shared_ptr& idx : indexes) { + RocksDBIndex* rtrx = static_cast(idx.get()); + if (rtrx->isBuilding()) { + int res = rtrx->drop(); + TRI_ASSERT(res == TRI_ERROR_NO_ERROR); + droppedIndex = true; + continue; + } addIndex(std::move(idx)); } @@ -300,60 +307,38 @@ void RocksDBCollection::prepareIndexes( THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, msg); } - - TRI_ASSERT(!_indexes.empty()); -} - -static std::shared_ptr findIndex( - velocypack::Slice const& info, - std::vector> const& indexes) { - TRI_ASSERT(info.isObject()); - - auto value = info.get(arangodb::StaticStrings::IndexType); // extract type - - if (!value.isString()) { - // Compatibility with old v8-vocindex. - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, - "invalid index type definition"); - } - - std::string tmp = value.copyString(); - arangodb::Index::IndexType const type = arangodb::Index::type(tmp.c_str()); - - for (auto const& idx : indexes) { - if (idx->type() == type) { - // Only check relevant indexes - if (idx->matchesDefinition(info)) { - // We found an index for this definition. 
- return idx; - } - } + + if (droppedIndex) { + auto builder = _logicalCollection.toVelocyPackIgnore({"path", "statusString"}, true, true); + // log this event in the WAL and in the collection meta-data + auto engine = static_cast(EngineSelectorFeature::ENGINE); + engine->writeCreateCollectionMarker(_logicalCollection.vocbase().id(), + _logicalCollection.id(), + builder.slice(), + RocksDBLogValue::Empty()); } - return nullptr; -} -/// @brief Find index by definition -std::shared_ptr RocksDBCollection::lookupIndex( - velocypack::Slice const& info) const { - READ_LOCKER(guard, _indexesLock); - return findIndex(info, _indexes); + TRI_ASSERT(!_indexes.empty()); } std::shared_ptr RocksDBCollection::createIndex( arangodb::velocypack::Slice const& info, bool restore, bool& created) { TRI_ASSERT(info.isObject()); + SingleCollectionTransaction trx( // prevent concurrent dropping transaction::StandaloneContext::Create(_logicalCollection.vocbase()), _logicalCollection, - AccessMode::Type::EXCLUSIVE); + AccessMode::Type::WRITE); Result res = trx.begin(); if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); } - - std::shared_ptr idx = lookupIndex(info); + WRITE_LOCKER(indexGuard, _indexesLock); + + // Step 1. Check for matching index + std::shared_ptr idx = findIndex(info, _indexes); if (idx) { created = false; // We already have this index. return idx; @@ -361,7 +346,7 @@ std::shared_ptr RocksDBCollection::createIndex( RocksDBEngine* engine = static_cast(EngineSelectorFeature::ENGINE); - // We are sure that we do not have an index of this type. + // Step 2. We are sure that we do not have an index of this type. // We also hold the lock. Create it const bool generateKey = !restore; idx = engine->indexFactory().prepareIndexFromSlice( @@ -375,55 +360,65 @@ std::shared_ptr RocksDBCollection::createIndex( TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX); - std::shared_ptr other = PhysicalCollection::lookupIndex(idx->id()); - if (other) { // index already exists - return other; - } - - res = fillIndexes(&trx, idx); - if (!res.ok()) { - THROW_ARANGO_EXCEPTION(res); + for (auto const& other : _indexes) { // conflicting index exists + if (other->id() == idx->id()) { + return other; // index already exists + } } - // we need to sync the selectivity estimates - res = engine->settingsManager()->sync(false); - if (res.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " - << res.errorMessage(); - } + addIndex(idx); // add index to list + indexGuard.unlock(); // toVelocyPackIgnore needs the read-lock - rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); - if (!s.ok()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " - << s.ToString(); + // Step 3. 
add index to collection info + if (!engine->inRecovery()) { + auto builder = _logicalCollection.toVelocyPackIgnore( + {"path", "statusString"}, true, /*forPersistence*/ true); + res = engine->writeCreateCollectionMarker(_logicalCollection.vocbase().id(), + _logicalCollection.id(), + builder.slice(), + RocksDBLogValue::Empty()); } + RocksDBIndex* ridx = static_cast(idx.get()); + res = ridx->fillIndex(trx); + if (res.ok()) { + // we need to sync the selectivity estimates + res = engine->settingsManager()->sync(false); + if (res.fail()) { // not a deal breaker + LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " + << res.errorMessage(); + } + + rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); + if (!s.ok()) { // not a deal breaker + LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " + << s.ToString(); + } + #if USE_PLAN_CACHE - arangodb::aql::PlanCache::instance()->invalidate( - _logicalCollection->vocbase()); + arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase()); #endif - // Until here no harm is done if something fails. The shared_ptr will - // clean up, if left before - { - WRITE_LOCKER(guard, _indexesLock); - addIndex(idx); - } - if (!engine->inRecovery()) { - auto builder = _logicalCollection.toVelocyPackIgnore( - {"path", "statusString"}, true, /*forPersistence*/ true); - VPackBuilder indexInfo; - idx->toVelocyPack(indexInfo, Index::makeFlags(Index::Serialize::ObjectId)); - res = engine->writeCreateCollectionMarker( - _logicalCollection.vocbase().id(), - _logicalCollection.id(), - builder.slice(), - RocksDBLogValue::IndexCreate( + if (!engine->inRecovery()) { + auto builder = _logicalCollection.toVelocyPackIgnore( + {"path", "statusString"}, true, /*forPersistence*/ true); + VPackBuilder indexInfo; + idx->toVelocyPack(indexInfo, Index::makeFlags(Index::Serialize::ObjectId)); + res = engine->writeCreateCollectionMarker( _logicalCollection.vocbase().id(), _logicalCollection.id(), - indexInfo.slice() - ) - ); + builder.slice(), + RocksDBLogValue::IndexCreate( + _logicalCollection.vocbase().id(), + _logicalCollection.id(), + indexInfo.slice() + ) + ); + } + + if (res.ok()) { + res = trx.commit(); + } } if (res.fail()) { @@ -439,16 +434,11 @@ std::shared_ptr RocksDBCollection::createIndex( ++i; } idx->drop(); - THROW_ARANGO_EXCEPTION(res); - } - - res = trx.commit(); - if (res.fail()) { + THROW_ARANGO_EXCEPTION(res); } created = true; - return idx; } @@ -459,53 +449,53 @@ bool RocksDBCollection::dropIndex(TRI_idx_iid_t iid) { // invalid index id or primary index return true; } - - size_t i = 0; - WRITE_LOCKER(guard, _indexesLock); - for (std::shared_ptr index : _indexes) { - RocksDBIndex* cindex = static_cast(index.get()); - TRI_ASSERT(cindex != nullptr); - - if (iid == cindex->id()) { - int rv = cindex->drop(); - - if (rv == TRI_ERROR_NO_ERROR) { - // trigger compaction before deleting the object - cindex->cleanup(); - + + std::shared_ptr toRemove; + { + size_t i = 0; + WRITE_LOCKER(guard, _indexesLock); + for (std::shared_ptr& idx : _indexes) { + if (iid == idx->id()) { + toRemove = std::move(idx); _indexes.erase(_indexes.begin() + i); - events::DropIndex("", std::to_string(iid), TRI_ERROR_NO_ERROR); - // toVelocyPackIgnore will take a read lock and we don't need the - // lock anymore, this branch always returns - guard.unlock(); - - auto engine = static_cast(EngineSelectorFeature::ENGINE); - engine->removeIndexMapping(cindex->objectId()); - - auto builder = _logicalCollection.toVelocyPackIgnore( - {"path", "statusString"}, 
true, true); - - // log this event in the WAL and in the collection meta-data - int res = engine->writeCreateCollectionMarker( - _logicalCollection.vocbase().id(), - _logicalCollection.id(), - builder.slice(), - RocksDBLogValue::IndexDrop( - _logicalCollection.vocbase().id(), _logicalCollection.id(), iid - ) - ); - - return res == TRI_ERROR_NO_ERROR; + break; } - - break; + ++i; } - ++i; } + if (!toRemove) { // index not found + // We tried to remove an index that does not exist + events::DropIndex("", std::to_string(iid), TRI_ERROR_ARANGO_INDEX_NOT_FOUND); + return false; + } + + READ_LOCKER(guard, _indexesLock); + + RocksDBIndex* cindex = static_cast(toRemove.get()); + TRI_ASSERT(cindex != nullptr); - // We tried to remove an index that does not exist - events::DropIndex("", std::to_string(iid), TRI_ERROR_ARANGO_INDEX_NOT_FOUND); - return false; + int res = cindex->drop(); + if (res == TRI_ERROR_NO_ERROR) { + events::DropIndex("", std::to_string(iid), TRI_ERROR_NO_ERROR); + + // trigger compaction before deleting the object + cindex->compact(); + + auto builder = _logicalCollection.toVelocyPackIgnore( + {"path", "statusString"}, true, true); + + // log this event in the WAL and in the collection meta-data + auto engine = static_cast(EngineSelectorFeature::ENGINE); + res = engine->writeCreateCollectionMarker( + _logicalCollection.vocbase().id(), + _logicalCollection.id(), + builder.slice(), + RocksDBLogValue::IndexDrop( + _logicalCollection.vocbase().id(), _logicalCollection.id(), iid + ) + ); + } + return res == TRI_ERROR_NO_ERROR; } std::unique_ptr RocksDBCollection::getAllIterator(transaction::Methods* trx) const { @@ -1237,102 +1227,6 @@ void RocksDBCollection::addIndex(std::shared_ptr idx) { } } -template -static arangodb::Result fillIndex(transaction::Methods* trx, - RocksDBIndex* ridx, - std::unique_ptr it, - WriteBatchType& batch, - RocksDBCollection* rcol) { - auto state = RocksDBTransactionState::toState(trx); - - // fillindex can be non transactional, we just need to clean up - rocksdb::DB* db = rocksutils::globalRocksDB()->GetRootDB(); - TRI_ASSERT(db != nullptr); - - uint64_t numDocsWritten = 0; - // write batch will be reset every x documents - MethodsType batched(state, &batch); - - arangodb::Result res; - auto cb = [&](LocalDocumentId const& documentId, VPackSlice slice) { - if (res.ok()) { - res = ridx->insertInternal(trx, &batched, documentId, slice, - Index::OperationMode::normal); - if (res.ok()) { - numDocsWritten++; - } - } - }; - - rocksdb::WriteOptions wo; - - bool hasMore = true; - while (hasMore && res.ok()) { - hasMore = it->nextDocument(cb, 250); - - if (TRI_VOC_COL_STATUS_DELETED == it->collection()->status() - || it->collection()->deleted()) { - res = TRI_ERROR_INTERNAL; - } else if (application_features::ApplicationServer::isStopping()) { - res = TRI_ERROR_SHUTTING_DOWN; - } - - if (res.ok()) { - rocksdb::Status s = db->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - } - - batch.Clear(); - } - - // we will need to remove index elements created before an error - // occurred, this needs to happen since we are non transactional - if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); - arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, - true, /*useRangeDel*/numDocsWritten > 25000); - if (res2.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " - << "index creation: " << res2.errorMessage(); - } - } - - 
return res; -} - -/// non-transactional: fill index with existing documents -/// from this collection -arangodb::Result RocksDBCollection::fillIndexes( - transaction::Methods* trx, std::shared_ptr added) { - TRI_ASSERT(trx->state()->collection( - _logicalCollection.id(), AccessMode::Type::EXCLUSIVE - )); - - std::unique_ptr it(new RocksDBAllIndexIterator( - &_logicalCollection, trx, primaryIndex() - )); - - RocksDBIndex* ridx = static_cast(added.get()); - - if (ridx->unique()) { - // unique index. we need to keep track of all our changes because we need to avoid - // duplicate index keys. must therefore use a WriteBatchWithIndex - rocksdb::WriteBatchWithIndex batch(ridx->columnFamily()->GetComparator(), 32 * 1024 * 1024); - return fillIndex( - trx, ridx, std::move(it), batch, this); - } else { - // non-unique index. all index keys will be unique anyway because they contain the document id - // we can therefore get away with a cheap WriteBatch - rocksdb::WriteBatch batch(32 * 1024 * 1024); - return fillIndex( - trx, ridx, std::move(it), batch, this); - } - return Result(); -} - Result RocksDBCollection::insertDocument( arangodb::transaction::Methods* trx, LocalDocumentId const& documentId, VPackSlice const& doc, OperationOptions& options) const { @@ -1790,6 +1684,7 @@ uint64_t RocksDBCollection::recalculateCounts() { rocksdb::Slice upper(bounds.end()); rocksdb::ReadOptions ro; + ro.snapshot = snapshot; ro.prefix_same_as_start = true; ro.iterate_upper_bound = &upper; ro.verify_checksums = false; @@ -1825,7 +1720,7 @@ void RocksDBCollection::compact() { READ_LOCKER(guard, _indexesLock); for (std::shared_ptr i : _indexes) { RocksDBIndex* index = static_cast(i.get()); - index->cleanup(); + index->compact(); } } diff --git a/arangod/RocksDBEngine/RocksDBCollection.h b/arangod/RocksDBEngine/RocksDBCollection.h index 2a38e4a614bf..939386ebfc56 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.h +++ b/arangod/RocksDBEngine/RocksDBCollection.h @@ -94,9 +94,6 @@ class RocksDBCollection final : public PhysicalCollection { void prepareIndexes(arangodb::velocypack::Slice indexesSlice) override; - /// @brief Find index by definition - std::shared_ptr lookupIndex(velocypack::Slice const&) const override; - std::shared_ptr createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) override; @@ -208,9 +205,6 @@ class RocksDBCollection final : public PhysicalCollection { void figuresSpecific(std::shared_ptr&) override; void addIndex(std::shared_ptr idx); - arangodb::Result fillIndexes(transaction::Methods*, - std::shared_ptr); - // @brief return the primary index // WARNING: Make sure that this instance // is somehow protected. 
If it goes out of all scopes diff --git a/arangod/RocksDBEngine/RocksDBEngine.cpp b/arangod/RocksDBEngine/RocksDBEngine.cpp index 50c30c026ede..66c1662180f1 100644 --- a/arangod/RocksDBEngine/RocksDBEngine.cpp +++ b/arangod/RocksDBEngine/RocksDBEngine.cpp @@ -1137,7 +1137,9 @@ int RocksDBEngine::writeCreateCollectionMarker(TRI_voc_tick_t databaseId, // Write marker + key into RocksDB inside one batch rocksdb::WriteBatch batch; - batch.PutLogData(logValue.slice()); + if (logValue.slice().size() > 0) { + batch.PutLogData(logValue.slice()); + } batch.Put(RocksDBColumnFamily::definitions(), key.string(), value.string()); rocksdb::Status res = db->Write(wo, &batch); @@ -1186,14 +1188,13 @@ void RocksDBEngine::recoveryDone(TRI_vocbase_t& vocbase) { std::string RocksDBEngine::createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t cid, LogicalCollection const& collection ) { - auto builder = collection.toVelocyPackIgnore( - {"path", "statusString"}, /*translate cid*/ true, /*for persistence*/ true - ); - + const TRI_voc_cid_t cid = collection.id(); TRI_ASSERT(cid != 0); + + auto builder = collection.toVelocyPackIgnore({"path", "statusString"}, + /*translateCid*/ true, /*forPersist*/ true); TRI_UpdateTickServer(static_cast(cid)); int res = writeCreateCollectionMarker( @@ -1358,7 +1359,6 @@ void RocksDBEngine::destroyCollection( void RocksDBEngine::changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) { @@ -1367,9 +1367,9 @@ void RocksDBEngine::changeCollection( ); int res = writeCreateCollectionMarker( vocbase.id(), - id, + collection.id(), builder.slice(), - RocksDBLogValue::CollectionChange(vocbase.id(), id) + RocksDBLogValue::CollectionChange(vocbase.id(), collection.id()) ); if (res != TRI_ERROR_NO_ERROR) { diff --git a/arangod/RocksDBEngine/RocksDBEngine.h b/arangod/RocksDBEngine/RocksDBEngine.h index 9d7d98af5b0b..f37a4165393f 100644 --- a/arangod/RocksDBEngine/RocksDBEngine.h +++ b/arangod/RocksDBEngine/RocksDBEngine.h @@ -235,7 +235,6 @@ class RocksDBEngine final : public StorageEngine { public: std::string createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection ) override; @@ -256,7 +255,6 @@ class RocksDBEngine final : public StorageEngine { void changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) override; diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index 4d2aa9b3678a..fdb24450b306 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -38,6 +38,10 @@ #include "VocBase/ticks.h" #include +#include +#include +#include + using namespace arangodb; using namespace arangodb::rocksutils; @@ -49,6 +53,10 @@ using namespace arangodb::rocksutils; uint64_t const arangodb::RocksDBIndex::ESTIMATOR_SIZE = 4096; +inline static uint64_t ensureObjectId(uint64_t oid) { + return (oid != 0) ? oid : TRI_NewTickServer(); +} + RocksDBIndex::RocksDBIndex( TRI_idx_iid_t id, LogicalCollection& collection, @@ -60,11 +68,12 @@ RocksDBIndex::RocksDBIndex( bool useCache ) : Index(id, collection, attributes, unique, sparse), - _objectId((objectId != 0) ? 
objectId : TRI_NewTickServer()), + _objectId(ensureObjectId(objectId)), _cf(cf), _cache(nullptr), _cachePresent(false), - _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr) { + _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr), + _isBuilding(false) { TRI_ASSERT(cf != nullptr && cf != RocksDBColumnFamily::definitions()); if (_cacheEnabled) { @@ -86,29 +95,28 @@ RocksDBIndex::RocksDBIndex( bool useCache ) : Index(id, collection, info), - _objectId(basics::VelocyPackHelper::stringUInt64(info.get("objectId"))), + _objectId(ensureObjectId(basics::VelocyPackHelper::stringUInt64(info.get("objectId")))), _cf(cf), _cache(nullptr), _cachePresent(false), - _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr) { + _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr), + _isBuilding(basics::VelocyPackHelper::getBooleanValue(info, StaticStrings::IndexIsBuilding, false)) { TRI_ASSERT(cf != nullptr && cf != RocksDBColumnFamily::definitions()); - if (_objectId == 0) { - _objectId = TRI_NewTickServer(); - } - if (_cacheEnabled) { createCache(); } RocksDBEngine* engine = static_cast(EngineSelectorFeature::ENGINE); - engine->addIndexMapping( _objectId, collection.vocbase().id(), collection.id(), _iid ); } RocksDBIndex::~RocksDBIndex() { + auto engine = static_cast(EngineSelectorFeature::ENGINE); + engine->removeIndexMapping(_objectId); + if (useCache()) { try { TRI_ASSERT(_cache != nullptr); @@ -279,7 +287,7 @@ size_t RocksDBIndex::memory() const { } /// compact the index, should reduce read amplification -void RocksDBIndex::cleanup() { +void RocksDBIndex::compact() { rocksdb::TransactionDB* db = rocksutils::globalRocksDB(); rocksdb::CompactRangeOptions opts; if (_cf != RocksDBColumnFamily::invalid()) { @@ -345,3 +353,179 @@ RocksDBCuckooIndexEstimator* RocksDBIndex::estimator() { void RocksDBIndex::setEstimator(std::unique_ptr>) { // Nothing to do. 
} + +template +static arangodb::Result fillIndex(transaction::Methods& trx, + RocksDBIndex* ridx, + RocksDBCollection* coll, + WriteBatchType& batch) { + auto state = RocksDBTransactionState::toState(&trx); + arangodb::Result res; + + // fillIndex can be non-transactional, we just need to clean up + RocksDBEngine* engine = rocksutils::globalRocksEngine(); + bool const assumeExclusive = engine->inRecovery(); + rocksdb::DB* rootDB = engine->db()->GetRootDB(); + TRI_ASSERT(rootDB != nullptr); + + uint64_t numDocsWritten = 0; + // write batch will be reset every x documents + MethodsType batched(state, &batch); + + auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); + rocksdb::Slice upper(bounds.end()); + + rocksdb::Status s; + rocksdb::WriteOptions wo; + wo.disableWAL = false; // TODO set to true eventually + + // we iterate without a snapshot + rocksdb::ReadOptions ro; + ro.prefix_same_as_start = true; + ro.iterate_upper_bound = &upper; + ro.verify_checksums = false; + ro.fill_cache = false; + + rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); + std::unique_ptr it(rootDB->NewIterator(ro, docCF)); + + + // empty transaction used to lock the keys + rocksdb::TransactionOptions to; + to.lock_timeout = 100; // 100ms + std::unique_ptr rtrx(engine->db()->BeginTransaction(wo)); + std::vector toRevisit; + toRevisit.reserve(1024); + + it->Seek(bounds.start()); + while (it->Valid() && it->key().compare(upper) < 0) { + + bool skipKey = false; + if (!assumeExclusive) { + rocksdb::PinnableSlice slice; + s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); + if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { + LOG_DEVEL << "was not able to lock key"; + toRevisit.push_back(RocksDBKey::documentId(it->key())); + skipKey = true; + } else if (s.IsNotFound()) { // deleted while we were looking + skipKey = true; + } + } + + if (!skipKey) { + res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), + VPackSlice(it->value().data()), + Index::OperationMode::normal); + if (res.fail()) { + break; + } + numDocsWritten++; + } + + if (numDocsWritten % 200 == 0) { // commit buffered writes + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + batch.Clear(); + engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys + } + + it->Next(); + } + + if (res.ok() && batch.GetWriteBatch()->Count() > 0) { + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + + batch.Clear(); + if (res.ok() && !toRevisit.empty()) { // now roll up skipped keys + to.lock_timeout = 5000; // longer timeout to increase the odds + engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys + RocksDBKey key; + + for (LocalDocumentId const& doc : toRevisit) { + key.constructDocument(coll->objectId(), doc); + + rocksdb::PinnableSlice slice; + s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + res = ridx->insertInternal(&trx, &batched, doc, + VPackSlice(slice.data()), + Index::OperationMode::normal); + if (res.fail()) { + break; + } + + numDocsWritten++; + } + } + +// rocksdb::WriteOptions wo; +// +// bool hasMore = true; +// while (hasMore && res.ok()) { +// hasMore = it->nextDocument(cb, 250); +// +// if (TRI_VOC_COL_STATUS_DELETED == 
it->collection()->status() +// || it->collection()->deleted()) { +// res = TRI_ERROR_INTERNAL; +// } else if (application_features::ApplicationServer::isStopping()) { +// res = TRI_ERROR_SHUTTING_DOWN; +// } +// +// if (res.ok()) { +// rocksdb::Status s = db->Write(wo, batch.GetWriteBatch()); +// if (!s.ok()) { +// res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); +// break; +// } +// } +// +// batch.Clear(); +// } + + // we will need to remove index elements created before an error + // occurred, this needs to happen since we are non transactional + if (res.fail()) { + RocksDBKeyBounds bounds = ridx->getBounds(); + arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, + true, /*useRangeDel*/numDocsWritten > 25000); + if (res2.fail()) { + LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " + << "index creation: " << res2.errorMessage(); + } + } + + return res; +} + +/// non-transactional: fill index with existing documents +/// from this collection
arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx) { + TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE)); + +// std::unique_ptr it(new RocksDBAllIndexIterator(&_collection, trx, primaryIndex())); + RocksDBCollection* coll = static_cast(_collection.getPhysical()); + + if (this->unique()) { + // unique index. we need to keep track of all our changes because we need to avoid + // duplicate index keys. must therefore use a WriteBatchWithIndex + rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); + return ::fillIndex(trx, this, coll, batch); + } else { + // non-unique index. all index keys will be unique anyway because they contain the document id + // we can therefore get away with a cheap WriteBatch + rocksdb::WriteBatch batch(32 * 1024 * 1024); + return ::fillIndex(trx, this, coll, batch); + } + return Result(); +} diff --git a/arangod/RocksDBEngine/RocksDBIndex.h index d29e8abb0e11..0dd01a93bcc3 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.h +++ b/arangod/RocksDBEngine/RocksDBIndex.h @@ -62,8 +62,13 @@ class RocksDBIndex : public Index { std::underlying_type::type) const override; uint64_t objectId() const { return _objectId; } + + /// @brief if true this index should not be shown externally + virtual bool isHidden() const override { + return isBuilding(); // do not show building indexes + } - bool isPersistent() const override final { return true; } + size_t memory() const override; int drop() override; virtual void afterTruncate(TRI_voc_tick_t tick) override; @@ -71,9 +76,8 @@ class RocksDBIndex : public Index { void load() override; void unload() override; - size_t memory() const override; - - void cleanup(); + /// compact the index, should reduce read amplification + void compact(); /// @brief provides a size hint for the index int sizeHint(transaction::Methods* /*trx*/, size_t /*size*/) override final { @@ -130,11 +134,19 @@ class RocksDBIndex : public Index { static RocksDBKeyBounds getBounds(Index::IndexType type, uint64_t objectId, bool unique); + + /// @brief is this index still being built + bool isBuilding() const { + return _isBuilding.load(std::memory_order_acquire); + } + /// @brief get index estimator, optional virtual RocksDBCuckooIndexEstimator* estimator(); virtual void setEstimator(std::unique_ptr>); virtual void recalculateEstimates() {} + arangodb::Result fillIndex(transaction::Methods&); + protected: RocksDBIndex( TRI_idx_iid_t id, @@ -158,6 +170,8 
@@ class RocksDBIndex : public Index { inline bool useCache() const { return (_cacheEnabled && _cachePresent); } void blackListKey(char const* data, std::size_t len); void blackListKey(StringRef& ref) { blackListKey(ref.data(), ref.size()); }; + + protected: uint64_t _objectId; rocksdb::ColumnFamilyHandle* _cf; @@ -167,6 +181,11 @@ class RocksDBIndex : public Index { // it's quicker than accessing the shared_ptr each time bool _cachePresent; bool _cacheEnabled; + + private: + + /// is this index currently building + std::atomic _isBuilding; }; } // namespace arangodb diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp index 5ec9c02e864b..db06bfa38376 100644 --- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp +++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp @@ -627,9 +627,9 @@ void RocksDBIndexFactory::prepareIndexes( auto value = v.get("type"); if (value.isString()) { - std::string tmp = value.copyString(); - arangodb::Index::IndexType const type = - arangodb::Index::type(tmp.c_str()); + VPackValueLength len; + const char* tmp = value.getStringUnchecked(len); + arangodb::Index::IndexType const type = arangodb::Index::type(tmp, len); if (type == Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX) { VPackSlice fields = v.get("fields"); diff --git a/arangod/RocksDBEngine/RocksDBLogValue.cpp b/arangod/RocksDBEngine/RocksDBLogValue.cpp index 3e3f94a898f9..d3da40c9ca09 100644 --- a/arangod/RocksDBEngine/RocksDBLogValue.cpp +++ b/arangod/RocksDBEngine/RocksDBLogValue.cpp @@ -129,6 +129,10 @@ RocksDBLogValue RocksDBLogValue::SingleRemoveV2(TRI_voc_tick_t vocbaseId, return RocksDBLogValue(RocksDBLogType::SingleRemoveV2, vocbaseId, cid, rid); } +/*static*/ RocksDBLogValue RocksDBLogValue::Empty() { + return RocksDBLogValue(); +} + RocksDBLogValue::RocksDBLogValue(RocksDBLogType type, uint64_t val) : _buffer() { switch (type) { diff --git a/arangod/RocksDBEngine/RocksDBLogValue.h b/arangod/RocksDBEngine/RocksDBLogValue.h index f22eae87dbc4..ac9ef040377d 100644 --- a/arangod/RocksDBEngine/RocksDBLogValue.h +++ b/arangod/RocksDBEngine/RocksDBLogValue.h @@ -85,6 +85,9 @@ class RocksDBLogValue { static RocksDBLogValue SinglePut(TRI_voc_tick_t vocbaseId, TRI_voc_cid_t cid); static RocksDBLogValue SingleRemoveV2(TRI_voc_tick_t vocbaseId, TRI_voc_cid_t cid, TRI_voc_rid_t rid); + + // empty log value + static RocksDBLogValue Empty(); public: @@ -126,6 +129,7 @@ class RocksDBLogValue { rocksdb::Slice slice() const { return rocksdb::Slice(_buffer); } private: + explicit RocksDBLogValue() {} RocksDBLogValue(RocksDBLogType, uint64_t); RocksDBLogValue(RocksDBLogType, uint64_t, uint64_t); RocksDBLogValue(RocksDBLogType, uint64_t, uint64_t, uint64_t); diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.cpp b/arangod/RocksDBEngine/RocksDBTransactionState.cpp index 2be51a4a46f0..339be6b0b899 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.cpp +++ b/arangod/RocksDBEngine/RocksDBTransactionState.cpp @@ -575,11 +575,6 @@ Result RocksDBTransactionState::addOperation( return checkIntermediateCommit(currentSize, hasPerformedIntermediateCommit); } -RocksDBMethods* RocksDBTransactionState::rocksdbMethods() { - TRI_ASSERT(_rocksMethods); - return _rocksMethods.get(); -} - uint64_t RocksDBTransactionState::sequenceNumber() const { if (_rocksTransaction) { return static_cast( diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.h b/arangod/RocksDBEngine/RocksDBTransactionState.h index 64266a4fc492..1f18d7aea0b2 100644 --- 
a/arangod/RocksDBEngine/RocksDBTransactionState.h +++ b/arangod/RocksDBEngine/RocksDBTransactionState.h @@ -120,7 +120,10 @@ class RocksDBTransactionState final : public TransactionState { bool& hasPerformedIntermediateCommit); /// @brief return wrapper around rocksdb transaction - RocksDBMethods* rocksdbMethods(); + RocksDBMethods* rocksdbMethods() { + TRI_ASSERT(_rocksMethods); + return _rocksMethods.get(); + } /// @brief Rocksdb sequence number of snapshot. Works while trx /// has either a snapshot or a transaction diff --git a/arangod/StorageEngine/PhysicalCollection.cpp b/arangod/StorageEngine/PhysicalCollection.cpp index 7a56c83e57ca..6083dc6aa907 100644 --- a/arangod/StorageEngine/PhysicalCollection.cpp +++ b/arangod/StorageEngine/PhysicalCollection.cpp @@ -105,6 +105,41 @@ bool PhysicalCollection::hasIndexOfType(arangodb::Index::IndexType type) const { return false; } +/// @brief Find index by definition +/*static*/ std::shared_ptr PhysicalCollection::findIndex( + VPackSlice const& info, + std::vector> const& indexes) { + TRI_ASSERT(info.isObject()); + + auto value = info.get(arangodb::StaticStrings::IndexType); // extract type + + if (!value.isString()) { + // Compatibility with old v8-vocindex. + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, + "invalid index type definition"); + } + + VPackValueLength len; + const char* str = value.getStringUnchecked(len); + arangodb::Index::IndexType const type = arangodb::Index::type(str, len); + for (auto const& idx : indexes) { + if (idx->type() == type) { + // Only check relevant indexes + if (idx->matchesDefinition(info)) { + // We found an index for this definition. + return idx; + } + } + } + return nullptr; +} + +/// @brief Find index by definition +std::shared_ptr PhysicalCollection::lookupIndex(VPackSlice const& info) const { + READ_LOCKER(guard, _indexesLock); + return findIndex(info, _indexes); +} + std::shared_ptr PhysicalCollection::lookupIndex( TRI_idx_iid_t idxId) const { READ_LOCKER(guard, _indexesLock); diff --git a/arangod/StorageEngine/PhysicalCollection.h b/arangod/StorageEngine/PhysicalCollection.h index d23b1221c7fc..0dd2bac52d7b 100644 --- a/arangod/StorageEngine/PhysicalCollection.h +++ b/arangod/StorageEngine/PhysicalCollection.h @@ -100,23 +100,15 @@ class PhysicalCollection { bool hasIndexOfType(arangodb::Index::IndexType type) const; + /// @brief find index by definition + static std::shared_ptr findIndex(velocypack::Slice const&, + std::vector> const&); /// @brief Find index by definition - virtual std::shared_ptr lookupIndex( - velocypack::Slice const&) const = 0; + std::shared_ptr lookupIndex(velocypack::Slice const&) const; /// @brief Find index by iid std::shared_ptr lookupIndex(TRI_idx_iid_t) const; std::vector> getIndexes() const; - template - void enumerateIndexes(F&& f) { - _indexesLock.readLock(); // avoid including ReadLocker.h - try { - for (auto& idx : _indexes) { - std::forward(f)(idx); - } - } catch(...) 
{} - _indexesLock.unlockRead(); - } void getIndexesVPack(velocypack::Builder&, unsigned flags, std::function const& filter) const; diff --git a/arangod/StorageEngine/StorageEngine.h b/arangod/StorageEngine/StorageEngine.h index 0468b0aadf79..2b838b6a8657 100644 --- a/arangod/StorageEngine/StorageEngine.h +++ b/arangod/StorageEngine/StorageEngine.h @@ -264,7 +264,6 @@ class StorageEngine : public application_features::ApplicationFeature { // to "createCollection" returns virtual std::string createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection ) = 0; @@ -304,7 +303,6 @@ class StorageEngine : public application_features::ApplicationFeature { // to "changeCollection" returns virtual void changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, LogicalCollection const& collection, bool doSync ) = 0; diff --git a/arangod/Transaction/Methods.cpp b/arangod/Transaction/Methods.cpp index a1b5be9f851f..7a7c68f5e5c3 100644 --- a/arangod/Transaction/Methods.cpp +++ b/arangod/Transaction/Methods.cpp @@ -3243,7 +3243,16 @@ std::vector> transaction::Methods::indexesForCollection( TRI_voc_cid_t cid = addCollectionAtRuntime(collectionName); LogicalCollection* document = documentCollection(trxCollection(cid)); - return document->getIndexes(); + std::vector> indexes = document->getIndexes(); + auto it = indexes.begin(); + while (it != indexes.end()) { + if ((*it)->isHidden()) { + it = indexes.erase(it); + } else { + it++; + } + } + return indexes; } /// @brief Lock all collections. Only works for selected sub-classes diff --git a/arangod/V8Server/v8-vocindex.cpp b/arangod/V8Server/v8-vocindex.cpp index 1db700a0684a..4769226609fb 100644 --- a/arangod/V8Server/v8-vocindex.cpp +++ b/arangod/V8Server/v8-vocindex.cpp @@ -179,14 +179,13 @@ static void JS_GetIndexesVocbaseCol( flags = Index::makeFlags(Index::Serialize::Estimates, Index::Serialize::Figures); } - bool withLinks = false; - + bool withHidden = false; if (args.Length() > 1) { - withLinks = TRI_ObjectToBoolean(args[1]); + withHidden = TRI_ObjectToBoolean(args[1]); } VPackBuilder output; - auto res = methods::Indexes::getAll(collection, flags, withLinks, output); + auto res = methods::Indexes::getAll(collection, flags, withHidden, output); if (res.fail()) { TRI_V8_THROW_EXCEPTION(res); diff --git a/arangod/VocBase/LogicalCollection.cpp b/arangod/VocBase/LogicalCollection.cpp index 757fb9edcaa0..38ac7cdae71d 100644 --- a/arangod/VocBase/LogicalCollection.cpp +++ b/arangod/VocBase/LogicalCollection.cpp @@ -499,7 +499,7 @@ Result LogicalCollection::rename(std::string&& newName) { TRI_ASSERT(engine != nullptr); name(std::move(newName)); - engine->changeCollection(vocbase(), id(), *this, doSync); + engine->changeCollection(vocbase(), *this, doSync); } catch (basics::Exception const& ex) { // Engine Rename somehow failed. 
Reset to old name name(std::move(oldName)); @@ -593,7 +593,7 @@ void LogicalCollection::toVelocyPackForClusterInventory(VPackBuilder& result, // AND exclude arangosearch indexes return (idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX && idx->type() != arangodb::Index::TRI_IDX_TYPE_EDGE_INDEX && - idx->type() != arangodb::Index::TRI_IDX_TYPE_IRESEARCH_LINK); + !idx->isHidden()); }); result.add("planVersion", VPackValue(planVersion())); result.add("isReady", VPackValue(isReady)); @@ -645,10 +645,13 @@ arangodb::Result LogicalCollection::appendVelocyPack( // Indexes result.add(VPackValue("indexes")); auto flags = Index::makeFlags(); + auto filter = [&](arangodb::Index const* idx) { // hide hidden indexes + return (forPersistence || !idx->isHidden()); + }; if (forPersistence) { flags = Index::makeFlags(Index::Serialize::ObjectId); } - getIndexesVPack(result, flags); + getIndexesVPack(result, flags, filter); // Cluster Specific result.add(StaticStrings::IsSmart, VPackValue(_isSmart)); @@ -684,6 +687,9 @@ VPackBuilder LogicalCollection::toVelocyPackIgnore( full.openObject(); properties(full, translateCids, forPersistence); full.close(); + if (ignoreKeys.empty()) { + return full; + } return VPackCollection::remove(full.slice(), ignoreKeys); } @@ -798,7 +804,7 @@ arangodb::Result LogicalCollection::properties( ); } - engine->changeCollection(vocbase(), id(), *this, doSync); + engine->changeCollection(vocbase(), *this, doSync); if (DatabaseFeature::DATABASE != nullptr && DatabaseFeature::DATABASE->versionTracker() != nullptr) { @@ -874,7 +880,7 @@ void LogicalCollection::persistPhysicalCollection() { TRI_ASSERT(!ServerState::instance()->isCoordinator()); StorageEngine* engine = EngineSelectorFeature::ENGINE; - auto path = engine->createCollection(vocbase(), id(), *this); + auto path = engine->createCollection(vocbase(), *this); getPhysical()->setPath(path); } diff --git a/arangod/VocBase/Methods/Indexes.cpp b/arangod/VocBase/Methods/Indexes.cpp index f5daf29cd68d..0145355ea02a 100644 --- a/arangod/VocBase/Methods/Indexes.cpp +++ b/arangod/VocBase/Methods/Indexes.cpp @@ -83,7 +83,7 @@ Result Indexes::getIndex(LogicalCollection const* collection, VPackBuilder tmp; - Result res = Indexes::getAll(collection, Index::makeFlags(), false, tmp); + Result res = Indexes::getAll(collection, Index::makeFlags(), /*withHidden*/true, tmp); if (res.ok()) { for (VPackSlice const& index : VPackArrayIterator(tmp.slice())) { if (index.get("id").compareString(name) == 0) { @@ -98,7 +98,7 @@ Result Indexes::getIndex(LogicalCollection const* collection, /// @brief get all indexes, skips view links arangodb::Result Indexes::getAll(LogicalCollection const* collection, std::underlying_type::type flags, - bool withLinks, + bool withHidden, VPackBuilder& result) { VPackBuilder tmp; if (ServerState::instance()->isCoordinator()) { @@ -119,13 +119,9 @@ arangodb::Result Indexes::getAll(LogicalCollection const* collection, VPackBuilder tmpInner; auto c = ClusterInfo::instance()->getCollection(databaseName, cid); -#ifdef USE_IRESEARCH - c->getIndexesVPack(tmpInner, flags, [withLinks](arangodb::Index const* idx) { - return withLinks || idx->type() != Index::TRI_IDX_TYPE_IRESEARCH_LINK; + c->getIndexesVPack(tmpInner, flags, [&](arangodb::Index const* idx) { + return !withHidden || !idx->isHidden(); }); -#else - c->getIndexesVPack(tmpInner, flags); -#endif tmp.openArray(); for (VPackSlice const& s : VPackArrayIterator(tmpInner.slice())) { @@ -169,11 +165,9 @@ arangodb::Result Indexes::getAll(LogicalCollection const* 
collection, tmp.openArray(true); for (std::shared_ptr const& idx : indexes) { -#ifdef USE_IRESEARCH - if (!withLinks && idx->type() == Index::TRI_IDX_TYPE_IRESEARCH_LINK) { + if (!withHidden && idx->isHidden()) { continue; } -#endif idx->toVelocyPack(tmp, flags); } tmp.close(); diff --git a/arangod/VocBase/Methods/Indexes.h b/arangod/VocBase/Methods/Indexes.h index c726284b8397..558e0133f7be 100644 --- a/arangod/VocBase/Methods/Indexes.h +++ b/arangod/VocBase/Methods/Indexes.h @@ -48,8 +48,7 @@ struct Indexes { /// @brief get all indexes, skips view links static arangodb::Result getAll(LogicalCollection const* collection, std::underlying_type::type, - bool skipLinks, - arangodb::velocypack::Builder&); + bool withHidden, arangodb::velocypack::Builder&); static arangodb::Result createIndex(LogicalCollection*, Index::IndexType, std::vector const&, diff --git a/arangod/VocBase/vocbase.cpp b/arangod/VocBase/vocbase.cpp index b109015bc8e0..99800a5f2993 100644 --- a/arangod/VocBase/vocbase.cpp +++ b/arangod/VocBase/vocbase.cpp @@ -778,9 +778,7 @@ int TRI_vocbase_t::dropCollectionWorker(arangodb::LogicalCollection* collection, collection->deleted(true); try { - engine->changeCollection( - *this, collection->id(), *collection, doSync - ); + engine->changeCollection(*this, *collection, doSync); } catch (arangodb::basics::Exception const& ex) { collection->deleted(false); events::DropCollection(colName, ex.code()); @@ -1015,8 +1013,7 @@ void TRI_vocbase_t::inventory( collection->getIndexesVPack(result, Index::makeFlags(), [](arangodb::Index const* idx) { // we have to exclude the primary, edge index and links for dump / restore return (idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX && - idx->type() != arangodb::Index::TRI_IDX_TYPE_EDGE_INDEX && - idx->type() != arangodb::Index::TRI_IDX_TYPE_IRESEARCH_LINK); + idx->type() != arangodb::Index::TRI_IDX_TYPE_EDGE_INDEX && !idx->isHidden()); }); result.add("parameters", VPackValue(VPackValueType::Object)); collection->toVelocyPackIgnore(result, { "objectId", "path", "statusString", "indexes" }, true, false); diff --git a/js/client/modules/@arangodb/arango-collection.js b/js/client/modules/@arangodb/arango-collection.js index 36ebd7eb75ab..3d677f6ee287 100644 --- a/js/client/modules/@arangodb/arango-collection.js +++ b/js/client/modules/@arangodb/arango-collection.js @@ -611,10 +611,10 @@ ArangoCollection.prototype.refresh = function () { // / @brief gets all indexes // ////////////////////////////////////////////////////////////////////////////// -ArangoCollection.prototype.getIndexes = ArangoCollection.prototype.indexes = function (withStats, withLinks) { +ArangoCollection.prototype.getIndexes = ArangoCollection.prototype.indexes = function (withStats, withHidden) { let url = this._indexurl() + '&withStats=' + (withStats || false); - if (withLinks) { - url += '&withLinks=true'; + if (withHidden) { + url += '&withHidden=true'; } var requestResult = this._database._connection.GET(url); diff --git a/lib/Basics/StaticStrings.cpp b/lib/Basics/StaticStrings.cpp index 4fd3c118871e..82005548c3fa 100644 --- a/lib/Basics/StaticStrings.cpp +++ b/lib/Basics/StaticStrings.cpp @@ -95,6 +95,7 @@ std::string const StaticStrings::IndexId("id"); std::string const StaticStrings::IndexSparse("sparse"); std::string const StaticStrings::IndexType("type"); std::string const StaticStrings::IndexUnique("unique"); +std::string const StaticStrings::IndexIsBuilding("isBuilding"); // HTTP headers std::string const StaticStrings::Accept("accept"); diff --git 
a/lib/Basics/StaticStrings.h b/lib/Basics/StaticStrings.h index da9b0c2dae54..d038b8f29dec 100644 --- a/lib/Basics/StaticStrings.h +++ b/lib/Basics/StaticStrings.h @@ -94,6 +94,7 @@ class StaticStrings { static std::string const IndexSparse; // index sparsness marker static std::string const IndexType; // index type static std::string const IndexUnique; // index uniqueness marker + static std::string const IndexIsBuilding; // index build in-process // HTTP headers static std::string const Accept; diff --git a/tests/IResearch/IResearchLink-test.cpp b/tests/IResearch/IResearchLink-test.cpp index 60a37d393fa7..905c10f00343 100644 --- a/tests/IResearch/IResearchLink-test.cpp +++ b/tests/IResearch/IResearchLink-test.cpp @@ -211,7 +211,6 @@ SECTION("test_defaults") { CHECK((false == link->hasExpansion())); CHECK((false == link->hasSelectivityEstimate())); CHECK((false == link->implicitlyUnique())); - CHECK((true == link->isPersistent())); CHECK((false == link->isSorted())); CHECK((0 < link->memory())); CHECK((true == link->sparse())); @@ -263,7 +262,6 @@ SECTION("test_defaults") { CHECK((false == link->hasExpansion())); CHECK((false == link->hasSelectivityEstimate())); CHECK((false == link->implicitlyUnique())); - CHECK((true == link->isPersistent())); CHECK((false == link->isSorted())); CHECK((0 < link->memory())); CHECK((true == link->sparse())); diff --git a/tests/IResearch/IResearchLinkCoordinator-test.cpp b/tests/IResearch/IResearchLinkCoordinator-test.cpp index 639adf46311e..05a20ecb43a1 100644 --- a/tests/IResearch/IResearchLinkCoordinator-test.cpp +++ b/tests/IResearch/IResearchLinkCoordinator-test.cpp @@ -349,7 +349,6 @@ SECTION("test_create_drop") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -458,7 +457,6 @@ SECTION("test_create_drop") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); diff --git a/tests/IResearch/IResearchViewCoordinator-test.cpp b/tests/IResearch/IResearchViewCoordinator-test.cpp index 1d42fa264317..be0c9b890822 100644 --- a/tests/IResearch/IResearchViewCoordinator-test.cpp +++ b/tests/IResearch/IResearchViewCoordinator-test.cpp @@ -1191,7 +1191,6 @@ SECTION("test_update_links_partial_remove") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1237,7 +1236,6 @@ SECTION("test_update_links_partial_remove") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1283,7 +1281,6 @@ SECTION("test_update_links_partial_remove") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < 
index->memory())); CHECK((true == index->sparse())); @@ -1416,7 +1413,6 @@ SECTION("test_update_links_partial_remove") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1463,7 +1459,6 @@ SECTION("test_update_links_partial_remove") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1736,7 +1731,6 @@ SECTION("test_update_links_partial_add") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1782,7 +1776,6 @@ SECTION("test_update_links_partial_add") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1930,7 +1923,6 @@ SECTION("test_update_links_partial_add") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -1977,7 +1969,6 @@ SECTION("test_update_links_partial_add") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2024,7 +2015,6 @@ SECTION("test_update_links_partial_add") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2344,7 +2334,6 @@ SECTION("test_update_links_replace") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2390,7 +2379,6 @@ SECTION("test_update_links_replace") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2520,7 +2508,6 @@ SECTION("test_update_links_replace") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2642,7 +2629,6 @@ 
SECTION("test_update_links_replace") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2932,7 +2918,6 @@ SECTION("test_update_links_clear") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -2979,7 +2964,6 @@ SECTION("test_update_links_clear") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -3025,7 +3009,6 @@ SECTION("test_update_links_clear") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); @@ -3293,7 +3276,6 @@ SECTION("test_drop_link") { CHECK((false == index->hasExpansion())); CHECK((false == index->hasSelectivityEstimate())); CHECK((false == index->implicitlyUnique())); - CHECK((true == index->isPersistent())); CHECK((false == index->isSorted())); CHECK((0 < index->memory())); CHECK((true == index->sparse())); diff --git a/tests/IResearch/StorageEngineMock.cpp b/tests/IResearch/StorageEngineMock.cpp index 78c1f9cf21b6..427636b18e51 100644 --- a/tests/IResearch/StorageEngineMock.cpp +++ b/tests/IResearch/StorageEngineMock.cpp @@ -168,6 +168,8 @@ class EdgeIndexMock final : public arangodb::Index { bool canBeDropped() const override { return false; } + bool isHidden() const override { return false; } + bool isSorted() const override { return false; } bool hasSelectivityEstimate() const override { return false; } @@ -701,12 +703,6 @@ void PhysicalCollectionMock::invokeOnAllElements(arangodb::transaction::Methods* } } -std::shared_ptr PhysicalCollectionMock::lookupIndex(arangodb::velocypack::Slice const&) const { - before(); - TRI_ASSERT(false); - return nullptr; -} - arangodb::LocalDocumentId PhysicalCollectionMock::lookupKey(arangodb::transaction::Methods*, arangodb::velocypack::Slice const&) const { before(); TRI_ASSERT(false); @@ -1028,7 +1024,6 @@ void StorageEngineMock::addV8Functions() { void StorageEngineMock::changeCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, arangodb::LogicalCollection const& collection, bool doSync ) { @@ -1061,7 +1056,6 @@ std::string StorageEngineMock::collectionPath( std::string StorageEngineMock::createCollection( TRI_vocbase_t& vocbase, - TRI_voc_cid_t id, arangodb::LogicalCollection const& collection ) { return ""; // physical path of the new collection diff --git a/tests/IResearch/StorageEngineMock.h b/tests/IResearch/StorageEngineMock.h index a27d5f9837d2..a9877b7358d3 100644 --- a/tests/IResearch/StorageEngineMock.h +++ b/tests/IResearch/StorageEngineMock.h @@ -36,7 +36,7 @@ namespace arangodb { -class KeyLockInfo; +struct KeyLockInfo; class TransactionManager; class WalAccess; @@ -76,7 +76,6 @@ class PhysicalCollectionMock: public arangodb::PhysicalCollection { arangodb::KeyLockInfo* /*keyLockInfo*/, std::function 
callbackDuringLock) override; virtual void invokeOnAllElements(arangodb::transaction::Methods* trx, std::function callback) override; - virtual std::shared_ptr lookupIndex(arangodb::velocypack::Slice const&) const override; virtual arangodb::LocalDocumentId lookupKey(arangodb::transaction::Methods*, arangodb::velocypack::Slice const&) const override; virtual size_t memory() const override; virtual uint64_t numberDocuments(arangodb::transaction::Methods* trx) const override; @@ -164,10 +163,10 @@ class StorageEngineMock: public arangodb::StorageEngine { virtual void addOptimizerRules() override; virtual void addRestHandlers(arangodb::rest::RestHandlerFactory& handlerFactory) override; virtual void addV8Functions() override; - virtual void changeCollection(TRI_vocbase_t& vocbase, TRI_voc_cid_t id, arangodb::LogicalCollection const& collection, bool doSync) override; + virtual void changeCollection(TRI_vocbase_t& vocbase, arangodb::LogicalCollection const& collection, bool doSync) override; virtual arangodb::Result changeView(TRI_vocbase_t& vocbase, arangodb::LogicalView const& view, bool doSync) override; virtual std::string collectionPath(TRI_vocbase_t const& vocbase, TRI_voc_cid_t id) const override; - virtual std::string createCollection(TRI_vocbase_t& vocbase, TRI_voc_cid_t id, arangodb::LogicalCollection const& collection) override; + virtual std::string createCollection(TRI_vocbase_t& vocbase, arangodb::LogicalCollection const& collection) override; virtual std::unique_ptr createDatabase(TRI_voc_tick_t id, arangodb::velocypack::Slice const& args, int& status) override; virtual arangodb::Result createLoggerState(TRI_vocbase_t*, VPackBuilder&) override; virtual std::unique_ptr createPhysicalCollection(arangodb::LogicalCollection& collection, arangodb::velocypack::Slice const& info) override; From 38b35001b4994fc6308a7fb0b2d7097a55a47e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Mon, 3 Dec 2018 21:11:41 +0100 Subject: [PATCH 02/31] make sure index is hidden --- arangod/RocksDBEngine/RocksDBIndex.cpp | 4 ++++ arangod/VocBase/Methods/Indexes.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index fdb24450b306..03309a250a18 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -512,9 +512,13 @@ static arangodb::Result fillIndex(transaction::Methods& trx, /// from this collection arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx) { TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE)); + _isBuilding.store(true, std::memory_order_release); // std::unique_ptr it(new RocksDBAllIndexIterator(&_collection, trx, primaryIndex())); RocksDBCollection* coll = static_cast(_collection.getPhysical()); + auto guard = scopeGuard([this] { + _isBuilding.store(false, std::memory_order_release); + }); if (this->unique()) { // unique index. 
we need to keep track of all our changes because we need to avoid diff --git a/arangod/VocBase/Methods/Indexes.cpp b/arangod/VocBase/Methods/Indexes.cpp index 0145355ea02a..a5133bd4e893 100644 --- a/arangod/VocBase/Methods/Indexes.cpp +++ b/arangod/VocBase/Methods/Indexes.cpp @@ -120,7 +120,7 @@ arangodb::Result Indexes::getAll(LogicalCollection const* collection, VPackBuilder tmpInner; auto c = ClusterInfo::instance()->getCollection(databaseName, cid); c->getIndexesVPack(tmpInner, flags, [&](arangodb::Index const* idx) { - return !withHidden || !idx->isHidden(); + return withHidden || !idx->isHidden(); }); tmp.openArray(); From 9bcc737e2d700b4742d576651a98f9eb8573e7dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Tue, 4 Dec 2018 22:05:26 +0100 Subject: [PATCH 03/31] last changes --- arangod/Indexes/Index.cpp | 22 +- arangod/Indexes/Index.h | 2 +- arangod/MMFiles/MMFilesCollection.cpp | 2 +- arangod/RocksDBEngine/RocksDBCollection.cpp | 59 ++- arangod/RocksDBEngine/RocksDBEngine.cpp | 1 - arangod/RocksDBEngine/RocksDBIndex.cpp | 337 ++++++++++++++---- arangod/RocksDBEngine/RocksDBIndex.h | 5 +- arangod/RocksDBEngine/RocksDBMethods.cpp | 38 ++ arangod/RocksDBEngine/RocksDBMethods.h | 34 ++ .../RocksDBEngine/RocksDBTransactionState.cpp | 9 +- .../RocksDBEngine/RocksDBTransactionState.h | 3 +- arangod/VocBase/LogicalCollection.cpp | 2 +- 12 files changed, 417 insertions(+), 97 deletions(-) diff --git a/arangod/Indexes/Index.cpp b/arangod/Indexes/Index.cpp index 78aa8deea693..c2f62f1e0083 100644 --- a/arangod/Indexes/Index.cpp +++ b/arangod/Indexes/Index.cpp @@ -220,32 +220,32 @@ void Index::validateFields(VPackSlice const& slice) { /// @brief return the index type based on a type name Index::IndexType Index::type(char const* type, size_t len) { - if (::strncmp(type, "primary", len) == 0) { + if (7 == len && ::strncmp(type, "primary", len) == 0) { return TRI_IDX_TYPE_PRIMARY_INDEX; } - if (::strncmp(type, "edge", len) == 0) { + if (4 == len && ::strncmp(type, "edge", len) == 0) { return TRI_IDX_TYPE_EDGE_INDEX; } - if (::strncmp(type, "hash", len) == 0) { + if (4 == len && ::strncmp(type, "hash", len) == 0) { return TRI_IDX_TYPE_HASH_INDEX; } - if (::strncmp(type, "skiplist", len) == 0) { + if (8 == len && ::strncmp(type, "skiplist", len) == 0) { return TRI_IDX_TYPE_SKIPLIST_INDEX; } - if (::strncmp(type, "persistent", len) == 0 || - ::strncmp(type, "rocksdb", len) == 0) { + if ((10 == len && ::strncmp(type, "persistent", len) == 0) || + (7 == len && ::strncmp(type, "rocksdb", len) == 0)) { return TRI_IDX_TYPE_PERSISTENT_INDEX; } - if (::strncmp(type, "fulltext", len) == 0) { + if (8 == len && ::strncmp(type, "fulltext", len) == 0) { return TRI_IDX_TYPE_FULLTEXT_INDEX; } - if (::strncmp(type, "geo", len) == 0) { + if (3 == len && ::strncmp(type, "geo", len) == 0) { return TRI_IDX_TYPE_GEO_INDEX; } - if (::strncmp(type, "geo1", len) == 0) { + if (4 == len && ::strncmp(type, "geo1", len) == 0) { return TRI_IDX_TYPE_GEO1_INDEX; } - if (::strncmp(type, "geo2", len) == 0) { + if (4 == len && ::strncmp(type, "geo2", len) == 0) { return TRI_IDX_TYPE_GEO2_INDEX; } #ifdef USE_IRESEARCH @@ -253,7 +253,7 @@ Index::IndexType Index::type(char const* type, size_t len) { return TRI_IDX_TYPE_IRESEARCH_LINK; } #endif - if (::strcmp(type, "noaccess") == 0) { + if (8 == len && ::strcmp(type, "noaccess") == 0) { return TRI_IDX_TYPE_NO_ACCESS_INDEX; } diff --git a/arangod/Indexes/Index.h b/arangod/Indexes/Index.h index f74012ed2033..18a945838655 100644 --- a/arangod/Indexes/Index.h +++ 
b/arangod/Indexes/Index.h @@ -278,7 +278,7 @@ class Index { /// @brief serialize selectivity estimates Estimates = 4, /// @brief serialize object ids for persistence - ObjectId = 8, + Internals = 8, }; /// @brief helper for building flags diff --git a/arangod/MMFiles/MMFilesCollection.cpp b/arangod/MMFiles/MMFilesCollection.cpp index f42e2c4a5010..7a7ed3e0a881 100644 --- a/arangod/MMFiles/MMFilesCollection.cpp +++ b/arangod/MMFiles/MMFilesCollection.cpp @@ -2309,7 +2309,7 @@ int MMFilesCollection::saveIndex(transaction::Methods* trx, std::shared_ptr builder; try { - builder = idx->toVelocyPack(Index::makeFlags(Index::Serialize::ObjectId)); + builder = idx->toVelocyPack(Index::makeFlags(Index::Serialize::Internals)); } catch (arangodb::basics::Exception const& ex) { LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << "cannot save index definition: " << ex.what(); diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index b6da94fb5a76..7465403db659 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -326,10 +326,14 @@ std::shared_ptr RocksDBCollection::createIndex( bool& created) { TRI_ASSERT(info.isObject()); + AccessMode::Type type = AccessMode::Type::WRITE; +// if (!info.get("isBackground").getBool()) { +// AccessMode::Type type = AccessMode::Type::EXCLUSIVE; +// } +// SingleCollectionTransaction trx( // prevent concurrent dropping transaction::StandaloneContext::Create(_logicalCollection.vocbase()), - _logicalCollection, - AccessMode::Type::WRITE); + _logicalCollection, type); Result res = trx.begin(); if (!res.ok()) { @@ -366,21 +370,48 @@ std::shared_ptr RocksDBCollection::createIndex( } } - addIndex(idx); // add index to list - indexGuard.unlock(); // toVelocyPackIgnore needs the read-lock - // Step 3. add index to collection info if (!engine->inRecovery()) { - auto builder = _logicalCollection.toVelocyPackIgnore( - {"path", "statusString"}, true, /*forPersistence*/ true); - res = engine->writeCreateCollectionMarker(_logicalCollection.vocbase().id(), - _logicalCollection.id(), - builder.slice(), - RocksDBLogValue::Empty()); + // read collection info from database + RocksDBKey key; + key.constructCollection(_logicalCollection.vocbase().id(), _logicalCollection.id()); + rocksdb::PinnableSlice value; + rocksdb::Status s = engine->db()->Get(rocksdb::ReadOptions(), + RocksDBColumnFamily::definitions(), + key.string(), &value); + if (!s.ok()) { + res.reset(rocksutils::convertStatus(s)); + } else { + VPackBuilder builder; + builder.openObject(); + for (auto const& pair : VPackObjectIterator(VPackSlice(value.data()))) { + if (pair.key.isEqualString("indexes")) { + VPackArrayBuilder arrGuard(&builder, "indexes"); + builder.add(VPackArrayIterator(pair.value)); + idx->toVelocyPack(builder, Index::makeFlags(Index::Serialize::Internals)); + continue; + } + builder.add(pair.key); + builder.add(pair.value); + } + builder.close(); + res = engine->writeCreateCollectionMarker(_logicalCollection.vocbase().id(), + _logicalCollection.id(), + builder.slice(), + RocksDBLogValue::Empty()); + } + } + + // Step 4. fill index + if (res.ok()) { + addIndex(idx); // add index to indexes list + RocksDBIndex* ridx = static_cast(idx.get()); + res = ridx->fillIndex(trx, [&] { + indexGuard.unlock(); // will be called at appropriate moment + }); } - RocksDBIndex* ridx = static_cast(idx.get()); - res = ridx->fillIndex(trx); + // Step 5. 
cleanup if (res.ok()) { // we need to sync the selectivity estimates res = engine->settingsManager()->sync(false); @@ -403,7 +434,7 @@ std::shared_ptr RocksDBCollection::createIndex( auto builder = _logicalCollection.toVelocyPackIgnore( {"path", "statusString"}, true, /*forPersistence*/ true); VPackBuilder indexInfo; - idx->toVelocyPack(indexInfo, Index::makeFlags(Index::Serialize::ObjectId)); + idx->toVelocyPack(indexInfo, Index::makeFlags(Index::Serialize::Internals)); res = engine->writeCreateCollectionMarker( _logicalCollection.vocbase().id(), _logicalCollection.id(), diff --git a/arangod/RocksDBEngine/RocksDBEngine.cpp b/arangod/RocksDBEngine/RocksDBEngine.cpp index 66c1662180f1..334df177b3c4 100644 --- a/arangod/RocksDBEngine/RocksDBEngine.cpp +++ b/arangod/RocksDBEngine/RocksDBEngine.cpp @@ -1134,7 +1134,6 @@ int RocksDBEngine::writeCreateCollectionMarker(TRI_voc_tick_t databaseId, auto value = RocksDBValue::Collection(slice); rocksdb::WriteOptions wo; - // Write marker + key into RocksDB inside one batch rocksdb::WriteBatch batch; if (logValue.slice().size() > 0) { diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index 03309a250a18..3e768ae42563 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -169,10 +169,11 @@ void RocksDBIndex::unload() { void RocksDBIndex::toVelocyPack(VPackBuilder& builder, std::underlying_type::type flags) const { Index::toVelocyPack(builder, flags); - if (Index::hasFlag(flags, Index::Serialize::ObjectId)) { + if (Index::hasFlag(flags, Index::Serialize::Internals)) { // If we store it, it cannot be 0 TRI_ASSERT(_objectId != 0); builder.add("objectId", VPackValue(std::to_string(_objectId))); + builder.add(StaticStrings::IndexIsBuilding, VPackValue(isBuilding())); } } @@ -354,27 +355,27 @@ void RocksDBIndex::setEstimator(std::unique_ptr -static arangodb::Result fillIndex(transaction::Methods& trx, - RocksDBIndex* ridx, - RocksDBCollection* coll, - WriteBatchType& batch) { +/// Background index filler task +static arangodb::Result fillIndexBackgroundNonUnique(transaction::Methods& trx, + RocksDBIndex* ridx, + RocksDBCollection* coll, + std::function const& unlock) { auto state = RocksDBTransactionState::toState(&trx); arangodb::Result res; // fillindex can be non transactional, we just need to clean up RocksDBEngine* engine = rocksutils::globalRocksEngine(); - bool const assumeExclusive = engine->inRecovery(); rocksdb::DB* rootDB = engine->db()->GetRootDB(); TRI_ASSERT(rootDB != nullptr); uint64_t numDocsWritten = 0; + // write batch will be reset every x documents - MethodsType batched(state, &batch); + rocksdb::WriteBatch batch; + RocksDBBatchedMethods batched(state, &batch); auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); - rocksdb::Slice upper(bounds.end()); - + rocksdb::Slice upper(bounds.end()); // exclusive upper bound rocksdb::Status s; rocksdb::WriteOptions wo; wo.disableWAL = false; // TODO set to true eventually @@ -388,29 +389,39 @@ static arangodb::Result fillIndex(transaction::Methods& trx, rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); std::unique_ptr it(rootDB->NewIterator(ro, docCF)); - + it->SeekForPrev(bounds.end()); + it->Prev(); + if (!it->Valid() || it->key().compare(bounds.start()) < 0) { + return res; + } + std::string lastKey(it->key().data(), it->key().size()); // inclusive + unlock(); // release lock + // empty transaction used to lock the keys rocksdb::TransactionOptions to; to.lock_timeout = 100; // 100ms 
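// [Editor's note, not part of the patch] The hunk around this point probes
// per-key locks through an otherwise empty rocksdb::Transaction: GetForUpdate()
// with a short lock_timeout fails fast when a concurrent writer holds the key,
// and such keys are parked in toRevisit for a second pass. A minimal sketch of
// that pattern, assuming hypothetical `db` (rocksdb::TransactionDB*), `cf`
// (rocksdb::ColumnFamilyHandle*) and `key` (rocksdb::Slice) variables:
//
//   rocksdb::WriteOptions wo;
//   rocksdb::TransactionOptions to;
//   to.lock_timeout = 100;  // milliseconds; fail fast instead of blocking
//   std::unique_ptr<rocksdb::Transaction> trx(db->BeginTransaction(wo, to));
//   rocksdb::PinnableSlice val;
//   rocksdb::Status s =
//       trx->GetForUpdate(rocksdb::ReadOptions(), cf, key, &val, /*exclusive*/ false);
//   if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) {
//     // key is locked by a concurrent transaction: defer it and revisit later
//   } else if (s.ok()) {
//     // key is now locked by us until the next BeginTransaction()/Commit()
//   }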
- std::unique_ptr rtrx(engine->db()->BeginTransaction(wo)); + std::unique_ptr rtrx(engine->db()->BeginTransaction(wo, to)); std::vector toRevisit; toRevisit.reserve(1024); it->Seek(bounds.start()); - while (it->Valid() && it->key().compare(upper) < 0) { + while (true) { // it->Valid() && it->key().compare(lastKey) <= 0 + bool vv = it->Valid(); + int kk = it->key().compare(lastKey); + if (!vv || kk > 0) { + break; + } bool skipKey = false; - if (!assumeExclusive) { - rocksdb::PinnableSlice slice; - s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); - if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { - LOG_DEVEL << "was not able to lock key"; - toRevisit.push_back(RocksDBKey::documentId(it->key())); - skipKey = true; - } else if (s.IsNotFound()) { // deleted while we were looking - skipKey = true; - } + rocksdb::PinnableSlice slice; + s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); + if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { + LOG_DEVEL << "was not able to lock key"; + toRevisit.push_back(RocksDBKey::documentId(it->key())); + skipKey = true; + } else if (s.IsNotFound()) { // deleted while we were looking + skipKey = true; } if (!skipKey) { @@ -436,14 +447,145 @@ static arangodb::Result fillIndex(transaction::Methods& trx, it->Next(); } - if (res.ok() && batch.GetWriteBatch()->Count() > 0) { + + if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // write out remaining keys s = rootDB->Write(wo, batch.GetWriteBatch()); if (!s.ok()) { res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); } } - batch.Clear(); + if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys + to.lock_timeout = 5000; // longer timeout to increase the odds + engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys + RocksDBKey key; + + for (LocalDocumentId const& doc : toRevisit) { + key.constructDocument(coll->objectId(), doc); + + rocksdb::PinnableSlice slice; + s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + res = ridx->insertInternal(&trx, &batched, doc, + VPackSlice(slice.data()), + Index::OperationMode::normal); + if (res.fail()) { + break; + } + + numDocsWritten++; + } + + if (res.ok() && batch.GetWriteBatch()->Count() > 0) { + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + } + + // we will need to remove index elements created before an error + // occurred, this needs to happen since we are non transactional + if (res.fail()) { + RocksDBKeyBounds bounds = ridx->getBounds(); + arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, + true, /*useRangeDel*/numDocsWritten > 25000); + if (res2.fail()) { + LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " + << "index creation: " << res2.errorMessage(); + } + } + + return res; +} + +// Background index filler task +static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, + RocksDBIndex* ridx, + RocksDBCollection* coll, + std::function const& unlock) { + auto state = RocksDBTransactionState::toState(&trx); + arangodb::Result res; + + // fillindex can be non transactional, we just need to clean up + RocksDBEngine* engine = rocksutils::globalRocksEngine(); + rocksdb::DB* rootDB = engine->db()->GetRootDB(); + TRI_ASSERT(rootDB != nullptr); + + uint64_t numDocsWritten = 0; + + auto 
bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); + rocksdb::Slice upper(bounds.end()); // exclusive upper bound + rocksdb::Status s; + rocksdb::WriteOptions wo; + wo.disableWAL = false; // TODO set to true eventually + + // we iterator without a snapshot + rocksdb::ReadOptions ro; + ro.prefix_same_as_start = true; + ro.iterate_upper_bound = &upper; + ro.verify_checksums = false; + ro.fill_cache = false; + + rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); + std::unique_ptr it(rootDB->NewIterator(ro, docCF)); + + it->SeekForPrev(bounds.end()); + it->Prev(); + if (!it->Valid() || it->key().compare(bounds.start()) < 0) { + return res; + } + std::string lastKey(it->key().data(), it->key().size()); // inclusive + unlock(); // release lock + + // empty transaction used to lock the keys + rocksdb::TransactionOptions to; + to.lock_timeout = 100; // 100ms + std::unique_ptr rtrx(engine->db()->BeginTransaction(wo, to)); + RocksDBSubTrxMethods batched(state, rtrx.get()); + + std::vector toRevisit; + toRevisit.reserve(1024); + + it->Seek(bounds.start()); + while (it->Valid() && it->key().compare(lastKey) <= 0) { + + bool skipKey = false; + rocksdb::PinnableSlice slice; + s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); + if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { + LOG_DEVEL << "was not able to lock key"; + toRevisit.push_back(RocksDBKey::documentId(it->key())); + skipKey = true; + } else if (s.IsNotFound()) { // deleted while we were looking + skipKey = true; + } + + if (!skipKey) { + res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), + VPackSlice(it->value().data()), + Index::OperationMode::normal); + if (res.fail()) { + break; + } + numDocsWritten++; + } + + if (numDocsWritten % 200 == 0) { // commit buffered writes + s = rtrx->Commit(); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction + } + + it->Next(); + } + if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys to.lock_timeout = 5000; // longer timeout to increase the odds engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys @@ -469,29 +611,94 @@ static arangodb::Result fillIndex(transaction::Methods& trx, } } -// rocksdb::WriteOptions wo; -// -// bool hasMore = true; -// while (hasMore && res.ok()) { -// hasMore = it->nextDocument(cb, 250); -// -// if (TRI_VOC_COL_STATUS_DELETED == it->collection()->status() -// || it->collection()->deleted()) { -// res = TRI_ERROR_INTERNAL; -// } else if (application_features::ApplicationServer::isStopping()) { -// res = TRI_ERROR_SHUTTING_DOWN; -// } -// -// if (res.ok()) { -// rocksdb::Status s = db->Write(wo, batch.GetWriteBatch()); -// if (!s.ok()) { -// res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); -// break; -// } -// } -// -// batch.Clear(); -// } + // now actually write all remaining index keys + if (res.ok() && rtrx->GetNumPuts() > 0) { + s = rtrx->Commit(); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + + // we will need to remove index elements created before an error + // occurred, this needs to happen since we are non transactional + if (res.fail()) { + RocksDBKeyBounds bounds = ridx->getBounds(); + arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, + true, /*useRangeDel*/numDocsWritten > 25000); + if (res2.fail()) { + LOG_TOPIC(WARN, Logger::ENGINES) 
<< "was not able to roll-back " + << "index creation: " << res2.errorMessage(); + } + } + + return res; +} + +// fast mode assuming exclusive access +template +static arangodb::Result fillIndexFast(transaction::Methods& trx, + RocksDBIndex* ridx, + RocksDBCollection* coll, + WriteBatchType& batch) { + auto state = RocksDBTransactionState::toState(&trx); + arangodb::Result res; + + // fillindex can be non transactional, we just need to clean up + RocksDBEngine* engine = rocksutils::globalRocksEngine(); + rocksdb::DB* rootDB = engine->db()->GetRootDB(); + TRI_ASSERT(rootDB != nullptr); + + uint64_t numDocsWritten = 0; + // write batch will be reset every x documents + MethodsType batched(state, &batch); + + auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); + rocksdb::Slice upper(bounds.end()); + + rocksdb::Status s; + rocksdb::WriteOptions wo; + wo.disableWAL = false; // TODO set to true eventually + + // we iterator without a snapshot + rocksdb::ReadOptions ro; + ro.prefix_same_as_start = true; + ro.iterate_upper_bound = &upper; + ro.verify_checksums = false; + ro.fill_cache = false; + + rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); + std::unique_ptr it(rootDB->NewIterator(ro, docCF)); + + it->Seek(bounds.start()); + while (it->Valid() && it->key().compare(upper) < 0) { + + res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), + VPackSlice(it->value().data()), + Index::OperationMode::normal); + if (res.fail()) { + break; + } + numDocsWritten++; + + if (numDocsWritten % 200 == 0) { // commit buffered writes + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + batch.Clear(); + } + + it->Next(); + } + + if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + batch.Clear(); // we will need to remove index elements created before an error // occurred, this needs to happen since we are non transactional @@ -510,26 +717,34 @@ static arangodb::Result fillIndex(transaction::Methods& trx, /// non-transactional: fill index with existing documents /// from this collection -arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx) { +arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx, + std::function const& unlock) { TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE)); _isBuilding.store(true, std::memory_order_release); -// std::unique_ptr it(new RocksDBAllIndexIterator(&_collection, trx, primaryIndex())); RocksDBCollection* coll = static_cast(_collection.getPhysical()); - auto guard = scopeGuard([this] { + auto guard = scopeGuard([&] { _isBuilding.store(false, std::memory_order_release); + unlock(); // we do not need the index lock anymore }); - if (this->unique()) { - // unique index. we need to keep track of all our changes because we need to avoid - // duplicate index keys. must therefore use a WriteBatchWithIndex - rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); - return ::fillIndex(trx, this, coll, batch); + if (trx.state()->isOnlyExclusiveTransaction()) { + if (this->unique()) { + // unique index. we need to keep track of all our changes because we need to avoid + // duplicate index keys. 
must therefore use a WriteBatchWithIndex + rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); + return ::fillIndexFast(trx, this, coll, batch); + } else { + // non-unique index. all index keys will be unique anyway because they contain the document id + // we can therefore get away with a cheap WriteBatch + rocksdb::WriteBatch batch(32 * 1024 * 1024); + return ::fillIndexFast(trx, this, coll, batch); + } } else { - // non-unique index. all index keys will be unique anyway because they contain the document id - // we can therefore get away with a cheap WriteBatch - rocksdb::WriteBatch batch(32 * 1024 * 1024); - return ::fillIndex(trx, this, coll, batch); + if (this->unique()) { + return ::fillIndexBackgroundUnique(trx, this, coll, unlock); + } else { + return ::fillIndexBackgroundNonUnique(trx, this, coll, unlock); + } } - return Result(); } diff --git a/arangod/RocksDBEngine/RocksDBIndex.h b/arangod/RocksDBEngine/RocksDBIndex.h index 0dd01a93bcc3..28ca7fa0134c 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.h +++ b/arangod/RocksDBEngine/RocksDBIndex.h @@ -145,7 +145,10 @@ class RocksDBIndex : public Index { virtual void setEstimator(std::unique_ptr>); virtual void recalculateEstimates() {} - arangodb::Result fillIndex(transaction::Methods&); + /// @brief fill the index + /// @param unlock will be called when the index lock can be released + arangodb::Result fillIndex(transaction::Methods&, + std::function const& unlock); protected: RocksDBIndex( diff --git a/arangod/RocksDBEngine/RocksDBMethods.cpp b/arangod/RocksDBEngine/RocksDBMethods.cpp index 418657c441a8..819ff963b48e 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.cpp +++ b/arangod/RocksDBEngine/RocksDBMethods.cpp @@ -383,3 +383,41 @@ std::unique_ptr RocksDBBatchedWithIndexMethods::NewIterator( return std::unique_ptr( _wb->NewIteratorWithBase(_db->NewIterator(ro, cf))); } + +// =================== RocksDBSubTrxMethods ==================== + +/// transaction wrapper, uses the current rocksdb transaction and non-tracking methods +RocksDBSubTrxMethods::RocksDBSubTrxMethods(RocksDBTransactionState* state, + rocksdb::Transaction* trx) + : RocksDBMethods(state), _trx(trx) {} + +rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, + std::string* val) { + TRI_ASSERT(cf != nullptr); + rocksdb::ReadOptions const& ro = _state->_rocksReadOptions; + return _trx->Get(ro, cf, key, val); +} + +rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, + rocksdb::PinnableSlice* val) { + TRI_ASSERT(cf != nullptr); + rocksdb::ReadOptions const& ro = _state->_rocksReadOptions; + return _trx->Get(ro, cf, key, val); +} + +rocksdb::Status RocksDBSubTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf, + RocksDBKey const& key, rocksdb::Slice const& val) { + TRI_ASSERT(cf != nullptr); + return _trx->Put(cf, key.string(), val); +} +rocksdb::Status RocksDBSubTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf, + RocksDBKey const& key) { + TRI_ASSERT(cf != nullptr); + return _trx->Delete(cf, key.string()); +} + +rocksdb::Status RocksDBSubTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf, + RocksDBKey const& key) { + TRI_ASSERT(cf != nullptr); + return _trx->SingleDelete(cf, key.string()); +} diff --git a/arangod/RocksDBEngine/RocksDBMethods.h b/arangod/RocksDBEngine/RocksDBMethods.h index 10f07b2d194e..2df984f39aae 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.h +++ b/arangod/RocksDBEngine/RocksDBMethods.h @@ -74,6 +74,9 @@ class 
RocksDBMethods { /// @brief current sequence number rocksdb::SequenceNumber sequenceNumber(); + + /// @brief read options for use with iterators + rocksdb::ReadOptions readOptions(); /// @brief read options for use with iterators rocksdb::ReadOptions iteratorReadOptions(); @@ -245,6 +248,37 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods { rocksdb::TransactionDB* _db; rocksdb::WriteBatchWithIndex* _wb; }; + +/// transaction wrapper, uses the current rocksdb transaction and non-tracking methods +class RocksDBSubTrxMethods final : public RocksDBMethods { +public: + explicit RocksDBSubTrxMethods(RocksDBTransactionState* state, + rocksdb::Transaction* trx); + + rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key, + std::string* val) override; + rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key, + rocksdb::PinnableSlice* val) override; + rocksdb::Status Put( + rocksdb::ColumnFamilyHandle*, RocksDBKey const& key, + rocksdb::Slice const& val) override; + rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, + RocksDBKey const& key) override; + rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, + RocksDBKey const&) override; + + std::unique_ptr NewIterator(rocksdb::ReadOptions const&, + rocksdb::ColumnFamilyHandle*) override { + return nullptr; + } + + void SetSavePoint() override {} + rocksdb::Status RollbackToSavePoint() override { return rocksdb::Status::OK(); } + void PopSavePoint() override {} + +private: + rocksdb::Transaction* _trx; +}; // INDEXING MAY ONLY BE DISABLED IN TOPLEVEL AQL TRANSACTIONS // THIS IS BECAUSE THESE TRANSACTIONS WILL EITHER READ FROM diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.cpp b/arangod/RocksDBEngine/RocksDBTransactionState.cpp index 339be6b0b899..71e52e6be6f0 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.cpp +++ b/arangod/RocksDBEngine/RocksDBTransactionState.cpp @@ -213,8 +213,8 @@ void RocksDBTransactionState::createTransaction() { _rocksTransaction->GetState() == rocksdb::Transaction::COMMITED || (_rocksTransaction->GetState() == rocksdb::Transaction::STARTED && _rocksTransaction->GetNumKeys() == 0)); - _rocksTransaction = - db->BeginTransaction(_rocksWriteOptions, trxOpts, _rocksTransaction); + rocksdb::WriteOptions wo; + _rocksTransaction = db->BeginTransaction(wo, trxOpts, _rocksTransaction); // add transaction begin marker if (!hasHint(transaction::Hints::Hint::SINGLE_OPERATION)) { @@ -313,8 +313,9 @@ arangodb::Result RocksDBTransactionState::internalCommit() { // we do this only for Windows here, because all other platforms use the // RocksDB SyncThread to do the syncing if (waitForSync()) { - _rocksWriteOptions.sync = true; - _rocksTransaction->SetWriteOptions(_rocksWriteOptions); + rocksdb::WriteOptions wo; + wo.sync = true; + _rocksTransaction->SetWriteOptions(wo); } #endif diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.h b/arangod/RocksDBEngine/RocksDBTransactionState.h index 1f18d7aea0b2..82e72b656b23 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.h +++ b/arangod/RocksDBEngine/RocksDBTransactionState.h @@ -74,6 +74,7 @@ class RocksDBTransactionState final : public TransactionState { friend class RocksDBTrxUntrackedMethods; friend class RocksDBBatchedMethods; friend class RocksDBBatchedWithIndexMethods; + friend class RocksDBSubTrxMethods; public: RocksDBTransactionState( @@ -185,8 +186,6 @@ class RocksDBTransactionState final : public TransactionState { /// @brief used for read-only trx and intermediate commits /// For 
intermediate commits this MUST ONLY be used for iterators
   rocksdb::Snapshot const* _readSnapshot;
-  /// @brief shared write options used
-  rocksdb::WriteOptions _rocksWriteOptions;
   /// @brief shared read options which can be used by operations
   /// For intermediate commits iterators MUST use the _readSnapshot
   rocksdb::ReadOptions _rocksReadOptions;
diff --git a/arangod/VocBase/LogicalCollection.cpp b/arangod/VocBase/LogicalCollection.cpp
index 38ac7cdae71d..de0921a3d011 100644
--- a/arangod/VocBase/LogicalCollection.cpp
+++ b/arangod/VocBase/LogicalCollection.cpp
@@ -649,7 +649,7 @@ arangodb::Result LogicalCollection::appendVelocyPack(
     return (forPersistence || !idx->isHidden());
   };
   if (forPersistence) {
-    flags = Index::makeFlags(Index::Serialize::ObjectId);
+    flags = Index::makeFlags(Index::Serialize::Internals);
   }
   getIndexesVPack(result, flags, filter);

From c57de4fa5f56570e8978af3e535b80165c9f6879 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Tue, 4 Dec 2018 22:23:06 +0100
Subject: [PATCH 04/31] fix a bug

---
 arangod/RocksDBEngine/RocksDBIndex.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp
index 3e768ae42563..eeef95c42556 100644
--- a/arangod/RocksDBEngine/RocksDBIndex.cpp
+++ b/arangod/RocksDBEngine/RocksDBIndex.cpp
@@ -391,7 +391,6 @@ static arangodb::Result fillIndexBackgroundNonUnique(transaction::Methods& trx,
   std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));

   it->SeekForPrev(bounds.end());
-  it->Prev();
   if (!it->Valid() || it->key().compare(bounds.start()) < 0) {
     return res;
   }
@@ -406,12 +405,7 @@ static arangodb::Result fillIndexBackgroundNonUnique(transaction::Methods& trx,
   toRevisit.reserve(1024);

   it->Seek(bounds.start());
-  while (true) { // it->Valid() && it->key().compare(lastKey) <= 0
-    bool vv = it->Valid();
-    int kk = it->key().compare(lastKey);
-    if (!vv || kk > 0) {
-      break;
-    }
+  while (it->Valid() && it->key().compare(lastKey) <= 0) {

     bool skipKey = false;
     rocksdb::PinnableSlice slice;
@@ -534,7 +528,6 @@ static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx,
   std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));

   it->SeekForPrev(bounds.end());
-  it->Prev();
   if (!it->Valid() || it->key().compare(bounds.start()) < 0) {
     return res;
   }

From c79835719129dbba2cfe808414f0feeeb1299591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Wed, 5 Dec 2018 14:28:12 +0100
Subject: [PATCH 05/31] reduce conflicts

---
 arangod/RocksDBEngine/RocksDBCollection.cpp |  12 +-
 arangod/RocksDBEngine/RocksDBIndex.cpp      | 186 ++------------------
 arangod/RocksDBEngine/RocksDBMethods.cpp    |   5 +
 arangod/RocksDBEngine/RocksDBMethods.h      |   2 +
 4 files changed, 31 insertions(+), 174 deletions(-)

diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp
index 7465403db659..07ef389b5d98 100644
--- a/arangod/RocksDBEngine/RocksDBCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollection.cpp
@@ -326,16 +326,18 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
     bool& created) {
   TRI_ASSERT(info.isObject());

+  VPackSlice typeSlice = info.get(StaticStrings::IndexType);
+
   AccessMode::Type type = AccessMode::Type::WRITE;
-//  if (!info.get("isBackground").getBool()) {
-//    AccessMode::Type type = AccessMode::Type::EXCLUSIVE;
-//  }
-//
+#ifdef USE_IRESEARCH
+  if (arangodb::Index::type(typeSlice.copyString()) == Index::IndexType::TRI_IDX_TYPE_IRESEARCH_LINK) {
+    type = AccessMode::Type::EXCLUSIVE; //
iresearch needs exclusive access + } +#endif SingleCollectionTransaction trx( // prevent concurrent dropping transaction::StandaloneContext::Create(_logicalCollection.vocbase()), _logicalCollection, type); Result res = trx.begin(); - if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); } diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index eeef95c42556..8980b02acf55 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -355,152 +355,11 @@ void RocksDBIndex::setEstimator(std::unique_ptr const& unlock) { - auto state = RocksDBTransactionState::toState(&trx); - arangodb::Result res; - - // fillindex can be non transactional, we just need to clean up - RocksDBEngine* engine = rocksutils::globalRocksEngine(); - rocksdb::DB* rootDB = engine->db()->GetRootDB(); - TRI_ASSERT(rootDB != nullptr); - - uint64_t numDocsWritten = 0; - - // write batch will be reset every x documents - rocksdb::WriteBatch batch; - RocksDBBatchedMethods batched(state, &batch); - - auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); - rocksdb::Slice upper(bounds.end()); // exclusive upper bound - rocksdb::Status s; - rocksdb::WriteOptions wo; - wo.disableWAL = false; // TODO set to true eventually - - // we iterator without a snapshot - rocksdb::ReadOptions ro; - ro.prefix_same_as_start = true; - ro.iterate_upper_bound = &upper; - ro.verify_checksums = false; - ro.fill_cache = false; - - rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); - std::unique_ptr it(rootDB->NewIterator(ro, docCF)); - - it->SeekForPrev(bounds.end()); - if (!it->Valid() || it->key().compare(bounds.start()) < 0) { - return res; - } - std::string lastKey(it->key().data(), it->key().size()); // inclusive - unlock(); // release lock - - // empty transaction used to lock the keys - rocksdb::TransactionOptions to; - to.lock_timeout = 100; // 100ms - std::unique_ptr rtrx(engine->db()->BeginTransaction(wo, to)); - std::vector toRevisit; - toRevisit.reserve(1024); - - it->Seek(bounds.start()); - while (it->Valid() && it->key().compare(lastKey) <= 0) { - - bool skipKey = false; - rocksdb::PinnableSlice slice; - s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); - if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { - LOG_DEVEL << "was not able to lock key"; - toRevisit.push_back(RocksDBKey::documentId(it->key())); - skipKey = true; - } else if (s.IsNotFound()) { // deleted while we were looking - skipKey = true; - } - - if (!skipKey) { - res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), - VPackSlice(it->value().data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - numDocsWritten++; - } - - if (numDocsWritten % 200 == 0) { // commit buffered writes - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - batch.Clear(); - engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys - } - - it->Next(); - } - - - if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // write out remaining keys - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - } - } - - if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys - to.lock_timeout = 5000; // longer timeout to increase the odds - engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys - RocksDBKey key; - - for (LocalDocumentId 
const& doc : toRevisit) { - key.constructDocument(coll->objectId(), doc); - - rocksdb::PinnableSlice slice; - s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - res = ridx->insertInternal(&trx, &batched, doc, - VPackSlice(slice.data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - - numDocsWritten++; - } - - if (res.ok() && batch.GetWriteBatch()->Count() > 0) { - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - } - } - } - - // we will need to remove index elements created before an error - // occurred, this needs to happen since we are non transactional - if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); - arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, - true, /*useRangeDel*/numDocsWritten > 25000); - if (res2.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " - << "index creation: " << res2.errorMessage(); - } - } - - return res; -} - // Background index filler task -static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, - RocksDBIndex* ridx, - RocksDBCollection* coll, - std::function const& unlock) { +static arangodb::Result fillIndexBackground(transaction::Methods& trx, + RocksDBIndex* ridx, + RocksDBCollection* coll, + std::function const& unlock) { auto state = RocksDBTransactionState::toState(&trx); arangodb::Result res; @@ -538,6 +397,10 @@ static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, rocksdb::TransactionOptions to; to.lock_timeout = 100; // 100ms std::unique_ptr rtrx(engine->db()->BeginTransaction(wo, to)); + rtrx->SetSnapshot(); + if (!ridx->unique()) { + rtrx->DisableIndexing(); + } RocksDBSubTrxMethods batched(state, rtrx.get()); std::vector toRevisit; @@ -546,26 +409,13 @@ static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, it->Seek(bounds.start()); while (it->Valid() && it->key().compare(lastKey) <= 0) { - bool skipKey = false; - rocksdb::PinnableSlice slice; - s = rtrx->GetForUpdate(ro, docCF, it->key(), &slice, /*exclusive*/false); - if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain()) { - LOG_DEVEL << "was not able to lock key"; - toRevisit.push_back(RocksDBKey::documentId(it->key())); - skipKey = true; - } else if (s.IsNotFound()) { // deleted while we were looking - skipKey = true; - } - - if (!skipKey) { - res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), - VPackSlice(it->value().data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - numDocsWritten++; + res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), + VPackSlice(it->value().data()), + Index::OperationMode::normal); + if (res.fail()) { + break; } + numDocsWritten++; if (numDocsWritten % 200 == 0) { // commit buffered writes s = rtrx->Commit(); @@ -574,6 +424,7 @@ static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, break; } engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction + rtrx->SetSnapshot(); } it->Next(); @@ -582,6 +433,7 @@ static arangodb::Result fillIndexBackgroundUnique(transaction::Methods& trx, if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys to.lock_timeout = 5000; // longer timeout to increase the odds engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys + 
rtrx->SetSnapshot(); RocksDBKey key; for (LocalDocumentId const& doc : toRevisit) { @@ -734,10 +586,6 @@ arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx, return ::fillIndexFast(trx, this, coll, batch); } } else { - if (this->unique()) { - return ::fillIndexBackgroundUnique(trx, this, coll, unlock); - } else { - return ::fillIndexBackgroundNonUnique(trx, this, coll, unlock); - } + return ::fillIndexBackground(trx, this, coll, unlock); } } diff --git a/arangod/RocksDBEngine/RocksDBMethods.cpp b/arangod/RocksDBEngine/RocksDBMethods.cpp index 819ff963b48e..74a35c94625d 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.cpp +++ b/arangod/RocksDBEngine/RocksDBMethods.cpp @@ -421,3 +421,8 @@ rocksdb::Status RocksDBSubTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* TRI_ASSERT(cf != nullptr); return _trx->SingleDelete(cf, key.string()); } + +bool RocksDBSubTrxMethods::DisableIndexing() { + _trx->DisableIndexing(); + return true; +} diff --git a/arangod/RocksDBEngine/RocksDBMethods.h b/arangod/RocksDBEngine/RocksDBMethods.h index 2df984f39aae..15339117538e 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.h +++ b/arangod/RocksDBEngine/RocksDBMethods.h @@ -276,6 +276,8 @@ class RocksDBSubTrxMethods final : public RocksDBMethods { rocksdb::Status RollbackToSavePoint() override { return rocksdb::Status::OK(); } void PopSavePoint() override {} + bool DisableIndexing() override; + private: rocksdb::Transaction* _trx; }; From 47669006fb6c77fa3b62a16c0af7b0e96f937449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 6 Dec 2018 13:16:53 +0100 Subject: [PATCH 06/31] fix background indexing --- arangod/RocksDBEngine/CMakeLists.txt | 1 + arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 391 ++++++++++++++++++ arangod/RocksDBEngine/RocksDBBuilderIndex.h | 141 +++++++ arangod/RocksDBEngine/RocksDBCollection.cpp | 84 ++-- arangod/RocksDBEngine/RocksDBCollection.h | 4 +- arangod/RocksDBEngine/RocksDBGeoIndex.cpp | 1 - arangod/RocksDBEngine/RocksDBIndex.cpp | 250 +---------- arangod/RocksDBEngine/RocksDBIndex.h | 23 +- arangod/RocksDBEngine/RocksDBIndexFactory.cpp | 38 +- .../RocksDBEngine/RocksDBTransactionState.cpp | 14 +- arangod/VocBase/Methods/Indexes.cpp | 4 +- 11 files changed, 608 insertions(+), 343 deletions(-) create mode 100644 arangod/RocksDBEngine/RocksDBBuilderIndex.cpp create mode 100644 arangod/RocksDBEngine/RocksDBBuilderIndex.h diff --git a/arangod/RocksDBEngine/CMakeLists.txt b/arangod/RocksDBEngine/CMakeLists.txt index 9a45522fd518..ca9d96de041f 100644 --- a/arangod/RocksDBEngine/CMakeLists.txt +++ b/arangod/RocksDBEngine/CMakeLists.txt @@ -40,6 +40,7 @@ endif() # add sources for rocksdb engine set(ROCKSDB_SOURCES RocksDBEngine/RocksDBBackgroundThread.cpp + RocksDBEngine/RocksDBBuilderIndex.cpp RocksDBEngine/RocksDBCollection.cpp RocksDBEngine/RocksDBCollectionMeta.cpp RocksDBEngine/RocksDBCommon.cpp diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp new file mode 100644 index 000000000000..f615e3af062b --- /dev/null +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -0,0 +1,391 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2018 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. 
+/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Simon Grätzer +//////////////////////////////////////////////////////////////////////////////// + +#include "RocksDBBuilderIndex.h" +#include "Basics/VelocyPackHelper.h" +#include "RocksDBEngine/RocksDBColumnFamily.h" +#include "RocksDBEngine/RocksDBCollection.h" +#include "RocksDBEngine/RocksDBCommon.h" +#include "RocksDBEngine/RocksDBMethods.h" +#include "RocksDBEngine/RocksDBTransactionState.h" +#include "StorageEngine/EngineSelectorFeature.h" +#include "Transaction/StandaloneContext.h" +#include "Utils/SingleCollectionTransaction.h" +#include "VocBase/LogicalCollection.h" +#include "VocBase/ticks.h" + +#include +#include +#include +#include + +#include + +using namespace arangodb; +using namespace arangodb::rocksutils; + +RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr const& wp) + : RocksDBIndex(wp->id(), *wp->collection(), + wp->fields(), wp->unique(), + wp->sparse(), wp->columnFamily(), 0, false), + _wrapped(wp), _hasError(false) { + TRI_ASSERT(_wrapped); +} + +/// @brief return a VelocyPack representation of the index +void RocksDBBuilderIndex::toVelocyPack(VPackBuilder& builder, + std::underlying_type::type flags) const { + VPackBuilder inner; + _wrapped->toVelocyPack(inner, flags); + TRI_ASSERT(inner.slice().isObject()); + builder.openObject(); // FIXME refactor RocksDBIndex::toVelocyPack !! + builder.add(velocypack::ObjectIterator(inner.slice())); + if (Index::hasFlag(flags, Index::Serialize::Internals)) { + builder.add(StaticStrings::IndexIsBuilding, VPackValue(true)); + } + builder.close(); +} + +/// insert index elements into the specified write batch. +Result RocksDBBuilderIndex::insertInternal(transaction::Methods* trx, RocksDBMethods* mthd, + LocalDocumentId const& documentId, + arangodb::velocypack::Slice const& slice, + OperationMode mode) { + Result r = _wrapped->insertInternal(trx, mthd, documentId, slice, mode); + if (!r.ok() && !_hasError.load(std::memory_order_acquire)) { + _errorResult = r; + _hasError.store(true, std::memory_order_release); + } + return Result(); +} + +//Result RocksDBBuilderIndex::updateInternal(transaction::Methods* trx, RocksDBMethods* mthd, +// LocalDocumentId const& oldDocumentId, +// arangodb::velocypack::Slice const& oldDoc, +// LocalDocumentId const& newDocumentId, +// arangodb::velocypack::Slice const& newDoc, +// OperationMode mode) { +// // It is illegal to call this method on the primary index +// // RocksDBPrimaryIndex must override this method accordingly +// TRI_ASSERT(type() != TRI_IDX_TYPE_PRIMARY_INDEX); +// +// Result res = removeInternal(trx, mthd, oldDocumentId, oldDoc, mode); +// if (!res.ok()) { +// return res; +// } +// return insertInternal(trx, mthd, newDocumentId, newDoc, mode); +//} + +/// remove index elements and put it in the specified write batch. 
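+///
+/// Concurrency note (a sketch of the protocol implemented by the two mutexes
+/// used below, not additional API): a writer removing a document must not
+/// race with the background builder indexing that same document.
+///
+///   writer                               builder
+///   ------                               -------
+///   _removedDocs.insert(id)              skip id if it is in _removedDocs
+///   wait while id is in _lockedDocs      else _lockedDocs.insert(id), index it
+///   forward removal to wrapped index     commit, clear _lockedDocs, notify
+///
+/// Either the builder never indexes the document, or the writer blocks until
+/// the builder's entry is committed and can safely be removed again.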
+Result RocksDBBuilderIndex::removeInternal(transaction::Methods* trx, RocksDBMethods* mthd,
+                                           LocalDocumentId const& documentId,
+                                           arangodb::velocypack::Slice const& slice,
+                                           OperationMode mode) {
+  {
+    std::lock_guard<std::mutex> guard(_removedDocsMutex);
+    _removedDocs.insert(documentId.id());
+  }
+  { // wait for keys to be inserted, so we can remove them again
+    std::unique_lock<std::mutex> guard(_lockedDocsMutex);
+    if (_lockedDocs.find(documentId.id()) != _lockedDocs.end()) {
+      _lockedDocsCond.wait(guard);
+    }
+  }
+
+  Result r = _wrapped->removeInternal(trx, mthd, documentId, slice, mode);
+  if (!r.ok() && !_hasError.load(std::memory_order_acquire)) {
+    _errorResult = r;
+    _hasError.store(true, std::memory_order_release);
+  }
+  return Result();
+}
+
+// Background index filler task
+arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()> const& unlock) {
+  arangodb::Result res;
+
+  // fillIndex can be non transactional, we just need to clean up
+  RocksDBEngine* engine = rocksutils::globalRocksEngine();
+  RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
+  rocksdb::DB* rootDB = engine->db()->GetRootDB();
+  TRI_ASSERT(rootDB != nullptr);
+
+  uint64_t numDocsWritten = 0;
+
+  auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
+  rocksdb::Slice upper(bounds.end()); // exclusive upper bound
+  rocksdb::Status s;
+  rocksdb::WriteOptions wo;
+  wo.disableWAL = false; // TODO set to true eventually
+
+  // create a read-snapshot under the guard
+  rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
+  auto snapGuard = scopeGuard([&] {
+    rootDB->ReleaseSnapshot(snap);
+  });
+  TRI_ASSERT(snap != nullptr);
+
+  rocksdb::ReadOptions ro;
+  ro.snapshot = snap;
+  ro.prefix_same_as_start = true;
+  ro.iterate_upper_bound = &upper;
+  ro.verify_checksums = false;
+  ro.fill_cache = false;
+
+  rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily();
+  std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
+
+  unlock(); // release indexes write lock
+  SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()),
+                                  _collection, AccessMode::Type::WRITE);
+  res = trx.begin();
+  if (res.fail()) {
+    return res;
+  }
+  auto state = RocksDBTransactionState::toState(&trx);
+
+  // transaction used to perform actual indexing
+  rocksdb::TransactionOptions to;
+  to.lock_timeout = 100; // 100ms
+  std::unique_ptr<rocksdb::Transaction> rtrx(engine->db()->BeginTransaction(wo, to));
+  if (this->unique()) {
+    rtrx->SetSnapshot(); // needed for unique index conflict detection
+  } else { // FIXME use PutUntracked
+    rtrx->DisableIndexing(); // we never check for existing index keys
+  }
+  RocksDBSubTrxMethods batched(state, rtrx.get());
+
+  RocksDBIndex* internal = _wrapped.get();
+  TRI_ASSERT(internal != nullptr);
+
+  it->Seek(bounds.start());
+  while (it->Valid() && it->key().compare(upper) < 0) {
+
+    if (_hasError.load(std::memory_order_acquire)) {
+      res = _errorResult; // a Writer got an error
+      break;
+    }
+
+    LocalDocumentId docId = RocksDBKey::documentId(it->key());
+    {
+      std::lock_guard<std::mutex> guard(_removedDocsMutex);
+      if (_removedDocs.find(docId.id()) != _removedDocs.end()) {
+        it->Next(); // advance first: 'continue' must not skip the iterator step
+        continue;
+      }
+      std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
+      _lockedDocs.insert(docId.id()); // must be done under _removedDocsMutex
+    }
+
+    res = internal->insertInternal(&trx, &batched, docId,
+                                   VPackSlice(it->value().data()),
+                                   Index::OperationMode::normal);
+    if (res.fail()) {
+      break;
+    }
+    numDocsWritten++;
+
+    if (numDocsWritten % 200 == 0) { // commit buffered writes
+      s = rtrx->Commit();
+      if (!s.ok()) {
+        res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
+        break;
+      }
+      { // clear all the processed documents
+        std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
+        _lockedDocs.clear();
+        _lockedDocsCond.notify_all();
+      }
+      engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction
+      if (this->unique()) {
+        rtrx->SetSnapshot();
+      }
+    }
+
+    it->Next();
+  }
+//
+//  if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys
+//    to.lock_timeout = 5000; // longer timeout to increase the odds
+//    engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys
+//    rtrx->SetSnapshot();
+//    RocksDBKey key;
+//
+//    for (LocalDocumentId const& doc : toRevisit) {
+//      key.constructDocument(coll->objectId(), doc);
+//
+//      rocksdb::PinnableSlice slice;
+//      s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false);
+//      if (!s.ok()) {
+//        res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
+//        break;
+//      }
+//      res = ridx->insertInternal(&trx, &batched, doc,
+//                                 VPackSlice(slice.data()),
+//                                 Index::OperationMode::normal);
+//      if (res.fail()) {
+//        break;
+//      }
+//
+//      numDocsWritten++;
+//    }
+//  }
+
+  // now actually write all remaining index keys
+  if (res.ok() && rtrx->GetNumPuts() > 0) {
+    s = rtrx->Commit();
+    if (!s.ok()) {
+      res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
+    }
+  }
+
+  res = trx.commit(); // required to commit selectivity estimates
+
+  // clear all the processed documents
+  std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
+  _lockedDocs.clear();
+  _lockedDocsCond.notify_all();
+
+  // we will need to remove index elements created before an error
+  // occurred, this needs to happen since we are non transactional
+//  if (res.fail()) {
+//    RocksDBKeyBounds bounds = internal->getBounds();
+//    arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds,
+//                                                         true, /*useRangeDel*/numDocsWritten > 25000);
+//    if (res2.fail()) {
+//      LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back "
+//        << "index creation: " << res2.errorMessage();
+//    }
+//  }
+
+  return res;
+}
+
+// fast mode assuming exclusive access
+template <typename WriteBatchType, typename MethodsType>
+static arangodb::Result fillIndexFast(transaction::Methods& trx,
+                                      RocksDBIndex* ridx,
+                                      RocksDBCollection* coll,
+                                      WriteBatchType& batch) {
+  auto state = RocksDBTransactionState::toState(&trx);
+  arangodb::Result res;
+
+  // fillIndex can be non transactional, we just need to clean up
+  RocksDBEngine* engine = rocksutils::globalRocksEngine();
+  rocksdb::DB* rootDB = engine->db()->GetRootDB();
+  TRI_ASSERT(rootDB != nullptr);
+
+  uint64_t numDocsWritten = 0;
+  // write batch will be reset every x documents
+  MethodsType batched(state, &batch);
+
+  auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId());
+  rocksdb::Slice upper(bounds.end());
+
+  rocksdb::Status s;
+  rocksdb::WriteOptions wo;
+  wo.disableWAL = false; // TODO set to true eventually
+
+  // we iterate without a snapshot
+  rocksdb::ReadOptions ro;
+  ro.prefix_same_as_start = true;
+  ro.iterate_upper_bound = &upper;
+  ro.verify_checksums = false;
+  ro.fill_cache = false;
+
+  rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily();
+  std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
+
+  it->Seek(bounds.start());
+  while (it->Valid() && it->key().compare(upper) < 0) {
+
+    res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()),
+                               VPackSlice(it->value().data()),
+                               Index::OperationMode::normal);
+    if (res.fail()) {
+      break;
+    }
+    numDocsWritten++;
+
+    if (numDocsWritten % 200
== 0) { // commit buffered writes + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + break; + } + batch.Clear(); + } + + it->Next(); + } + + if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + batch.Clear(); + + // we will need to remove index elements created before an error + // occurred, this needs to happen since we are non transactional + if (res.fail()) { + RocksDBKeyBounds bounds = ridx->getBounds(); + arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, + true, /*useRangeDel*/numDocsWritten > 25000); + if (res2.fail()) { + LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " + << "index creation: " << res2.errorMessage(); + } + } + + return res; +} + +/// non-transactional: fill index with existing documents +/// from this collection +arangodb::Result RocksDBBuilderIndex::fillIndex(std::function const& unlock) { +// TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE)); + +#if 0 + RocksDBCollection* coll = static_cast(_collection.getPhysical()); + unlock(); // we do not need the outer lock + SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()), + _collection, AccessMode::Type::EXCLUSIVE); + res = trx.begin(); + if (!res.ok()) { + THROW_ARANGO_EXCEPTION(res); + } + + if (this->unique()) { + // unique index. we need to keep track of all our changes because we need to avoid + // duplicate index keys. must therefore use a WriteBatchWithIndex + rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); + return ::fillIndexFast(trx, this, coll, batch); + } else { + // non-unique index. all index keys will be unique anyway because they contain the document id + // we can therefore get away with a cheap WriteBatch + rocksdb::WriteBatch batch(32 * 1024 * 1024); + return ::fillIndexFast(trx, this, coll, batch); + } +#endif + return fillIndexBackground(unlock); +} diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.h b/arangod/RocksDBEngine/RocksDBBuilderIndex.h new file mode 100644 index 000000000000..eb163cd6bc94 --- /dev/null +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.h @@ -0,0 +1,141 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Simon Grätzer +//////////////////////////////////////////////////////////////////////////////// + +#ifndef ARANGOD_ROCKSDB_ENGINE_ROCKSDB_BUILDER_INDEX_H +#define ARANGOD_ROCKSDB_ENGINE_ROCKSDB_BUILDER_INDEX_H 1 + +#include "RocksDBEngine/RocksDBIndex.h" + +#include + +namespace arangodb { + +class RocksDBBuilderIndex final : public arangodb::RocksDBIndex { + + public: + /// @brief return a VelocyPack representation of the index + void toVelocyPack(velocypack::Builder& builder, + std::underlying_type::type) const override; + + char const* typeName() const override { + return _wrapped->typeName(); + } + + IndexType type() const override { + return _wrapped->type(); + } + + bool canBeDropped() const override { + return false; // TODO ?! + } + + /// @brief whether or not the index is sorted + bool isSorted() const override { + return _wrapped->isSorted(); + } + + /// @brief if true this index should not be shown externally + bool isHidden() const override { + return true; // do not show building indexes + } + + size_t memory() const override { + return _wrapped->memory(); + } + + int drop() override { + return _wrapped->drop(); + } + + void afterTruncate(TRI_voc_tick_t tick) override { + _wrapped->afterTruncate(tick); + } + + void load() override { + _wrapped->load(); + } + + void unload() override { + _wrapped->unload(); + } + + /// @brief whether or not the index has a selectivity estimate + bool hasSelectivityEstimate() const override { + return false; + } + + /// insert index elements into the specified write batch. + Result insertInternal(transaction::Methods* trx, RocksDBMethods*, + LocalDocumentId const& documentId, + arangodb::velocypack::Slice const&, + OperationMode mode) override; +// +// Result updateInternal(transaction::Methods* trx, RocksDBMethods*, +// LocalDocumentId const& oldDocumentId, +// arangodb::velocypack::Slice const& oldDoc, +// LocalDocumentId const& newDocumentId, +// velocypack::Slice const& newDoc, +// OperationMode mode) override; + + /// remove index elements and put it in the specified write batch. 
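+  /// (Error contract for the mutation hooks in this class: a failure reported
+  /// by the wrapped index is only recorded here. A sketch of the flow,
+  /// assuming a writer thread W and the builder thread B:
+  ///
+  ///   W: insertInternal()/removeInternal() fails in _wrapped
+  ///      -> _errorResult = r, _hasError = true, W itself gets Result()
+  ///   B: the next fill loop iteration sees _hasError -> fillIndex() returns r
+  ///   caller of fillIndex() -> aborts and removes the half-built index
+  ///
+  /// so user operations are never failed by an index that is still building.)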
+ Result removeInternal(transaction::Methods* trx, RocksDBMethods*, + LocalDocumentId const& documentId, + arangodb::velocypack::Slice const&, + OperationMode mode) override; + + RocksDBBuilderIndex(std::shared_ptr const&); + + /// @brief get index estimator, optional + RocksDBCuckooIndexEstimator* estimator() override { + return _wrapped->estimator(); + } + void setEstimator(std::unique_ptr>) override { + TRI_ASSERT(false); + } + void recalculateEstimates() override { + _wrapped->recalculateEstimates(); + } + + /// @brief fill the index + /// @param unlock will be called when the index lock can be released + Result fillIndex(std::function const& unlock); + +private: + + Result fillIndexBackground(std::function const& unlock); + + private: + std::shared_ptr _wrapped; + std::atomic _hasError; + Result _errorResult; + + std::mutex _removedDocsMutex; + std::unordered_set _removedDocs; + + std::mutex _lockedDocsMutex; + std::condition_variable _lockedDocsCond; + std::unordered_set _lockedDocs; +}; +} // namespace arangodb + +#endif diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index 07ef389b5d98..ec457bafd2bf 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -37,6 +37,7 @@ #include "Indexes/IndexIterator.h" #include "RestServer/DatabaseFeature.h" #include "RocksDBEngine/RocksDBPrimaryIndex.h" +#include "RocksDBEngine/RocksDBBuilderIndex.h" #include "RocksDBEngine/RocksDBCommon.h" #include "RocksDBEngine/RocksDBComparator.h" #include "RocksDBEngine/RocksDBEngine.h" @@ -48,16 +49,12 @@ #include "RocksDBEngine/RocksDBSettingsManager.h" #include "RocksDBEngine/RocksDBTransactionCollection.h" #include "RocksDBEngine/RocksDBTransactionState.h" -#include "RocksDBEngine/RocksDBValue.h" #include "StorageEngine/EngineSelectorFeature.h" #include "StorageEngine/StorageEngine.h" -#include "StorageEngine/TransactionState.h" #include "Transaction/Helpers.h" -#include "Transaction/StandaloneContext.h" #include "Utils/CollectionNameResolver.h" #include "Utils/Events.h" #include "Utils/OperationOptions.h" -#include "Utils/SingleCollectionTransaction.h" #include "VocBase/KeyGenerator.h" #include "VocBase/LocalDocumentId.h" #include "VocBase/LogicalCollection.h" @@ -281,13 +278,6 @@ void RocksDBCollection::prepareIndexes( bool droppedIndex = false; for (std::shared_ptr& idx : indexes) { - RocksDBIndex* rtrx = static_cast(idx.get()); - if (rtrx->isBuilding()) { - int res = rtrx->drop(); - TRI_ASSERT(res == TRI_ERROR_NO_ERROR); - droppedIndex = true; - continue; - } addIndex(std::move(idx)); } @@ -325,23 +315,27 @@ std::shared_ptr RocksDBCollection::createIndex( arangodb::velocypack::Slice const& info, bool restore, bool& created) { TRI_ASSERT(info.isObject()); + Result res; - VPackSlice typeSlice = info.get(StaticStrings::IndexType); - - AccessMode::Type type = AccessMode::Type::WRITE; -#ifdef USE_IRESEARCH - if (arangodb::Index::type(typeSlice.copyString()) == Index::IndexType::TRI_IDX_TYPE_IRESEARCH_LINK) { - type = AccessMode::Type::EXCLUSIVE; // iresearch needs exclusive access + // Step 0. 
Lock all the things + TRI_vocbase_t& vocbase = _logicalCollection.vocbase(); + TRI_vocbase_col_status_e status; + res = vocbase.useCollection(&_logicalCollection, status); + if (res.fail()) { + THROW_ARANGO_EXCEPTION(res); } -#endif - SingleCollectionTransaction trx( // prevent concurrent dropping - transaction::StandaloneContext::Create(_logicalCollection.vocbase()), - _logicalCollection, type); - Result res = trx.begin(); - if (!res.ok()) { + auto releaseGuard = scopeGuard([&] { + vocbase.releaseCollection(&_logicalCollection); + }); + res = lockWrite(); // MOVE ?!!! + if (res.fail()) { THROW_ARANGO_EXCEPTION(res); } WRITE_LOCKER(indexGuard, _indexesLock); + auto unlockGuard = scopeGuard([&] { + indexGuard.unlock(); + this->unlockWrite(); + }); // Step 1. Check for matching index std::shared_ptr idx = findIndex(info, _indexes); @@ -372,7 +366,9 @@ std::shared_ptr RocksDBCollection::createIndex( } } - // Step 3. add index to collection info + auto buildIdx = std::make_shared(std::static_pointer_cast(idx)); + + // Step 3. add index to collection entry (for removal after a crash) if (!engine->inRecovery()) { // read collection info from database RocksDBKey key; @@ -390,7 +386,7 @@ std::shared_ptr RocksDBCollection::createIndex( if (pair.key.isEqualString("indexes")) { VPackArrayBuilder arrGuard(&builder, "indexes"); builder.add(VPackArrayIterator(pair.value)); - idx->toVelocyPack(builder, Index::makeFlags(Index::Serialize::Internals)); + buildIdx->toVelocyPack(builder, Index::makeFlags(Index::Serialize::Internals)); continue; } builder.add(pair.key); @@ -406,24 +402,33 @@ std::shared_ptr RocksDBCollection::createIndex( // Step 4. fill index if (res.ok()) { - addIndex(idx); // add index to indexes list - RocksDBIndex* ridx = static_cast(idx.get()); - res = ridx->fillIndex(trx, [&] { - indexGuard.unlock(); // will be called at appropriate moment + _indexes.emplace_back(buildIdx); // add index to indexes list + buildIdx->fillIndex([&] { + unlockGuard.fire(); }); } // Step 5. cleanup if (res.ok()) { - // we need to sync the selectivity estimates + { // swap in actual index + WRITE_LOCKER(indexGuard, _indexesLock); + for (auto& other : _indexes) { + if (other->id() == buildIdx->id()) { + other.swap(idx); + } + } + } + + // we should sync the selectivity estimates res = engine->settingsManager()->sync(false); - if (res.fail()) { // not a deal breaker + if (res.fail()) { // not critical LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " << res.errorMessage(); + res.reset(); } rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); - if (!s.ok()) { // not a deal breaker + if (!s.ok()) { // not critical LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " << s.ToString(); } @@ -432,7 +437,7 @@ std::shared_ptr RocksDBCollection::createIndex( arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase()); #endif - if (!engine->inRecovery()) { + if (!engine->inRecovery()) { // write new collection marker auto builder = _logicalCollection.toVelocyPackIgnore( {"path", "statusString"}, true, /*forPersistence*/ true); VPackBuilder indexInfo; @@ -448,14 +453,10 @@ std::shared_ptr RocksDBCollection::createIndex( ) ); } - - if (res.ok()) { - res = trx.commit(); - } } if (res.fail()) { - // We could not persist the index creation. Better abort + // We could not create the index. Better abort // Remove the Index in the local list again. 
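     // (A sketch of the crash story for this path: the index definition was
     //  persisted above with the "isBuilding" flag, and on a restart
     //  RocksDBIndexFactory::prepareIndexes() drops any definition still
     //  carrying that flag, so a crash mid-build does not leak the index.)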
size_t i = 0; WRITE_LOCKER(guard, _indexesLock); @@ -1610,10 +1611,8 @@ int RocksDBCollection::lockWrite(double timeout) { } /// @brief write unlocks a collection -int RocksDBCollection::unlockWrite() { +void RocksDBCollection::unlockWrite() { _exclusiveLock.unlockWrite(); - - return TRI_ERROR_NO_ERROR; } /// @brief read locks a collection, with a timeout @@ -1664,9 +1663,8 @@ int RocksDBCollection::lockRead(double timeout) { } /// @brief read unlocks a collection -int RocksDBCollection::unlockRead() { +void RocksDBCollection::unlockRead() { _exclusiveLock.unlockRead(); - return TRI_ERROR_NO_ERROR; } // rescans the collection to update document count diff --git a/arangod/RocksDBEngine/RocksDBCollection.h b/arangod/RocksDBEngine/RocksDBCollection.h index 939386ebfc56..ddb474ea446c 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.h +++ b/arangod/RocksDBEngine/RocksDBCollection.h @@ -182,9 +182,9 @@ class RocksDBCollection final : public PhysicalCollection { uint64_t objectId() const { return _objectId; } int lockWrite(double timeout = 0.0); - int unlockWrite(); + void unlockWrite(); int lockRead(double timeout = 0.0); - int unlockRead(); + void unlockRead(); /// recalculte counts for collection in case of failure uint64_t recalculateCounts(); diff --git a/arangod/RocksDBEngine/RocksDBGeoIndex.cpp b/arangod/RocksDBEngine/RocksDBGeoIndex.cpp index b4de6ee0168a..d701025a852c 100644 --- a/arangod/RocksDBEngine/RocksDBGeoIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBGeoIndex.cpp @@ -236,7 +236,6 @@ class RDBNearIterator final : public IndexIterator { geo_index::NearUtils _near; std::unique_ptr _iter; }; -typedef RDBNearIterator LegacyIterator; RocksDBGeoIndex::RocksDBGeoIndex( TRI_idx_iid_t iid, diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index 8980b02acf55..63138114d400 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -72,8 +72,7 @@ RocksDBIndex::RocksDBIndex( _cf(cf), _cache(nullptr), _cachePresent(false), - _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr), - _isBuilding(false) { + _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr){ TRI_ASSERT(cf != nullptr && cf != RocksDBColumnFamily::definitions()); if (_cacheEnabled) { @@ -99,8 +98,7 @@ RocksDBIndex::RocksDBIndex( _cf(cf), _cache(nullptr), _cachePresent(false), - _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr), - _isBuilding(basics::VelocyPackHelper::getBooleanValue(info, StaticStrings::IndexIsBuilding, false)) { + _cacheEnabled(useCache && !collection.system() && CacheManagerFeature::MANAGER != nullptr) { TRI_ASSERT(cf != nullptr && cf != RocksDBColumnFamily::definitions()); if (_cacheEnabled) { @@ -173,7 +171,6 @@ void RocksDBIndex::toVelocyPack(VPackBuilder& builder, // If we store it, it cannot be 0 TRI_ASSERT(_objectId != 0); builder.add("objectId", VPackValue(std::to_string(_objectId))); - builder.add(StaticStrings::IndexIsBuilding, VPackValue(isBuilding())); } } @@ -346,246 +343,3 @@ RocksDBKeyBounds RocksDBIndex::getBounds(Index::IndexType type, THROW_ARANGO_EXCEPTION(TRI_ERROR_NOT_IMPLEMENTED); } } - -RocksDBCuckooIndexEstimator* RocksDBIndex::estimator() { - return nullptr; -} - -void RocksDBIndex::setEstimator(std::unique_ptr>) { - // Nothing to do. 
-} - -// Background index filler task -static arangodb::Result fillIndexBackground(transaction::Methods& trx, - RocksDBIndex* ridx, - RocksDBCollection* coll, - std::function const& unlock) { - auto state = RocksDBTransactionState::toState(&trx); - arangodb::Result res; - - // fillindex can be non transactional, we just need to clean up - RocksDBEngine* engine = rocksutils::globalRocksEngine(); - rocksdb::DB* rootDB = engine->db()->GetRootDB(); - TRI_ASSERT(rootDB != nullptr); - - uint64_t numDocsWritten = 0; - - auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); - rocksdb::Slice upper(bounds.end()); // exclusive upper bound - rocksdb::Status s; - rocksdb::WriteOptions wo; - wo.disableWAL = false; // TODO set to true eventually - - // we iterator without a snapshot - rocksdb::ReadOptions ro; - ro.prefix_same_as_start = true; - ro.iterate_upper_bound = &upper; - ro.verify_checksums = false; - ro.fill_cache = false; - - rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); - std::unique_ptr it(rootDB->NewIterator(ro, docCF)); - - it->SeekForPrev(bounds.end()); - if (!it->Valid() || it->key().compare(bounds.start()) < 0) { - return res; - } - std::string lastKey(it->key().data(), it->key().size()); // inclusive - unlock(); // release lock - - // empty transaction used to lock the keys - rocksdb::TransactionOptions to; - to.lock_timeout = 100; // 100ms - std::unique_ptr rtrx(engine->db()->BeginTransaction(wo, to)); - rtrx->SetSnapshot(); - if (!ridx->unique()) { - rtrx->DisableIndexing(); - } - RocksDBSubTrxMethods batched(state, rtrx.get()); - - std::vector toRevisit; - toRevisit.reserve(1024); - - it->Seek(bounds.start()); - while (it->Valid() && it->key().compare(lastKey) <= 0) { - - res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), - VPackSlice(it->value().data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - numDocsWritten++; - - if (numDocsWritten % 200 == 0) { // commit buffered writes - s = rtrx->Commit(); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction - rtrx->SetSnapshot(); - } - - it->Next(); - } - - if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys - to.lock_timeout = 5000; // longer timeout to increase the odds - engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys - rtrx->SetSnapshot(); - RocksDBKey key; - - for (LocalDocumentId const& doc : toRevisit) { - key.constructDocument(coll->objectId(), doc); - - rocksdb::PinnableSlice slice; - s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - res = ridx->insertInternal(&trx, &batched, doc, - VPackSlice(slice.data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - - numDocsWritten++; - } - } - - // now actually write all remaining index keys - if (res.ok() && rtrx->GetNumPuts() > 0) { - s = rtrx->Commit(); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - } - } - - // we will need to remove index elements created before an error - // occurred, this needs to happen since we are non transactional - if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); - arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, - true, /*useRangeDel*/numDocsWritten > 25000); - if (res2.fail()) { - 
LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " - << "index creation: " << res2.errorMessage(); - } - } - - return res; -} - -// fast mode assuming exclusive access -template -static arangodb::Result fillIndexFast(transaction::Methods& trx, - RocksDBIndex* ridx, - RocksDBCollection* coll, - WriteBatchType& batch) { - auto state = RocksDBTransactionState::toState(&trx); - arangodb::Result res; - - // fillindex can be non transactional, we just need to clean up - RocksDBEngine* engine = rocksutils::globalRocksEngine(); - rocksdb::DB* rootDB = engine->db()->GetRootDB(); - TRI_ASSERT(rootDB != nullptr); - - uint64_t numDocsWritten = 0; - // write batch will be reset every x documents - MethodsType batched(state, &batch); - - auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); - rocksdb::Slice upper(bounds.end()); - - rocksdb::Status s; - rocksdb::WriteOptions wo; - wo.disableWAL = false; // TODO set to true eventually - - // we iterator without a snapshot - rocksdb::ReadOptions ro; - ro.prefix_same_as_start = true; - ro.iterate_upper_bound = &upper; - ro.verify_checksums = false; - ro.fill_cache = false; - - rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); - std::unique_ptr it(rootDB->NewIterator(ro, docCF)); - - it->Seek(bounds.start()); - while (it->Valid() && it->key().compare(upper) < 0) { - - res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), - VPackSlice(it->value().data()), - Index::OperationMode::normal); - if (res.fail()) { - break; - } - numDocsWritten++; - - if (numDocsWritten % 200 == 0) { // commit buffered writes - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - batch.Clear(); - } - - it->Next(); - } - - if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - } - } - batch.Clear(); - - // we will need to remove index elements created before an error - // occurred, this needs to happen since we are non transactional - if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); - arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, - true, /*useRangeDel*/numDocsWritten > 25000); - if (res2.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " - << "index creation: " << res2.errorMessage(); - } - } - - return res; -} - -/// non-transactional: fill index with existing documents -/// from this collection -arangodb::Result RocksDBIndex::fillIndex(transaction::Methods& trx, - std::function const& unlock) { - TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE)); - _isBuilding.store(true, std::memory_order_release); - - RocksDBCollection* coll = static_cast(_collection.getPhysical()); - auto guard = scopeGuard([&] { - _isBuilding.store(false, std::memory_order_release); - unlock(); // we do not need the index lock anymore - }); - - if (trx.state()->isOnlyExclusiveTransaction()) { - if (this->unique()) { - // unique index. we need to keep track of all our changes because we need to avoid - // duplicate index keys. must therefore use a WriteBatchWithIndex - rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); - return ::fillIndexFast(trx, this, coll, batch); - } else { - // non-unique index. 
all index keys will be unique anyway because they contain the document id - // we can therefore get away with a cheap WriteBatch - rocksdb::WriteBatch batch(32 * 1024 * 1024); - return ::fillIndexFast(trx, this, coll, batch); - } - } else { - return ::fillIndexBackground(trx, this, coll, unlock); - } -} diff --git a/arangod/RocksDBEngine/RocksDBIndex.h b/arangod/RocksDBEngine/RocksDBIndex.h index 28ca7fa0134c..0167417b5323 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.h +++ b/arangod/RocksDBEngine/RocksDBIndex.h @@ -65,7 +65,7 @@ class RocksDBIndex : public Index { /// @brief if true this index should not be shown externally virtual bool isHidden() const override { - return isBuilding(); // do not show building indexes + return false; // do not generally hide indexes } size_t memory() const override; @@ -135,20 +135,12 @@ class RocksDBIndex : public Index { static RocksDBKeyBounds getBounds(Index::IndexType type, uint64_t objectId, bool unique); - /// @brief is this index still beeing build - bool isBuilding() const { - return _isBuilding.load(std::memory_order_acquire); - } - /// @brief get index estimator, optional - virtual RocksDBCuckooIndexEstimator* estimator(); - virtual void setEstimator(std::unique_ptr>); + virtual RocksDBCuckooIndexEstimator* estimator() { + return nullptr; + } + virtual void setEstimator(std::unique_ptr>) {} virtual void recalculateEstimates() {} - - /// @brief fill the index - /// @param unlock will be called when the index lock can be released - arangodb::Result fillIndex(transaction::Methods&, - std::function const& unlock); protected: RocksDBIndex( @@ -184,11 +176,6 @@ class RocksDBIndex : public Index { // it's quicker than accessing the shared_ptr each time bool _cachePresent; bool _cacheEnabled; - - private: - - /// is this index currently building - std::atomic _isBuilding; }; } // namespace arangodb diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp index c1b59078501f..96c731880f6c 100644 --- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp +++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp @@ -838,8 +838,8 @@ void RocksDBIndexFactory::prepareIndexes( bool splitEdgeIndex = false; TRI_idx_iid_t last = 0; - for (auto const& v : VPackArrayIterator(indexesSlice)) { - if (arangodb::basics::VelocyPackHelper::getBooleanValue(v, "error", + for (VPackSlice v : VPackArrayIterator(indexesSlice)) { + if (arangodb::basics::VelocyPackHelper::getBooleanValue(v, StaticStrings::Error, false)) { // We have an error here. // Do not add index. @@ -848,15 +848,14 @@ void RocksDBIndexFactory::prepareIndexes( } // check for combined edge index from MMFiles; must split! 
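     // (An MMFiles edge index covers both edge attributes in one definition,
     //  while the RocksDB engine keeps one edge index per attribute. A sketch
     //  of the rewrite performed below, for a hypothetical stored definition:
     //
     //    { "id": "42", "type": "edge", "fields": ["_from", "_to"] }
     //
     //  is split into
     //
     //    { "id": "42", "type": "edge", "fields": ["_from"] }
     //    { "id": "43", "type": "edge", "fields": ["_to"] }
     //
     //  i.e. the "_to" half is registered under the next index id.)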
- auto value = v.get("type"); - - if (value.isString()) { + auto typeSlice = v.get(StaticStrings::IndexType); + if (typeSlice.isString()) { VPackValueLength len; - const char* tmp = value.getStringUnchecked(len); + const char* tmp = typeSlice.getStringUnchecked(len); arangodb::Index::IndexType const type = arangodb::Index::type(tmp, len); if (type == Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX) { - VPackSlice fields = v.get("fields"); + VPackSlice fields = v.get(StaticStrings::IndexFields); if (fields.isArray() && fields.length() == 2) { VPackBuilder from; @@ -864,8 +863,8 @@ void RocksDBIndexFactory::prepareIndexes( from.openObject(); for (auto const& f : VPackObjectIterator(v)) { - if (arangodb::StringRef(f.key) == "fields") { - from.add(VPackValue("fields")); + if (arangodb::StringRef(f.key) == StaticStrings::IndexFields) { + from.add(VPackValue(StaticStrings::IndexFields)); from.openArray(); from.add(VPackValue(StaticStrings::FromString)); from.close(); @@ -880,18 +879,16 @@ void RocksDBIndexFactory::prepareIndexes( VPackBuilder to; to.openObject(); - for (auto const& f : VPackObjectIterator(v)) { - if (arangodb::StringRef(f.key) == "fields") { - to.add(VPackValue("fields")); + if (arangodb::StringRef(f.key) == StaticStrings::IndexFields) { + to.add(VPackValue(StaticStrings::IndexFields)); to.openArray(); to.add(VPackValue(StaticStrings::ToString)); to.close(); - } else if (arangodb::StringRef(f.key) == "id") { + } else if (arangodb::StringRef(f.key) == StaticStrings::IndexId) { auto iid = basics::StringUtils::uint64(f.value.copyString()) + 1; - last = iid; - to.add("id", VPackValue(std::to_string(iid))); + to.add(StaticStrings::IndexId, VPackValue(std::to_string(iid))); } else { to.add(f.key); to.add(f.value); @@ -930,9 +927,9 @@ void RocksDBIndexFactory::prepareIndexes( b.openObject(); for (auto const& f : VPackObjectIterator(v)) { - if (arangodb::StringRef(f.key) == "id") { + if (arangodb::StringRef(f.key) == StaticStrings::IndexId) { last++; - b.add("id", VPackValue(std::to_string(last))); + b.add(StaticStrings::IndexId, VPackValue(std::to_string(last))); } else { b.add(f.key); b.add(f.value); @@ -957,7 +954,6 @@ void RocksDBIndexFactory::prepareIndexes( } auto idx = prepareIndexFromSlice(v, false, col, true); - if (!idx) { LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << "error creating index from definition '" << v.toString() << "'"; @@ -971,6 +967,12 @@ void RocksDBIndexFactory::prepareIndexes( << v.toJson() << "'"; } #endif + + if (basics::VelocyPackHelper::getBooleanValue(v, StaticStrings::IndexIsBuilding, false)) { + LOG_TOPIC(WARN, Logger::ENGINES) << "dropping failed index '" << idx->id() << "'"; + idx->drop(); + continue; + } indexes.emplace_back(std::move(idx)); } diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.cpp b/arangod/RocksDBEngine/RocksDBTransactionState.cpp index 71e52e6be6f0..1b005b7dc183 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.cpp +++ b/arangod/RocksDBEngine/RocksDBTransactionState.cpp @@ -258,7 +258,7 @@ arangodb::Result RocksDBTransactionState::internalCommit() { } Result result; - if (hasOperations()) { + if (hasOperations()) { // might not have ops for fillIndex // we are actually going to attempt a commit if (!hasHint(transaction::Hints::Hint::SINGLE_OPERATION)) { // add custom commit marker to increase WAL tailing reliability @@ -368,15 +368,8 @@ arangodb::Result RocksDBTransactionState::internalCommit() { TRI_ASSERT(_rocksTransaction->GetNumKeys() == 0 && _rocksTransaction->GetNumPuts() == 0 && _rocksTransaction->GetNumDeletes() 
== 0); - - rocksdb::SequenceNumber seq = 0; - if (_rocksTransaction) { - seq = _rocksTransaction->GetSnapshot()->GetSequenceNumber(); - } else { - TRI_ASSERT(_readSnapshot); - seq = _readSnapshot->GetSequenceNumber(); - } - + // this is most likely the fill index case + rocksdb::SequenceNumber seq = _rocksTransaction->GetSnapshot()->GetSequenceNumber(); for (auto& trxColl : _collections) { TRI_IF_FAILURE("RocksDBCommitCounts") { continue; @@ -409,7 +402,6 @@ Result RocksDBTransactionState::commitTransaction(transaction::Methods* activeTr if (_rocksTransaction != nullptr) { res = internalCommit(); } - if (res.ok()) { updateStatus(transaction::Status::COMMITTED); cleanupTransaction(); // deletes trx diff --git a/arangod/VocBase/Methods/Indexes.cpp b/arangod/VocBase/Methods/Indexes.cpp index a5133bd4e893..eb15056a846a 100644 --- a/arangod/VocBase/Methods/Indexes.cpp +++ b/arangod/VocBase/Methods/Indexes.cpp @@ -191,7 +191,7 @@ arangodb::Result Indexes::getAll(LogicalCollection const* collection, ); if (type.isString() && type.compareString("edge") == 0) { - VPackSlice fields = index.get("fields"); + VPackSlice fields = index.get(StaticStrings::IndexFields); TRI_ASSERT(fields.isArray() && fields.length() <= 2); if (fields.length() == 1) { // merge indexes @@ -234,7 +234,7 @@ arangodb::Result Indexes::getAll(LogicalCollection const* collection, if (fields[0].compareString(StaticStrings::FromString) == 0) { continue; } else if (fields[0].compareString(StaticStrings::ToString) == 0) { - merge.add("fields", VPackValue(VPackValueType::Array)); + merge.add(StaticStrings::IndexFields, VPackValue(VPackValueType::Array)); merge.add(VPackValue(StaticStrings::FromString)); merge.add(VPackValue(StaticStrings::ToString)); merge.close(); From b5257126df049d698683e8d63e9b4693dc486833 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 6 Dec 2018 13:30:06 +0100 Subject: [PATCH 07/31] remove unused code --- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 45 +++---------------- 1 file changed, 6 insertions(+), 39 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index f615e3af062b..af2feb40ad78 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -123,6 +123,11 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods* trx, RocksDBMet arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function const& unlock) { arangodb::Result res; + // 1. Index everything under a snapshot iterator (get snapshot under exclusive coll lock) + // 2. Track deleted document IDs so we can avoid indexing them + // 3. Avoid conflicts on unique index keys by using rocksdb::Transaction snapshot conflict checking + // 4. 
Suppress unique constraint violations / conflicts for client drivers
+
   // fillIndex can be non transactional, we just need to clean up
   RocksDBEngine* engine = rocksutils::globalRocksEngine();
   RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
@@ -169,7 +174,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()>
   std::unique_ptr<rocksdb::Transaction> rtrx(engine->db()->BeginTransaction(wo, to));
   if (this->unique()) {
     rtrx->SetSnapshot(); // needed for unique index conflict detection
-  } else { // FIXME use PutUntracked
+  } else {
     rtrx->DisableIndexing(); // we never check for existing index keys
   }
   RocksDBSubTrxMethods batched(state, rtrx.get());
@@ -222,32 +227,6 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()>
     it->Next();
   }
-//
-//  if (res.ok() && !toRevisit.empty()) { // now roll-up skipped keys
-//    to.lock_timeout = 5000; // longer timeout to increase the odds
-//    engine->db()->BeginTransaction(wo, to, rtrx.get()); // release keys
-//    rtrx->SetSnapshot();
-//    RocksDBKey key;
-//
-//    for (LocalDocumentId const& doc : toRevisit) {
-//      key.constructDocument(coll->objectId(), doc);
-//
-//      rocksdb::PinnableSlice slice;
-//      s = rtrx->GetForUpdate(ro, docCF, key.string(), &slice, /*exclusive*/false);
-//      if (!s.ok()) {
-//        res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
-//        break;
-//      }
-//      res = ridx->insertInternal(&trx, &batched, doc,
-//                                 VPackSlice(slice.data()),
-//                                 Index::OperationMode::normal);
-//      if (res.fail()) {
-//        break;
-//      }
-//
-//      numDocsWritten++;
-//    }
-//  }

   // now actually write all remaining index keys
   if (res.ok() && rtrx->GetNumPuts() > 0) {
@@ -264,18 +243,6 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()>
   _lockedDocs.clear();
   _lockedDocsCond.notify_all();

-  // we will need to remove index elements created before an error
-  // occurred, this needs to happen since we are non transactional
-//  if (res.fail()) {
-//    RocksDBKeyBounds bounds = internal->getBounds();
-//    arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds,
-//                                                         true, /*useRangeDel*/numDocsWritten > 25000);
-//    if (res2.fail()) {
-//      LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back "
-//        << "index creation: " << res2.errorMessage();
-//    }
-//  }
-
   return res;
 }

From 0515650984ed619b555f73e71ca3a32512ccc5f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Thu, 6 Dec 2018 14:18:13 +0100
Subject: [PATCH 08/31] fix link creation

---
 arangod/IResearch/IResearchView.cpp         |  4 ++--
 arangod/Indexes/Index.cpp                   | 23 +++++++++++----------
 arangod/RocksDBEngine/RocksDBCollection.cpp |  7 ++++---
 arangod/Transaction/Methods.cpp             | 16 +++++++-------
 arangod/Transaction/Methods.h               |  2 +-
 5 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/arangod/IResearch/IResearchView.cpp b/arangod/IResearch/IResearchView.cpp
index 6ec5012dc552..9a057c3b852b 100644
--- a/arangod/IResearch/IResearchView.cpp
+++ b/arangod/IResearch/IResearchView.cpp
@@ -1001,7 +1001,7 @@ arangodb::Result IResearchView::appendVelocyPackDetailed(
   arangodb::velocypack::ObjectBuilder linksBuilderWrapper(&linksBuilder);

   for (auto& collectionName: state->collectionNames()) {
-    for (auto& index: trx.indexesForCollection(collectionName)) {
+    for (auto& index: trx.indexesForCollection(collectionName, /*withHidden*/true)) {
       if (index && arangodb::Index::IndexType::TRI_IDX_TYPE_IRESEARCH_LINK == index->type()) {
         // TODO FIXME find a better way to retrieve an IResearch Link
         // cannot use
From 0515650984ed619b555f73e71ca3a32512ccc5f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Thu, 6 Dec 2018 14:18:13 +0100
Subject: [PATCH 08/31] fix link creation

---
 arangod/IResearch/IResearchView.cpp         |  4 ++--
 arangod/Indexes/Index.cpp                   | 23 +++++++++++----------
 arangod/RocksDBEngine/RocksDBCollection.cpp |  7 ++++---
 arangod/Transaction/Methods.cpp             | 16 +++++++-------
 arangod/Transaction/Methods.h               |  2 +-
 5 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/arangod/IResearch/IResearchView.cpp b/arangod/IResearch/IResearchView.cpp
index 6ec5012dc552..9a057c3b852b 100644
--- a/arangod/IResearch/IResearchView.cpp
+++ b/arangod/IResearch/IResearchView.cpp
@@ -1001,7 +1001,7 @@ arangodb::Result IResearchView::appendVelocyPackDetailed(
     arangodb::velocypack::ObjectBuilder linksBuilderWrapper(&linksBuilder);
 
     for (auto& collectionName: state->collectionNames()) {
-      for (auto& index: trx.indexesForCollection(collectionName)) {
+      for (auto& index: trx.indexesForCollection(collectionName, /*withHidden*/true)) {
         if (index && arangodb::Index::IndexType::TRI_IDX_TYPE_IRESEARCH_LINK == index->type()) {
           // TODO FIXME find a better way to retrieve an IResearch Link
           // cannot use static_cast/reinterpret_cast since Index is not related to IResearchLink
@@ -2139,4 +2139,4 @@ void IResearchView::verifyKnownCollections() {
 
 // -----------------------------------------------------------------------------
 // --SECTION--                                                       END-OF-FILE
-// -----------------------------------------------------------------------------
\ No newline at end of file
+// -----------------------------------------------------------------------------
diff --git a/arangod/Indexes/Index.cpp b/arangod/Indexes/Index.cpp
index c2f62f1e0083..2173b3a946c6 100644
--- a/arangod/Indexes/Index.cpp
+++ b/arangod/Indexes/Index.cpp
@@ -220,36 +220,37 @@ void Index::validateFields(VPackSlice const& slice) {
 
 /// @brief return the index type based on a type name
 Index::IndexType Index::type(char const* type, size_t len) {
-  if (7 == len && ::strncmp(type, "primary", len) == 0) {
+  if (7 == len && ::memcmp(type, "primary", len) == 0) {
     return TRI_IDX_TYPE_PRIMARY_INDEX;
   }
-  if (4 == len && ::strncmp(type, "edge", len) == 0) {
+  if (4 == len && ::memcmp(type, "edge", len) == 0) {
     return TRI_IDX_TYPE_EDGE_INDEX;
   }
-  if (4 == len && ::strncmp(type, "hash", len) == 0) {
+  if (4 == len && ::memcmp(type, "hash", len) == 0) {
    return TRI_IDX_TYPE_HASH_INDEX;
   }
-  if (8 == len && ::strncmp(type, "skiplist", len) == 0) {
+  if (8 == len && ::memcmp(type, "skiplist", len) == 0) {
     return TRI_IDX_TYPE_SKIPLIST_INDEX;
   }
-  if ((10 == len && ::strncmp(type, "persistent", len) == 0) ||
-      (7 == len && ::strncmp(type, "rocksdb", len) == 0)) {
+  if ((10 == len && ::memcmp(type, "persistent", len) == 0) ||
+      (7 == len && ::memcmp(type, "rocksdb", len) == 0)) {
     return TRI_IDX_TYPE_PERSISTENT_INDEX;
   }
-  if (8 == len && ::strncmp(type, "fulltext", len) == 0) {
+  if (8 == len && ::memcmp(type, "fulltext", len) == 0) {
     return TRI_IDX_TYPE_FULLTEXT_INDEX;
   }
-  if (3 == len && ::strncmp(type, "geo", len) == 0) {
+  if (3 == len && ::memcmp(type, "geo", len) == 0) {
     return TRI_IDX_TYPE_GEO_INDEX;
   }
-  if (4 == len && ::strncmp(type, "geo1", len) == 0) {
+  if (4 == len && ::memcmp(type, "geo1", len) == 0) {
     return TRI_IDX_TYPE_GEO1_INDEX;
   }
-  if (4 == len && ::strncmp(type, "geo2", len) == 0) {
+  if (4 == len && ::memcmp(type, "geo2", len) == 0) {
     return TRI_IDX_TYPE_GEO2_INDEX;
   }
 
 #ifdef USE_IRESEARCH
-  if (arangodb::iresearch::DATA_SOURCE_TYPE.name() == type) {
+  std::string const& tmp = arangodb::iresearch::DATA_SOURCE_TYPE.name();
+  if (tmp.size() == len && ::memcmp(type, tmp.c_str(), len) == 0) {
     return TRI_IDX_TYPE_IRESEARCH_LINK;
   }
 #endif
diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp
index ec457bafd2bf..55b9f3829f57 100644
--- a/arangod/RocksDBEngine/RocksDBCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollection.cpp
@@ -412,9 +412,10 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
   if (res.ok()) {
     { // swap in actual index
       WRITE_LOCKER(indexGuard, _indexesLock);
-      for (auto& other : _indexes) {
-        if (other->id() == buildIdx->id()) {
-          other.swap(idx);
+      for (size_t i = 0; i < _indexes.size(); i++) {
+        if (_indexes[i]->id() == buildIdx->id()) {
+          _indexes[i] = idx;
+          break;
         }
       }
     }
diff --git a/arangod/Transaction/Methods.cpp b/arangod/Transaction/Methods.cpp
index c96727a4347c..f595ae96fd09 100644
--- a/arangod/Transaction/Methods.cpp
+++ b/arangod/Transaction/Methods.cpp
@@ -3235,7 +3235,7 @@ Result transaction::Methods::unlockRecursive(TRI_voc_cid_t cid,
 
 /// @brief get list of indexes for a collection
 std::vector<std::shared_ptr<Index>> transaction::Methods::indexesForCollection(
-    std::string const& collectionName) {
+    std::string const& collectionName, bool withHidden) {
   if (_state->isCoordinator()) {
     return indexesForCollectionCoordinator(collectionName);
   }
@@ -3244,12 +3244,14 @@ std::vector<std::shared_ptr<Index>> transaction::Methods::indexesForCollection(
   TRI_voc_cid_t cid = addCollectionAtRuntime(collectionName);
   LogicalCollection* document = documentCollection(trxCollection(cid));
   std::vector<std::shared_ptr<Index>> indexes = document->getIndexes();
-  auto it = indexes.begin();
-  while (it != indexes.end()) {
-    if ((*it)->isHidden()) {
-      it = indexes.erase(it);
-    } else {
-      it++;
+  if (!withHidden) {
+    auto it = indexes.begin();
+    while (it != indexes.end()) {
+      if ((*it)->isHidden()) {
+        it = indexes.erase(it);
+      } else {
+        it++;
+      }
     }
   }
   return indexes;
diff --git a/arangod/Transaction/Methods.h b/arangod/Transaction/Methods.h
index 9d1fcd1fa006..50fd945cdb10 100644
--- a/arangod/Transaction/Methods.h
+++ b/arangod/Transaction/Methods.h
@@ -423,7 +423,7 @@ class Methods {
 
   /// @brief get all indexes for a collection name
   ENTERPRISE_VIRT std::vector<std::shared_ptr<Index>> indexesForCollection(
-      std::string const&);
+      std::string const&, bool withHidden = false);
 
   /// @brief Lock all collections. Only works for selected sub-classes
   virtual int lockCollections();
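The switch from `strncmp` and `operator==` to length-checked `memcmp` in `Index::type` matters because `type` may point into a VelocyPack string that is not NUL-terminated. A minimal sketch of the safe comparison pattern (the helper name is hypothetical, not part of the patch):

```cpp
#include <cstring>
#include <string>

// Compare a length-delimited, possibly non-NUL-terminated buffer against a
// known name: check the length first, then compare exactly `len` bytes.
bool typeMatches(char const* type, size_t len, std::string const& name) {
  return name.size() == len && ::memcmp(type, name.data(), len) == 0;
}
```

With the length check in front, `typeMatches(p, 3, "geo")` cannot accidentally accept a longer name such as "geo1", and the comparison never reads past `len` bytes of the buffer.
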
From 70ef2f19006ef8b9d6994edb9585cdb02225caa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Thu, 6 Dec 2018 15:02:42 +0100
Subject: [PATCH 09/31] fix unique constraint violations

---
 arangod/RocksDBEngine/RocksDBBuilderIndex.cpp |  6 ++++--
 arangod/RocksDBEngine/RocksDBCollection.cpp   |  2 +-
 arangod/RocksDBEngine/RocksDBIndexFactory.cpp |  2 +-
 arangod/RocksDBEngine/RocksDBMethods.cpp      | 11 ++++++-----
 arangod/RocksDBEngine/RocksDBMethods.h        |  1 +
 5 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
index af2feb40ad78..2ac480573ff6 100644
--- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
+++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
@@ -60,7 +60,7 @@ void RocksDBBuilderIndex::toVelocyPack(VPackBuilder& builder,
   builder.openObject(); // FIXME refactor RocksDBIndex::toVelocyPack !!
   builder.add(velocypack::ObjectIterator(inner.slice()));
   if (Index::hasFlag(flags, Index::Serialize::Internals)) {
-    builder.add(StaticStrings::IndexIsBuilding, VPackValue(true));
+    builder.add("_inprogress", VPackValue(true));
   }
   builder.close();
 }
@@ -236,7 +236,9 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
     }
   }
 
-  res = trx.commit(); // required to commit selectivity estimates
+  if (res.ok()) {
+    res = trx.commit(); // required to commit selectivity estimates
+  }
 
   // clear all the processed documents
   std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp
index 55b9f3829f57..beac732d7818 100644
--- a/arangod/RocksDBEngine/RocksDBCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollection.cpp
@@ -403,7 +403,7 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
   // Step 4. fill index
   if (res.ok()) {
     _indexes.emplace_back(buildIdx); // add index to indexes list
-    buildIdx->fillIndex([&] {
+    res = buildIdx->fillIndex([&] {
       unlockGuard.fire();
     });
   }
diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp
index 96c731880f6c..d2ee4a5440d0 100644
--- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp
+++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp
@@ -968,7 +968,7 @@ void RocksDBIndexFactory::prepareIndexes(
     }
 #endif
 
-    if (basics::VelocyPackHelper::getBooleanValue(v, StaticStrings::IndexIsBuilding, false)) {
+    if (basics::VelocyPackHelper::getBooleanValue(v, "_inprogress", false)) {
      LOG_TOPIC(WARN, Logger::ENGINES) << "dropping failed index '" << idx->id() << "'";
       idx->drop();
       continue;
diff --git a/arangod/RocksDBEngine/RocksDBMethods.cpp b/arangod/RocksDBEngine/RocksDBMethods.cpp
index 74a35c94625d..08778ce1d88e 100644
--- a/arangod/RocksDBEngine/RocksDBMethods.cpp
+++ b/arangod/RocksDBEngine/RocksDBMethods.cpp
@@ -389,20 +389,21 @@ std::unique_ptr<rocksdb::Iterator> RocksDBBatchedWithIndexMethods::NewIterator(
 
 /// transaction wrapper, uses the current rocksdb transaction and non-tracking methods
 RocksDBSubTrxMethods::RocksDBSubTrxMethods(RocksDBTransactionState* state,
                                            rocksdb::Transaction* trx)
-    : RocksDBMethods(state), _trx(trx) {}
+    : RocksDBMethods(state), _trx(trx) {
+  _ro.prefix_same_as_start = true;
+  _ro.fill_cache = false;
+}
 
 rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key,
                                           std::string* val) {
   TRI_ASSERT(cf != nullptr);
-  rocksdb::ReadOptions const& ro = _state->_rocksReadOptions;
-  return _trx->Get(ro, cf, key, val);
+  return _trx->Get(_ro, cf, key, val);
 }
 
 rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key,
                                           rocksdb::PinnableSlice* val) {
   TRI_ASSERT(cf != nullptr);
-  rocksdb::ReadOptions const& ro = _state->_rocksReadOptions;
-  return _trx->Get(ro, cf, key, val);
+  return _trx->Get(_ro, cf, key, val);
 }
 
 rocksdb::Status RocksDBSubTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf,
diff --git a/arangod/RocksDBEngine/RocksDBMethods.h b/arangod/RocksDBEngine/RocksDBMethods.h
index 15339117538e..cae6f3569135 100644
--- a/arangod/RocksDBEngine/RocksDBMethods.h
+++ b/arangod/RocksDBEngine/RocksDBMethods.h
@@ -280,6 +280,7 @@ class RocksDBSubTrxMethods final : public RocksDBMethods {
 
  private:
   rocksdb::Transaction* _trx;
+  rocksdb::ReadOptions _ro;
 };
 
 // INDEXING MAY ONLY BE DISABLED IN TOPLEVEL AQL TRANSACTIONS
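For context, the `_inprogress` marker's round trip is small enough to sketch directly. The real logic lives in `RocksDBBuilderIndex::toVelocyPack` and `RocksDBIndexFactory::prepareIndexes` above; the helper names here are hypothetical. While a build is running the flag is persisted with the index definition, and a definition still carrying the flag after a restart belongs to a half-built index and gets dropped:

```cpp
#include <velocypack/Builder.h>
#include <velocypack/Slice.h>
#include <velocypack/Value.h>

// persisted with the index definition while the build is running;
// assumes the builder currently has an open object
void markInProgress(arangodb::velocypack::Builder& definition) {
  definition.add("_inprogress", arangodb::velocypack::Value(true));
}

// on restart: a definition that still carries the flag was never finished
bool shouldDropOnRecovery(arangodb::velocypack::Slice definition) {
  arangodb::velocypack::Slice flag = definition.get("_inprogress");
  return flag.isBoolean() && flag.getBoolean();
}
```
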
From e6d23c17881f6374864b891286571650122ac0de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Thu, 6 Dec 2018 16:23:18 +0100
Subject: [PATCH 10/31] fixed arangosearch cluster reporting

---
 arangod/IResearch/IResearchLink.cpp          |   5 +
 arangod/IResearch/IResearchLink.h            |   4 +--
 tests/js/common/shell/shell-index-rocksdb.js | 131 +++++++++++++++++++
 3 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 tests/js/common/shell/shell-index-rocksdb.js

diff --git a/arangod/IResearch/IResearchLink.cpp b/arangod/IResearch/IResearchLink.cpp
index b339c21b88f8..35d47d6d56fa 100644
--- a/arangod/IResearch/IResearchLink.cpp
+++ b/arangod/IResearch/IResearchLink.cpp
@@ -348,6 +348,11 @@ bool IResearchLink::isSorted() const {
   return false; // iResearch does not provide a fixed default sort order
 }
 
+bool IResearchLink::isHidden() const {
+  // hide links unless we are on a DBServer
+  return !arangodb::ServerState::instance()->isDBServer();
+}
+
 bool IResearchLink::json(arangodb::velocypack::Builder& 
builder) const { if (!builder.isOpenObject() || !_meta.json(builder)) { return false; diff --git a/arangod/IResearch/IResearchLink.h b/arangod/IResearch/IResearchLink.h index c194bdee6342..baa5279d8296 100644 --- a/arangod/IResearch/IResearchLink.h +++ b/arangod/IResearch/IResearchLink.h @@ -98,9 +98,7 @@ class IResearchLink { bool isSorted() const; // arangodb::Index override - bool isHidden() const { // arangodb::Index override - return true; // always hide links - } + bool isHidden() const; // arangodb::Index override //////////////////////////////////////////////////////////////////////////////// /// @brief the identifier for this link diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js new file mode 100644 index 000000000000..b00ab66e8aab --- /dev/null +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -0,0 +1,131 @@ +/*jshint globalstrict:false, strict:false */ +/*global fail, assertEqual, assertNotEqual, assertTrue, assertFalse */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test the index +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// +/// @author Dr. 
Frank Celler, Lucas Dohmen +/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +var jsunity = require("jsunity"); +var internal = require("internal"); +var errors = internal.errors; +var testHelper = require("@arangodb/test-helper").Helper; + +function backgroundIndexSuite() { + 'use strict'; + let cn = "UnitTestsCollectionIdx"; + let tasks = require("@arangodb/tasks"); + + return { + + setUp : function () { + internal.db._drop(cn); + internal.db._create(cn); + }, + + tearDown : function () { + tasks.get().forEach(function(task) { + if (task.id.match(/^UnitTest/) || task.name.match(/^UnitTest/)) { + try { + tasks.unregister(task); + } + catch (err) { + } + } + }); + internal.db._drop(cn); + }, + + testInsertInParallel: function () { + let n = 10; + for (let i = 0; i < n; ++i) { + let command = `let c = require("internal").db._collection("${cn}"); + let x = 25; while(x-- > 0) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value:i}) + } + c.save(docs); + }`; + tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); + } + + let time = require("internal").time; + let start = time(); + while (true) { + let indexes = require("internal").db._collection(cn).getIndexes(); + if (indexes.length === n + 1) { + // primary index + user-defined indexes + break; + } + if (time() - start > 180) { + // wait for 3 minutes maximum + fail("Timeout creating 80 indices after 3 minutes: " + JSON.stringify(indexes)); + } + require("internal").wait(0.5, false); + } + + let indexes = require("internal").db._collection(cn).getIndexes(); + assertEqual(n + 1, indexes.length); + }, + + testCreateInParallelDuplicate: function () { + let n = 100; + for (let i = 0; i < n; ++i) { + let command = 'require("internal").db._collection("' + cn + '").ensureIndex({ type: "hash", fields: ["value' + (i % 4) + '"] });'; + tasks.register({ name: "UnitTestsIndexCreate" + i, command: command }); + } + + let time = require("internal").time; + let start = time(); + while (true) { + let indexes = require("internal").db._collection(cn).getIndexes(); + if (indexes.length === 4 + 1) { + // primary index + user-defined indexes + break; + } + if (time() - start > 180) { + // wait for 3 minutes maximum + fail("Timeout creating indices after 3 minutes: " + JSON.stringify(indexes)); + } + require("internal").wait(0.5, false); + } + + // wait some extra time because we just have 4 distinct indexes + // these will be created relatively quickly. 
by waiting here a bit + // we give the other pending tasks a chance to execute too (but they + // will not do anything because the target indexes already exist) + require("internal").wait(5, false); + + let indexes = require("internal").db._collection(cn).getIndexes(); + assertEqual(4 + 1, indexes.length); + } + + }; +} + +jsunity.run(backgroundIndexSuite); + +return jsunity.done(); From 13bcd447204d9bcb55cd36b475b42d5e39d57f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 6 Dec 2018 17:30:42 +0100 Subject: [PATCH 11/31] added test --- tests/js/common/shell/shell-index-rocksdb.js | 201 +++++++++++++++---- 1 file changed, 161 insertions(+), 40 deletions(-) diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js index b00ab66e8aab..6859b4648cc7 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -28,10 +28,10 @@ /// @author Copyright 2012, triAGENS GmbH, Cologne, Germany //////////////////////////////////////////////////////////////////////////////// -var jsunity = require("jsunity"); -var internal = require("internal"); -var errors = internal.errors; -var testHelper = require("@arangodb/test-helper").Helper; +const jsunity = require("jsunity"); +const internal = require("internal"); +const errors = internal.errors; +const db = internal.db; function backgroundIndexSuite() { 'use strict'; @@ -41,8 +41,8 @@ function backgroundIndexSuite() { return { setUp : function () { - internal.db._drop(cn); - internal.db._create(cn); + db._drop(cn); + db._create(cn); }, tearDown : function () { @@ -55,14 +55,26 @@ function backgroundIndexSuite() { } } }); - internal.db._drop(cn); + db._drop(cn); }, - testInsertInParallel: function () { - let n = 10; + testInsertParallelNonUnique: function () { + let c = require("internal").db._collection(cn); + // first lets add some initial documents + let x = 10; + while(x-- > 0) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value:i}) + } + c.save(docs); + } + + // lets insert the rest via tasks + let n = 9; for (let i = 0; i < n; ++i) { let command = `let c = require("internal").db._collection("${cn}"); - let x = 25; while(x-- > 0) { + let x = 10; while(x-- > 0) { let docs = []; for(let i = 0; i < 1000; i++) { docs.push({value:i}) @@ -72,57 +84,166 @@ function backgroundIndexSuite() { tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); } + // create the index on the main thread + c.ensureIndex({type: 'hash', fields: ['value'], unique: false}); + let time = require("internal").time; let start = time(); while (true) { - let indexes = require("internal").db._collection(cn).getIndexes(); - if (indexes.length === n + 1) { - // primary index + user-defined indexes + if (c.count() === 100000) { break; } - if (time() - start > 180) { - // wait for 3 minutes maximum - fail("Timeout creating 80 indices after 3 minutes: " + JSON.stringify(indexes)); + if (time() - start > 180) { // wait for 3 minutes maximum + fail("Timeout creating documents after 3 minutes"); } require("internal").wait(0.5, false); } - - let indexes = require("internal").db._collection(cn).getIndexes(); - assertEqual(n + 1, indexes.length); + + // 250 entries of each value [0,999] + for (let i = 0; i < 1000; i++) { + let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", + {'@coll': cn, 'val': i}, {count:true}); + assertEqual(cursor.count(), 100); + } + + const estimate = 1000.0 / 100000.0; + + let indexes = 
c.getIndexes(true); + for (let i of indexes) { + switch (i.type) { + case 'primary': + break; + case 'hash': + assertEqual(i.selectivityEstimate, estimate); + break; + default: + fail(); + } + } }, - testCreateInParallelDuplicate: function () { - let n = 100; - for (let i = 0; i < n; ++i) { - let command = 'require("internal").db._collection("' + cn + '").ensureIndex({ type: "hash", fields: ["value' + (i % 4) + '"] });'; - tasks.register({ name: "UnitTestsIndexCreate" + i, command: command }); + testInsertParallelUnique: function () { + let c = require("internal").db._collection(cn); + // first lets add some initial documents + let x = 0; + while(x < 10000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + c.save(docs); + } + + // lets insert the rest via tasks + for (let i = 1; i < 5; ++i) { + let command = `let c = require("internal").db._collection("${cn}"); + let x = ${i} * 10000; + while(x < ${i + 1} * 10000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + c.save(docs); + }`; + tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); } + // create the index on the main thread + c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); + let time = require("internal").time; let start = time(); while (true) { - let indexes = require("internal").db._collection(cn).getIndexes(); - if (indexes.length === 4 + 1) { - // primary index + user-defined indexes + if (c.count() === 50000) { break; } - if (time() - start > 180) { - // wait for 3 minutes maximum - fail("Timeout creating indices after 3 minutes: " + JSON.stringify(indexes)); + if (time() - start > 300) { // wait for 5 minutes maximum + fail("Timeout creating documents after 5 minutes: " + c.count()); } require("internal").wait(0.5, false); } - - // wait some extra time because we just have 4 distinct indexes - // these will be created relatively quickly. 
by waiting here a bit - // we give the other pending tasks a chance to execute too (but they - // will not do anything because the target indexes already exist) - require("internal").wait(5, false); - - let indexes = require("internal").db._collection(cn).getIndexes(); - assertEqual(4 + 1, indexes.length); - } + // 250 entries of each value [0,999] + for (let i = 0; i < 50000; i++) { + let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", + {'@coll': cn, 'val': i}, {count:true}); + assertEqual(cursor.count(), 1); + } + + let indexes = c.getIndexes(true); + for (let i of indexes) { + switch (i.type) { + case 'primary': + break; + case 'hash': + assertEqual(i.selectivityEstimate, 1.0); + break; + default: + fail(); + } + } + }, + + testInsertParallelUniqueConstraintViolation: function () { + let c = require("internal").db._collection(cn); + // first lets add some initial documents + let x = 0; + while(x < 10000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + c.save(docs); + } + + // lets insert the rest via tasks + for (let i = 1; i < 5; ++i) { + let command = `let c = require("internal").db._collection("${cn}"); + let x = ${i} * 10000; + while(x < ${i + 1} * 10000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + c.save(docs); + }`; + tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); + } + + c.save({value: 1 }); // now trigger a conflict + //tasks.register({ name: "UnitTestsIndexInsert6" + i, command: `require("internal").db._collection("${cn}").save({value: 1 });` }); + + try { + // create the index on the main thread + c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); + fail(); + } catch(err) { + assertEqual(errors.ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED.code, err.errorNum); + } + + let time = require("internal").time; + let start = time(); + while (true) { + if (c.count() === 50000) { + break; + } + if (time() - start > 300) { // wait for 5 minutes maximum + fail("Timeout creating documents after 5 minutes: " + c.count()); + } + require("internal").wait(0.5, false); + } + + let indexes = c.getIndexes(); + for (let i of indexes) { + switch (i.type) { + case 'primary': + break; + case 'hash': + default: + fail(); + } + } + } }; } From 9e5f960fc11445ca55687a53e2ce8bae47a20055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 6 Dec 2018 17:35:26 +0100 Subject: [PATCH 12/31] fix test --- tests/js/common/shell/shell-index-rocksdb.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js index 6859b4648cc7..749febea4690 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -224,7 +224,7 @@ function backgroundIndexSuite() { let time = require("internal").time; let start = time(); while (true) { - if (c.count() === 50000) { + if (c.count() === 50001) { break; } if (time() - start > 300) { // wait for 5 minutes maximum From 6bfd840221802e7250fc103135741d63d4bef9c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 6 Dec 2018 17:54:36 +0100 Subject: [PATCH 13/31] make noncluster for now --- ...ell-index-rocksdb.js => shell-index-noncluster-rocksdb.js} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename tests/js/common/shell/{shell-index-rocksdb.js => shell-index-noncluster-rocksdb.js} (98%) diff --git a/tests/js/common/shell/shell-index-rocksdb.js 
b/tests/js/common/shell/shell-index-noncluster-rocksdb.js similarity index 98% rename from tests/js/common/shell/shell-index-rocksdb.js rename to tests/js/common/shell/shell-index-noncluster-rocksdb.js index 749febea4690..7cae9efcf93b 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-noncluster-rocksdb.js @@ -93,8 +93,8 @@ function backgroundIndexSuite() { if (c.count() === 100000) { break; } - if (time() - start > 180) { // wait for 3 minutes maximum - fail("Timeout creating documents after 3 minutes"); + if (time() - start > 300) { // wait for 5 minutes maximum + fail("Timeout creating documents after 5 minutes: " + c.count()); } require("internal").wait(0.5, false); } From 3d2052147e95f8a8b0a8344fd3e4d6fc36b04f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Fri, 7 Dec 2018 15:48:27 +0100 Subject: [PATCH 14/31] fix jslint --- tests/js/common/shell/shell-index-noncluster-rocksdb.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/js/common/shell/shell-index-noncluster-rocksdb.js b/tests/js/common/shell/shell-index-noncluster-rocksdb.js index 7cae9efcf93b..dc38e2a09641 100644 --- a/tests/js/common/shell/shell-index-noncluster-rocksdb.js +++ b/tests/js/common/shell/shell-index-noncluster-rocksdb.js @@ -65,7 +65,7 @@ function backgroundIndexSuite() { while(x-- > 0) { let docs = []; for(let i = 0; i < 1000; i++) { - docs.push({value:i}) + docs.push({value:i}); } c.save(docs); } @@ -129,7 +129,7 @@ function backgroundIndexSuite() { while(x < 10000) { let docs = []; for(let i = 0; i < 1000; i++) { - docs.push({value: x++}) + docs.push({value: x++}); } c.save(docs); } @@ -191,7 +191,7 @@ function backgroundIndexSuite() { while(x < 10000) { let docs = []; for(let i = 0; i < 1000; i++) { - docs.push({value: x++}) + docs.push({value: x++}); } c.save(docs); } @@ -218,7 +218,7 @@ function backgroundIndexSuite() { c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); fail(); } catch(err) { - assertEqual(errors.ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED.code, err.errorNum); + assertEqual(errors.ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED.code, err.errorNum, err); } let time = require("internal").time; From 38005e04c031fc7630760f0911cf2ed42f8d589c Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Mon, 10 Dec 2018 11:18:14 -0500 Subject: [PATCH 15/31] Some test adjustments. 
--- lib/Basics/RocksDBUtils.cpp | 2 +- ...ster-rocksdb.js => shell-index-rocksdb.js} | 128 ++++++++++-------- 2 files changed, 69 insertions(+), 61 deletions(-) rename tests/js/common/shell/{shell-index-noncluster-rocksdb.js => shell-index-rocksdb.js} (67%) diff --git a/lib/Basics/RocksDBUtils.cpp b/lib/Basics/RocksDBUtils.cpp index 280e15eb1420..68979017db0f 100644 --- a/lib/Basics/RocksDBUtils.cpp +++ b/lib/Basics/RocksDBUtils.cpp @@ -138,7 +138,7 @@ arangodb::Result convertStatus(rocksdb::Status const& status, StatusHint hint, s // should actually not occur with our RocksDB configuration return {TRI_ERROR_RESOURCE_LIMIT, prefix + "failed to acquire lock due to lock number limit"+ postfix }; } - return {TRI_ERROR_ARANGO_CONFLICT}; + return {TRI_ERROR_ARANGO_CONFLICT, "write-write conflict"}; case rocksdb::Status::Code::kExpired: return {TRI_ERROR_INTERNAL, prefix + "key expired; TTL was set in error"+ postfix}; case rocksdb::Status::Code::kTryAgain: diff --git a/tests/js/common/shell/shell-index-noncluster-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js similarity index 67% rename from tests/js/common/shell/shell-index-noncluster-rocksdb.js rename to tests/js/common/shell/shell-index-rocksdb.js index dc38e2a09641..be5359a265e1 100644 --- a/tests/js/common/shell/shell-index-noncluster-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -35,8 +35,26 @@ const db = internal.db; function backgroundIndexSuite() { 'use strict'; - let cn = "UnitTestsCollectionIdx"; - let tasks = require("@arangodb/tasks"); + const cn = "UnitTestsCollectionIdx"; + const tasks = require("@arangodb/tasks"); + const tasksCompleted = () => { + return 0 == tasks.get().filter((task) => { + return (task.id.match(/^UnitTest/) || task.name.match(/^UnitTest/)) + }).length; + }; + const waitForTasks = () => { + const time = require("internal").time; + const start = time(); + while (!tasksCompleted()) { + if (time() - start > 300) { // wait for 5 minutes maximum + fail("Timeout creating documents after 5 minutes: " + c.count()); + } + require("internal").wait(0.5, false); + } + require('internal').wal.flush(true, true); + // wait an extra second for good measure + require("internal").wait(1.0, false); + }; return { @@ -74,8 +92,9 @@ function backgroundIndexSuite() { let n = 9; for (let i = 0; i < n; ++i) { let command = `let c = require("internal").db._collection("${cn}"); - let x = 10; while(x-- > 0) { - let docs = []; + let x = 10; + while(x-- > 0) { + let docs = []; for(let i = 0; i < 1000; i++) { docs.push({value:i}) } @@ -87,34 +106,27 @@ function backgroundIndexSuite() { // create the index on the main thread c.ensureIndex({type: 'hash', fields: ['value'], unique: false}); - let time = require("internal").time; - let start = time(); - while (true) { - if (c.count() === 100000) { - break; - } - if (time() - start > 300) { // wait for 5 minutes maximum - fail("Timeout creating documents after 5 minutes: " + c.count()); - } - require("internal").wait(0.5, false); - } + // wait for insertion tasks to complete + waitForTasks(); + + // sanity check + assertEqual(c.count(), 100000); - // 250 entries of each value [0,999] - for (let i = 0; i < 1000; i++) { + // 100 entries of each value [0,999] + /*for (let i = 0; i < 1000; i++) { let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", {'@coll': cn, 'val': i}, {count:true}); assertEqual(cursor.count(), 100); - } - - const estimate = 1000.0 / 100000.0; + }*/ + internal.waitForEstimatorSync(); // make sure estimates are consistent let 
indexes = c.getIndexes(true); for (let i of indexes) { switch (i.type) { case 'primary': break; case 'hash': - assertEqual(i.selectivityEstimate, estimate); + assertEqual(i.selectivityEstimate, 0.01); break; default: fail(); @@ -122,6 +134,8 @@ function backgroundIndexSuite() { } }, + // if we run this in isolation, it passes, but the count is off otherwise. + // the slow part of the test is the individual sanity checks testInsertParallelUnique: function () { let c = require("internal").db._collection(cn); // first lets add some initial documents @@ -139,36 +153,36 @@ function backgroundIndexSuite() { let command = `let c = require("internal").db._collection("${cn}"); let x = ${i} * 10000; while(x < ${i + 1} * 10000) { - let docs = []; - for(let i = 0; i < 1000; i++) { - docs.push({value: x++}) - } - c.save(docs); - }`; + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + let res = c.save(docs); + res.map((obj) => { + if (obj.error) { + require('internal').print(JSON.stringify(obj)); + } + }); + }`; tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); } // create the index on the main thread c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); - let time = require("internal").time; - let start = time(); - while (true) { - if (c.count() === 50000) { - break; - } - if (time() - start > 300) { // wait for 5 minutes maximum - fail("Timeout creating documents after 5 minutes: " + c.count()); - } - require("internal").wait(0.5, false); - } - - // 250 entries of each value [0,999] + // wait for insertion tasks to complete + waitForTasks(); + + // sanity checks + const scanDocs = db._query("FOR doc IN @@coll RETURN doc", + {'@coll': cn}, {count:true, optimizer: {rules: ["-use-indexes"]}}).toArray(); + assertEqual(scanDocs.length, 50000); for (let i = 0; i < 50000; i++) { - let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", + const cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", {'@coll': cn, 'val': i}, {count:true}); assertEqual(cursor.count(), 1); } + assertEqual(c.count(), 50000);*/ let indexes = c.getIndexes(true); for (let i of indexes) { @@ -201,17 +215,17 @@ function backgroundIndexSuite() { let command = `let c = require("internal").db._collection("${cn}"); let x = ${i} * 10000; while(x < ${i + 1} * 10000) { - let docs = []; - for(let i = 0; i < 1000; i++) { - docs.push({value: x++}) - } - c.save(docs); - }`; + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({value: x++}) + } + c.save(docs); + }`; tasks.register({ name: "UnitTestsIndexInsert" + i, command: command }); } - c.save({value: 1 }); // now trigger a conflict - //tasks.register({ name: "UnitTestsIndexInsert6" + i, command: `require("internal").db._collection("${cn}").save({value: 1 });` }); + // now insert a document that will cause a conflict while indexing + c.save({value: 1 }); try { // create the index on the main thread @@ -221,17 +235,11 @@ function backgroundIndexSuite() { assertEqual(errors.ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED.code, err.errorNum, err); } - let time = require("internal").time; - let start = time(); - while (true) { - if (c.count() === 50001) { - break; - } - if (time() - start > 300) { // wait for 5 minutes maximum - fail("Timeout creating documents after 5 minutes: " + c.count()); - } - require("internal").wait(0.5, false); - } + // wait for insertion tasks to complete + waitForTasks(); + + // sanity checks + assertEqual(c.count(), 50001); let indexes = c.getIndexes(); for (let i 
of indexes) { From a7ae28ad6ab11cfc39a2a1a431c34fdc2aed1c81 Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Mon, 10 Dec 2018 12:37:39 -0500 Subject: [PATCH 16/31] Fix merge error. --- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 2ac480573ff6..0fd43f40b29a 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -44,7 +44,7 @@ using namespace arangodb; using namespace arangodb::rocksutils; RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr const& wp) - : RocksDBIndex(wp->id(), *wp->collection(), + : RocksDBIndex(wp->id(), wp->collection(), wp->fields(), wp->unique(), wp->sparse(), wp->columnFamily(), 0, false), _wrapped(wp), _hasError(false) { From db46a40334f50b706f1510f21cd37230b96c7a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Mon, 10 Dec 2018 21:43:45 +0100 Subject: [PATCH 17/31] changes --- arangod/Cluster/ClusterInfo.cpp | 7 +- arangod/ClusterEngine/ClusterCollection.cpp | 14 +- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 68 ++++---- arangod/RocksDBEngine/RocksDBBuilderIndex.h | 21 +-- arangod/RocksDBEngine/RocksDBCollection.cpp | 27 +-- arangod/RocksDBEngine/RocksDBIndex.cpp | 10 +- arangod/RocksDBEngine/RocksDBIndexFactory.cpp | 156 +++++++++--------- arangod/RocksDBEngine/RocksDBMethods.cpp | 18 +- arangod/RocksDBEngine/RocksDBMethods.h | 6 +- arangod/StorageEngine/PhysicalCollection.h | 3 +- arangod/VocBase/LogicalCollection.cpp | 2 +- lib/Basics/RocksDBUtils.cpp | 2 +- lib/Basics/StaticStrings.cpp | 1 + lib/Basics/StaticStrings.h | 1 + tests/IResearch/StorageEngineMock.cpp | 5 +- tests/IResearch/StorageEngineMock.h | 2 +- 16 files changed, 172 insertions(+), 171 deletions(-) diff --git a/arangod/Cluster/ClusterInfo.cpp b/arangod/Cluster/ClusterInfo.cpp index f12bf574e821..81ac076e5848 100644 --- a/arangod/Cluster/ClusterInfo.cpp +++ b/arangod/Cluster/ClusterInfo.cpp @@ -2433,14 +2433,11 @@ int ClusterInfo::ensureIndexCoordinator( // check index id uint64_t iid = 0; - VPackSlice const idSlice = slice.get(StaticStrings::IndexId); - if (idSlice.isString()) { - // use predefined index id + if (idSlice.isString()) { // use predefined index id iid = arangodb::basics::StringUtils::uint64(idSlice.copyString()); } - if (iid == 0) { - // no id set, create a new one! + if (iid == 0) { // no id set, create a new one! iid = uniqid(); } std::string const idString = arangodb::basics::StringUtils::itoa(iid); diff --git a/arangod/ClusterEngine/ClusterCollection.cpp b/arangod/ClusterEngine/ClusterCollection.cpp index 098d41f203cc..3af7849d79d8 100644 --- a/arangod/ClusterEngine/ClusterCollection.cpp +++ b/arangod/ClusterEngine/ClusterCollection.cpp @@ -368,14 +368,12 @@ std::shared_ptr ClusterCollection::createIndex( WRITE_LOCKER(guard, _exclusiveLock); std::shared_ptr idx; - { - WRITE_LOCKER(guard, _indexesLock); - idx = findIndex(info, _indexes); - if (idx) { - created = false; - // We already have this index. - return idx; - } + WRITE_LOCKER(guard2, _indexesLock); + idx = lookupIndex(info); + if (idx) { + created = false; + // We already have this index. 
+    return idx;
   }
 
   StorageEngine* engine = EngineSelectorFeature::ENGINE;
diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
index 2ac480573ff6..5e341b90e1ad 100644
--- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
+++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
@@ -71,30 +71,19 @@ Result RocksDBBuilderIndex::insertInternal(transaction::Methods* trx, RocksDBMet
                                            arangodb::velocypack::Slice const& slice,
                                            OperationMode mode) {
   Result r = _wrapped->insertInternal(trx, mthd, documentId, slice, mode);
-  if (!r.ok() && !_hasError.load(std::memory_order_acquire)) {
-    _errorResult = r;
-    _hasError.store(true, std::memory_order_release);
+  if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED) ||
+      r.is(TRI_ERROR_LOCK_TIMEOUT) ||
+      r.is(TRI_ERROR_DEADLOCK) ||
+      r.is(TRI_ERROR_ARANGO_CONFLICT)) {
+    bool expected = false;
+    if (_hasError.compare_exchange_strong(expected, true, std::memory_order_release)) {
+      std::lock_guard<std::mutex> guard(_errorMutex);
+      _errorResult = r;
+    }
   }
   return Result();
 }
 
-//Result RocksDBBuilderIndex::updateInternal(transaction::Methods* trx, RocksDBMethods* mthd,
-//                                           LocalDocumentId const& oldDocumentId,
-//                                           arangodb::velocypack::Slice const& oldDoc,
-//                                           LocalDocumentId const& newDocumentId,
-//                                           arangodb::velocypack::Slice const& newDoc,
-//                                           OperationMode mode) {
-//  // It is illegal to call this method on the primary index
-//  // RocksDBPrimaryIndex must override this method accordingly
-//  TRI_ASSERT(type() != TRI_IDX_TYPE_PRIMARY_INDEX);
-//
-//  Result res = removeInternal(trx, mthd, oldDocumentId, oldDoc, mode);
-//  if (!res.ok()) {
-//    return res;
-//  }
-//  return insertInternal(trx, mthd, newDocumentId, newDoc, mode);
-//}
-
 /// remove index elements and put it in the specified write batch.
 Result RocksDBBuilderIndex::removeInternal(transaction::Methods* trx, RocksDBMethods* mthd,
                                            LocalDocumentId const& documentId,
@@ -112,9 +101,15 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods* trx, RocksDBMet
   }
 
   Result r = _wrapped->removeInternal(trx, mthd, documentId, slice, mode);
-  if (!r.ok() && !_hasError.load(std::memory_order_acquire)) {
-    _errorResult = r;
-    _hasError.store(true, std::memory_order_release);
+  if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED) ||
+      r.is(TRI_ERROR_LOCK_TIMEOUT) ||
+      r.is(TRI_ERROR_DEADLOCK) ||
+      r.is(TRI_ERROR_ARANGO_CONFLICT)) {
+    bool expected = false;
+    if (_hasError.compare_exchange_strong(expected, true, std::memory_order_release)) {
+      std::lock_guard<std::mutex> guard(_errorMutex);
+      _errorResult = r;
+    }
   }
   return Result();
 }
@@ -128,6 +123,12 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
   // 3. Avoid conflicts on unique index keys by using rocksdb::Transaction snapshot conflict checking
   // 4. Suppress unique constraint violations / conflicts for client drivers
 
+  auto lockedDocsGuard = scopeGuard([&] { // clear all the processed documents
+    std::lock_guard<std::mutex> guard(_lockedDocsMutex);
+    _lockedDocs.clear();
+    _lockedDocsCond.notify_all();
+  });
+
   // fillIndex can be non-transactional, we just need to clean up
   RocksDBEngine* engine = rocksutils::globalRocksEngine();
   RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
@@ -177,7 +178,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
   } else {
     rtrx->DisableIndexing(); // we never check for existing index keys
   }
-  RocksDBSubTrxMethods batched(state, rtrx.get());
+  RocksDBSideTrxMethods batched(state, rtrx.get());
 
   RocksDBIndex* internal = _wrapped.get();
   TRI_ASSERT(internal != nullptr);
@@ -186,6 +187,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
 
   while (it->Valid() && it->key().compare(upper) < 0) {
     if (_hasError.load(std::memory_order_acquire)) {
+      std::lock_guard<std::mutex> guard(_errorMutex);
       res = _errorResult; // a Writer got an error
       break;
     }
@@ -197,7 +199,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
         continue;
       }
       std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
-      _lockedDocs.insert(docId.id());// must be done under _removedDocsMutex
+      _lockedDocs.insert(docId.id());
     }
 
     res = internal->insertInternal(&trx, &batched, docId,
@@ -215,7 +217,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
       break;
     }
     { // clear all the processed documents
-      std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
+      std::lock_guard<std::mutex> guard(_lockedDocsMutex);
       _lockedDocs.clear();
       _lockedDocsCond.notify_all();
     }
@@ -240,11 +242,6 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function
     res = trx.commit(); // required to commit selectivity estimates
   }
 
-  // clear all the processed documents
-  std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
-  _lockedDocs.clear();
-  _lockedDocsCond.notify_all();
-
   return res;
 }
 
@@ -331,15 +328,12 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx,
 
 /// non-transactional: fill index with existing documents
 /// from this collection
-arangodb::Result RocksDBBuilderIndex::fillIndex(std::function<void()> const& unlock) {
-//  TRI_ASSERT(trx.state()->collection(_collection.id(), AccessMode::Type::WRITE));
+arangodb::Result RocksDBBuilderIndex::fillIndexFast() {
 
-#if 0
   RocksDBCollection* coll = static_cast<RocksDBCollection*>(_collection.getPhysical());
-  unlock(); // we do not need the outer lock
   SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()),
                                   _collection, AccessMode::Type::EXCLUSIVE);
-  res = trx.begin();
+  Result res = trx.begin();
   if (!res.ok()) {
     THROW_ARANGO_EXCEPTION(res);
   }
@@ -355,6 +349,4 @@ arangodb::Result RocksDBBuilderIndex::fillIndexFast()
   rocksdb::WriteBatch batch(32 * 1024 * 1024);
   return ::fillIndexFast(trx, this, coll, batch);
 }
-#endif
-  return fillIndexBackground(unlock);
-}
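The intent of the atomic flag plus mutex pair above is a first-error-wins latch: many writer threads may fail concurrently while the index is being built, but only the first failure should be recorded for the build thread to pick up. A standalone sketch of the pattern, with illustrative types standing in for arangodb::Result:

```cpp
#include <atomic>
#include <mutex>
#include <string>

struct ErrorLatch {
  std::atomic<bool> hasError{false};
  std::mutex mtx;
  std::string firstError;  // stands in for arangodb::Result

  void report(std::string const& err) {
    bool expected = false;
    // only the thread that flips the flag records its error
    if (hasError.compare_exchange_strong(expected, true)) {
      std::lock_guard<std::mutex> guard(mtx);
      firstError = err;
    }
  }

  bool failed(std::string& out) {
    if (!hasError.load(std::memory_order_acquire)) {
      return false;
    }
    std::lock_guard<std::mutex> guard(mtx);
    out = firstError;
    return true;
  }
};
```
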
diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.h b/arangod/RocksDBEngine/RocksDBBuilderIndex.h
index eb163cd6bc94..f8f4255a3d21 100644
--- a/arangod/RocksDBEngine/RocksDBBuilderIndex.h
+++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.h
@@ -30,6 +30,9 @@
 
 namespace arangodb {
 
+/// Dummy index class that contains the logic to build indexes
+/// without an exclusive lock. It wraps the actual index implementation
+/// and adds some required synchronization logic on top
 class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
 
  public:
@@ -89,13 +92,6 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
                         LocalDocumentId const& documentId,
                         arangodb::velocypack::Slice const&,
                         OperationMode mode) override;
-//
-//  Result updateInternal(transaction::Methods* trx, RocksDBMethods*,
-//                        LocalDocumentId const& oldDocumentId,
-//                        arangodb::velocypack::Slice const& oldDoc,
-//                        LocalDocumentId const& newDocumentId,
-//                        velocypack::Slice const& newDoc,
-//                        OperationMode mode) override;
 
   /// remove index elements and put it in the specified write batch.
   Result removeInternal(transaction::Methods* trx, RocksDBMethods*,
@@ -116,17 +112,18 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
     _wrapped->recalculateEstimates();
   }
 
-  /// @brief fill the index
-  /// @param unlock will be called when the index lock can be released
-  Result fillIndex(std::function<void()> const& unlock);
-
-private:
+  /// @brief fill index, will exclusively lock the collection
+  Result fillIndexFast();
 
+  /// @brief fill the index, assume already locked exclusively
+  /// @param unlock called when collection lock can be released
   Result fillIndexBackground(std::function<void()> const& unlock);
 
+ private:
   std::shared_ptr<RocksDBIndex> _wrapped;
+
   std::atomic<bool> _hasError;
+  std::mutex _errorMutex;
   Result _errorResult;
 
   std::mutex _removedDocsMutex;
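The header comment added above describes a proxy: while the build runs, the builder index sits in the collection's index list in place of the real one, so concurrent writers keep it in sync with the background scan. A minimal sketch of that shape, with illustrative stand-in types (the real class wraps a RocksDBIndex and adds the error latch and document tracking):

```cpp
#include <memory>
#include <string>

struct IndexLike {
  virtual ~IndexLike() = default;
  virtual bool insert(std::string const& doc) = 0;
};

// Stands in for the partially built index: every live write is forwarded to
// the wrapped index, so writers and the background scan converge on the same
// state; once the fill completes, the wrapped index is swapped back in.
struct BuilderProxy final : IndexLike {
  explicit BuilderProxy(std::shared_ptr<IndexLike> wrapped)
      : _wrapped(std::move(wrapped)) {}

  bool insert(std::string const& doc) override {
    return _wrapped->insert(doc);  // forward; a failure would be latched here
  }

 private:
  std::shared_ptr<IndexLike> _wrapped;
};
```
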
diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp
index beac732d7818..92064b3787a1 100644
--- a/arangod/RocksDBEngine/RocksDBCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollection.cpp
@@ -312,8 +312,8 @@ void RocksDBCollection::prepareIndexes(
 }
 
 std::shared_ptr<Index> RocksDBCollection::createIndex(
-    arangodb::velocypack::Slice const& info, bool restore,
-    bool& created) {
+    arangodb::velocypack::Slice const& info,
+    bool restore, bool& created) {
   TRI_ASSERT(info.isObject());
   Result res;
 
@@ -345,7 +345,7 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
   }
 
   RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
-  
+
   // Step 2. We are sure that we do not have an index of this type.
   // We also hold the lock. Create it
   const bool generateKey = !restore;
@@ -367,11 +367,10 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
   }
 
   auto buildIdx = std::make_shared<RocksDBBuilderIndex>(std::static_pointer_cast<arangodb::RocksDBIndex>(idx));
-  
+
   // Step 3. add index to collection entry (for removal after a crash)
-  if (!engine->inRecovery()) {
-    // read collection info from database
-    RocksDBKey key;
+  if (!engine->inRecovery()) { // manually modify collection entry, other methods need lock
+    RocksDBKey key; // read collection info from database
     key.constructCollection(_logicalCollection.vocbase().id(), _logicalCollection.id());
     rocksdb::PinnableSlice value;
     rocksdb::Status s = engine->db()->Get(rocksdb::ReadOptions(),
@@ -399,13 +398,21 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(
                                   RocksDBLogValue::Empty());
     }
   }
+
+  bool inBackground = basics::VelocyPackHelper::getBooleanValue(
+      info, StaticStrings::IndexInBackground, false);
 
   // Step 4. fill index
   if (res.ok()) {
-    _indexes.emplace_back(buildIdx); // add index to indexes list
-    res = buildIdx->fillIndex([&] {
-      unlockGuard.fire();
-    });
+    if (inBackground) { // allow concurrent inserts into index
+      _indexes.emplace_back(buildIdx);
+      res = buildIdx->fillIndexBackground([&] {
+        unlockGuard.fire();
+      });
+    } else {
+      unlockGuard.fire();
+      res = buildIdx->fillIndexFast(); // will lock again internally
+    }
  }
 
   // Step 5. 
cleanup diff --git a/arangod/RocksDBEngine/RocksDBIndex.cpp b/arangod/RocksDBEngine/RocksDBIndex.cpp index 63138114d400..bad12d46250b 100644 --- a/arangod/RocksDBEngine/RocksDBIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBIndex.cpp @@ -53,8 +53,10 @@ using namespace arangodb::rocksutils; uint64_t const arangodb::RocksDBIndex::ESTIMATOR_SIZE = 4096; -inline static uint64_t ensureObjectId(uint64_t oid) { - return (oid != 0) ? oid : TRI_NewTickServer(); +namespace { + inline uint64_t ensureObjectId(uint64_t oid) { + return (oid != 0) ? oid : TRI_NewTickServer(); + } } RocksDBIndex::RocksDBIndex( @@ -68,7 +70,7 @@ RocksDBIndex::RocksDBIndex( bool useCache ) : Index(id, collection, attributes, unique, sparse), - _objectId(ensureObjectId(objectId)), + _objectId(::ensureObjectId(objectId)), _cf(cf), _cache(nullptr), _cachePresent(false), @@ -94,7 +96,7 @@ RocksDBIndex::RocksDBIndex( bool useCache ) : Index(id, collection, info), - _objectId(ensureObjectId(basics::VelocyPackHelper::stringUInt64(info.get("objectId")))), + _objectId(::ensureObjectId(basics::VelocyPackHelper::stringUInt64(info.get("objectId")))), _cf(cf), _cache(nullptr), _cachePresent(false), diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp index d2ee4a5440d0..1bd64eb727c5 100644 --- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp +++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp @@ -151,6 +151,17 @@ static void ProcessIndexDeduplicateFlag(VPackSlice const definition, builder.add("deduplicate", VPackValue(dup)); } +//////////////////////////////////////////////////////////////////////////////// +/// @brief process the index in background flag and add it to the json +//////////////////////////////////////////////////////////////////////////////// + +static void ProcessIndexInBackgroundFlag(VPackSlice const definition, + VPackBuilder& builder) { + bool bck = basics::VelocyPackHelper::getBooleanValue(definition, + StaticStrings::IndexInBackground, false); + builder.add(StaticStrings::IndexInBackground, VPackValue(bck)); +} + //////////////////////////////////////////////////////////////////////////////// /// @brief enhances the json of a vpack index //////////////////////////////////////////////////////////////////////////////// @@ -163,6 +174,7 @@ static int EnhanceJsonIndexVPack(VPackSlice const definition, ProcessIndexSparseFlag(definition, builder, create); ProcessIndexUniqueFlag(definition, builder); ProcessIndexDeduplicateFlag(definition, builder); + ProcessIndexInBackgroundFlag(definition, builder); } return res; @@ -291,9 +303,9 @@ static int EnhanceJsonIndexFulltext(VPackSlice const definition, namespace { struct DefaultIndexFactory: public arangodb::IndexTypeFactory { - std::string const _type; + arangodb::Index::IndexType const _type; - DefaultIndexFactory(std::string const& type): _type(type) {} + DefaultIndexFactory(arangodb::Index::IndexType type): _type(type) {} virtual bool equal( arangodb::velocypack::Slice const& lhs, @@ -321,10 +333,8 @@ struct DefaultIndexFactory: public arangodb::IndexTypeFactory { } } - auto type = Index::type(_type); - - if (arangodb::Index::IndexType::TRI_IDX_TYPE_GEO1_INDEX == type|| - arangodb::Index::IndexType::TRI_IDX_TYPE_GEO_INDEX == type) { + if (arangodb::Index::IndexType::TRI_IDX_TYPE_GEO1_INDEX == _type|| + arangodb::Index::IndexType::TRI_IDX_TYPE_GEO_INDEX == _type) { // geoJson must be identical if present value = lhs.get("geoJson"); @@ -332,7 +342,7 @@ struct DefaultIndexFactory: public arangodb::IndexTypeFactory { && 
arangodb::basics::VelocyPackHelper::compare(value, rhs.get("geoJson"), false)) { return false; } - } else if (arangodb::Index::IndexType::TRI_IDX_TYPE_FULLTEXT_INDEX == type) { + } else if (arangodb::Index::IndexType::TRI_IDX_TYPE_FULLTEXT_INDEX == _type) { // minLength value = lhs.get("minLength"); @@ -346,7 +356,7 @@ struct DefaultIndexFactory: public arangodb::IndexTypeFactory { value = lhs.get(arangodb::StaticStrings::IndexFields); if (value.isArray()) { - if (arangodb::Index::IndexType::TRI_IDX_TYPE_HASH_INDEX == type) { + if (arangodb::Index::IndexType::TRI_IDX_TYPE_HASH_INDEX == _type) { arangodb::velocypack::ValueLength const nv = value.length(); // compare fields in arbitrary order @@ -386,7 +396,8 @@ struct DefaultIndexFactory: public arangodb::IndexTypeFactory { }; struct EdgeIndexFactory: public DefaultIndexFactory { - EdgeIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + EdgeIndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_EDGE_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -436,7 +447,8 @@ struct EdgeIndexFactory: public DefaultIndexFactory { }; struct FulltextIndexFactory: public DefaultIndexFactory { - FulltextIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + FulltextIndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_FULLTEXT_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -479,7 +491,8 @@ struct FulltextIndexFactory: public DefaultIndexFactory { }; struct GeoIndexFactory: public DefaultIndexFactory { - GeoIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + GeoIndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_GEO_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -522,7 +535,8 @@ struct GeoIndexFactory: public DefaultIndexFactory { }; struct Geo1IndexFactory: public DefaultIndexFactory { - Geo1IndexFactory(std::string const& type): DefaultIndexFactory(type) {} + Geo1IndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_GEO_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -565,7 +579,8 @@ struct Geo1IndexFactory: public DefaultIndexFactory { }; struct Geo2IndexFactory: public DefaultIndexFactory { - Geo2IndexFactory(std::string const& type): DefaultIndexFactory(type) {} + Geo2IndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_GEO_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -606,9 +621,10 @@ struct Geo2IndexFactory: public DefaultIndexFactory { return EnhanceJsonIndexGeo2(definition, normalized, isCreation); } }; - -struct HashIndexFactory: public DefaultIndexFactory { - HashIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + +template +struct SecondaryIndexFactory: public DefaultIndexFactory { + SecondaryIndexFactory() : DefaultIndexFactory(type) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -617,10 +633,7 @@ struct HashIndexFactory: public DefaultIndexFactory { TRI_idx_iid_t id, bool isClusterConstructor ) const override { - index = std::make_shared( - id, collection, definition - ); - + index = std::make_shared(id, collection, definition); return arangodb::Result(); } @@ -633,7 +646,7 @@ struct HashIndexFactory: public DefaultIndexFactory { normalized.add( arangodb::StaticStrings::IndexType, arangodb::velocypack::Value( - arangodb::Index::oldtypeName(arangodb::Index::TRI_IDX_TYPE_HASH_INDEX) + arangodb::Index::oldtypeName(type) ) ); @@ -649,7 +662,7 @@ 
struct HashIndexFactory: public DefaultIndexFactory { return EnhanceJsonIndexVPack(definition, normalized, isCreation); } }; - +/* struct PersistentIndexFactory: public DefaultIndexFactory { PersistentIndexFactory(std::string const& type): DefaultIndexFactory(type) {} @@ -692,9 +705,42 @@ struct PersistentIndexFactory: public DefaultIndexFactory { return EnhanceJsonIndexVPack(definition, normalized, isCreation); } }; + +struct SkiplistIndexFactory: public DefaultIndexFactory { + SkiplistIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + + virtual arangodb::Result instantiate(std::shared_ptr& index, + arangodb::LogicalCollection& collection, + arangodb::velocypack::Slice const& definition, + TRI_idx_iid_t id, + bool isClusterConstructor + ) const override { + index = std::make_shared(id, collection, definition); + return arangodb::Result(); + } + + virtual arangodb::Result normalize(arangodb::velocypack::Builder& normalized, + arangodb::velocypack::Slice definition, + bool isCreation) const override { + TRI_ASSERT(normalized.isOpenObject()); + normalized.add(arangodb::StaticStrings::IndexType, + arangodb::velocypack::Value(arangodb::Index::oldtypeName(arangodb::Index::TRI_IDX_TYPE_SKIPLIST_INDEX))); + + if (isCreation + && !ServerState::instance()->isCoordinator() + && !definition.hasKey("objectId")) { + normalized.add("objectId", + arangodb::velocypack::Value(std::to_string(TRI_NewTickServer()))); + } + + return EnhanceJsonIndexVPack(definition, normalized, isCreation); + } +}; +*/ struct PrimaryIndexFactory: public DefaultIndexFactory { - PrimaryIndexFactory(std::string const& type): DefaultIndexFactory(type) {} + PrimaryIndexFactory() + : DefaultIndexFactory(arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX) {} virtual arangodb::Result instantiate( std::shared_ptr& index, @@ -740,61 +786,21 @@ struct PrimaryIndexFactory: public DefaultIndexFactory { } }; -struct SkiplistIndexFactory: public DefaultIndexFactory { - SkiplistIndexFactory(std::string const& type): DefaultIndexFactory(type) {} - - virtual arangodb::Result instantiate( - std::shared_ptr& index, - arangodb::LogicalCollection& collection, - arangodb::velocypack::Slice const& definition, - TRI_idx_iid_t id, - bool isClusterConstructor - ) const override { - index = std::make_shared( - id, collection, definition - ); - - return arangodb::Result(); - } - - virtual arangodb::Result normalize( - arangodb::velocypack::Builder& normalized, - arangodb::velocypack::Slice definition, - bool isCreation - ) const override { - TRI_ASSERT(normalized.isOpenObject()); - normalized.add( - arangodb::StaticStrings::IndexType, - arangodb::velocypack::Value(arangodb::Index::oldtypeName( - arangodb::Index::TRI_IDX_TYPE_SKIPLIST_INDEX - )) - ); - - if (isCreation - && !ServerState::instance()->isCoordinator() - && !definition.hasKey("objectId")) { - normalized.add( - "objectId", - arangodb::velocypack::Value(std::to_string(TRI_NewTickServer())) - ); - } - - return EnhanceJsonIndexVPack(definition, normalized, isCreation); - } -}; - } RocksDBIndexFactory::RocksDBIndexFactory() { - static const EdgeIndexFactory edgeIndexFactory("edge"); - static const FulltextIndexFactory fulltextIndexFactory("fulltext"); - static const GeoIndexFactory geoIndexFactory("geo"); - static const Geo1IndexFactory geo1IndexFactory("geo1"); - static const Geo2IndexFactory geo2IndexFactory("geo2"); - static const HashIndexFactory hashIndexFactory("hash"); - static const PersistentIndexFactory persistentIndexFactory("persistent"); - static const PrimaryIndexFactory 
primaryIndexFactory("primary"); - static const SkiplistIndexFactory skiplistIndexFactory("skiplist"); + static const EdgeIndexFactory edgeIndexFactory; + static const FulltextIndexFactory fulltextIndexFactory; + static const GeoIndexFactory geoIndexFactory; + static const Geo1IndexFactory geo1IndexFactory; + static const Geo2IndexFactory geo2IndexFactory; + static const SecondaryIndexFactory hashIndexFactory; + static const SecondaryIndexFactory persistentIndexFactory; + static const SecondaryIndexFactory skiplistIndexFactory; + static const PrimaryIndexFactory primaryIndexFactory; emplace("edge", edgeIndexFactory); emplace("fulltext", fulltextIndexFactory); diff --git a/arangod/RocksDBEngine/RocksDBMethods.cpp b/arangod/RocksDBEngine/RocksDBMethods.cpp index 08778ce1d88e..08b24b857d5e 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.cpp +++ b/arangod/RocksDBEngine/RocksDBMethods.cpp @@ -384,46 +384,46 @@ std::unique_ptr RocksDBBatchedWithIndexMethods::NewIterator( _wb->NewIteratorWithBase(_db->NewIterator(ro, cf))); } -// =================== RocksDBSubTrxMethods ==================== +// =================== RocksDBSideTrxMethods ==================== -/// transaction wrapper, uses the current rocksdb transaction and non-tracking methods -RocksDBSubTrxMethods::RocksDBSubTrxMethods(RocksDBTransactionState* state, +/// transaction wrapper, uses the provided rocksdb transaction +RocksDBSideTrxMethods::RocksDBSideTrxMethods(RocksDBTransactionState* state, rocksdb::Transaction* trx) : RocksDBMethods(state), _trx(trx) { _ro.prefix_same_as_start = true; _ro.fill_cache = false; } -rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, +rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, std::string* val) { TRI_ASSERT(cf != nullptr); return _trx->Get(_ro, cf, key, val); } -rocksdb::Status RocksDBSubTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, +rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf, rocksdb::Slice const& key, rocksdb::PinnableSlice* val) { TRI_ASSERT(cf != nullptr); return _trx->Get(_ro, cf, key, val); } -rocksdb::Status RocksDBSubTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf, +rocksdb::Status RocksDBSideTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf, RocksDBKey const& key, rocksdb::Slice const& val) { TRI_ASSERT(cf != nullptr); return _trx->Put(cf, key.string(), val); } -rocksdb::Status RocksDBSubTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf, +rocksdb::Status RocksDBSideTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf, RocksDBKey const& key) { TRI_ASSERT(cf != nullptr); return _trx->Delete(cf, key.string()); } -rocksdb::Status RocksDBSubTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf, +rocksdb::Status RocksDBSideTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf, RocksDBKey const& key) { TRI_ASSERT(cf != nullptr); return _trx->SingleDelete(cf, key.string()); } -bool RocksDBSubTrxMethods::DisableIndexing() { +bool RocksDBSideTrxMethods::DisableIndexing() { _trx->DisableIndexing(); return true; } diff --git a/arangod/RocksDBEngine/RocksDBMethods.h b/arangod/RocksDBEngine/RocksDBMethods.h index cae6f3569135..188283431721 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.h +++ b/arangod/RocksDBEngine/RocksDBMethods.h @@ -249,10 +249,10 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods { rocksdb::WriteBatchWithIndex* _wb; }; -/// transaction wrapper, uses the current rocksdb transaction and 
non-tracking methods -class RocksDBSubTrxMethods final : public RocksDBMethods { +/// transaction wrapper, uses the provided rocksdb transaction +class RocksDBSideTrxMethods final : public RocksDBMethods { public: - explicit RocksDBSubTrxMethods(RocksDBTransactionState* state, + explicit RocksDBSideTrxMethods(RocksDBTransactionState* state, rocksdb::Transaction* trx); rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key, diff --git a/arangod/StorageEngine/PhysicalCollection.h b/arangod/StorageEngine/PhysicalCollection.h index 0dd2bac52d7b..05482ec3c13a 100644 --- a/arangod/StorageEngine/PhysicalCollection.h +++ b/arangod/StorageEngine/PhysicalCollection.h @@ -119,8 +119,7 @@ class PhysicalCollection { /// @brief create or restore an index /// @param restore utilize specified ID, assume index has to be created virtual std::shared_ptr createIndex( - arangodb::velocypack::Slice const& info, bool restore, - bool& created) = 0; + arangodb::velocypack::Slice const& info, bool restore, bool& created) = 0; virtual bool dropIndex(TRI_idx_iid_t iid) = 0; diff --git a/arangod/VocBase/LogicalCollection.cpp b/arangod/VocBase/LogicalCollection.cpp index de0921a3d011..efbdfc5510e5 100644 --- a/arangod/VocBase/LogicalCollection.cpp +++ b/arangod/VocBase/LogicalCollection.cpp @@ -590,7 +590,7 @@ void LogicalCollection::toVelocyPackForClusterInventory(VPackBuilder& result, getIndexesVPack(result, Index::makeFlags(), [](arangodb::Index const* idx) { // we have to exclude the primary and the edge index here, because otherwise // at least the MMFiles engine will try to create it - // AND exclude arangosearch indexes + // AND exclude hidden indexes return (idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX && idx->type() != arangodb::Index::TRI_IDX_TYPE_EDGE_INDEX && !idx->isHidden()); diff --git a/lib/Basics/RocksDBUtils.cpp b/lib/Basics/RocksDBUtils.cpp index 280e15eb1420..68979017db0f 100644 --- a/lib/Basics/RocksDBUtils.cpp +++ b/lib/Basics/RocksDBUtils.cpp @@ -138,7 +138,7 @@ arangodb::Result convertStatus(rocksdb::Status const& status, StatusHint hint, s // should actually not occur with our RocksDB configuration return {TRI_ERROR_RESOURCE_LIMIT, prefix + "failed to acquire lock due to lock number limit"+ postfix }; } - return {TRI_ERROR_ARANGO_CONFLICT}; + return {TRI_ERROR_ARANGO_CONFLICT, "write-write conflict"}; case rocksdb::Status::Code::kExpired: return {TRI_ERROR_INTERNAL, prefix + "key expired; TTL was set in error"+ postfix}; case rocksdb::Status::Code::kTryAgain: diff --git a/lib/Basics/StaticStrings.cpp b/lib/Basics/StaticStrings.cpp index 82005548c3fa..6572cfd53b42 100644 --- a/lib/Basics/StaticStrings.cpp +++ b/lib/Basics/StaticStrings.cpp @@ -96,6 +96,7 @@ std::string const StaticStrings::IndexSparse("sparse"); std::string const StaticStrings::IndexType("type"); std::string const StaticStrings::IndexUnique("unique"); std::string const StaticStrings::IndexIsBuilding("isBuilding"); +std::string const StaticStrings::IndexInBackground("inBackground"); // HTTP headers std::string const StaticStrings::Accept("accept"); diff --git a/lib/Basics/StaticStrings.h b/lib/Basics/StaticStrings.h index d038b8f29dec..7e38e6b007da 100644 --- a/lib/Basics/StaticStrings.h +++ b/lib/Basics/StaticStrings.h @@ -95,6 +95,7 @@ class StaticStrings { static std::string const IndexType; // index type static std::string const IndexUnique; // index uniqueness marker static std::string const IndexIsBuilding; // index build in-process + static std::string const IndexInBackground; // index in 
background // HTTP headers static std::string const Accept; diff --git a/tests/IResearch/StorageEngineMock.cpp b/tests/IResearch/StorageEngineMock.cpp index cbcdf7d5b097..c872cc338e85 100644 --- a/tests/IResearch/StorageEngineMock.cpp +++ b/tests/IResearch/StorageEngineMock.cpp @@ -531,7 +531,8 @@ int PhysicalCollectionMock::close() { return TRI_ERROR_NO_ERROR; // assume close successful } -std::shared_ptr PhysicalCollectionMock::createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) { +std::shared_ptr PhysicalCollectionMock::createIndex(arangodb::velocypack::Slice const& info, + bool restore, bool& created) { before(); std::vector> docs; @@ -1593,4 +1594,4 @@ bool TransactionStateMock::hasFailedOperations() const { // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- \ No newline at end of file +// ----------------------------------------------------------------------------- diff --git a/tests/IResearch/StorageEngineMock.h b/tests/IResearch/StorageEngineMock.h index a9877b7358d3..c4cf1effaf7b 100644 --- a/tests/IResearch/StorageEngineMock.h +++ b/tests/IResearch/StorageEngineMock.h @@ -60,7 +60,7 @@ class PhysicalCollectionMock: public arangodb::PhysicalCollection { PhysicalCollectionMock(arangodb::LogicalCollection& collection, arangodb::velocypack::Slice const& info); virtual PhysicalCollection* clone(arangodb::LogicalCollection& collection) const override; virtual int close() override; - virtual std::shared_ptr createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) override; + virtual std::shared_ptr createIndex(arangodb::velocypack::Slice const& info, bool, bool&) override; virtual void deferDropCollection(std::function const& callback) override; virtual bool dropIndex(TRI_idx_iid_t iid) override; virtual void figuresSpecific(std::shared_ptr&) override; From ef4bb05b3d08ac2f93188fb72d07ac24d5aac577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Tue, 11 Dec 2018 02:45:42 +0100 Subject: [PATCH 18/31] adding inBackground flag --- arangod/RestHandler/RestShutdownHandler.cpp | 4 ++-- arangod/RocksDBEngine/RocksDBCollection.cpp | 16 ++++++++++------ tests/js/common/shell/shell-index-rocksdb.js | 13 ++++++------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arangod/RestHandler/RestShutdownHandler.cpp b/arangod/RestHandler/RestShutdownHandler.cpp index f146a3fc6996..d60dd59a4e36 100644 --- a/arangod/RestHandler/RestShutdownHandler.cpp +++ b/arangod/RestHandler/RestShutdownHandler.cpp @@ -103,12 +103,12 @@ RestStatus RestShutdownHandler::execute() { rest::Scheduler* scheduler = SchedulerFeature::SCHEDULER; // don't block the response for workers waiting on this callback // this should allow workers to go into the IDLE state - scheduler->queue(RequestPriority::HIGH, [this] { + scheduler->queue(RequestPriority::HIGH, []{ // Give the server 2 seconds to send the reply: std::this_thread::sleep_for(std::chrono::seconds(2)); // Go down: ApplicationServer::server->beginShutdown(); - }); + }); return RestStatus::DONE; } diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index 92064b3787a1..7bf2ee1c8218 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -417,17 +417,21 @@ std::shared_ptr RocksDBCollection::createIndex( // Step 5. 
cleanup if (res.ok()) { - { // swap in actual index + { WRITE_LOCKER(indexGuard, _indexesLock); - for (size_t i = 0; i < _indexes.size(); i++) { - if (_indexes[i]->id() == buildIdx->id()) { - _indexes[i] = idx; - break; + if (inBackground) { // swap in actual index + for (size_t i = 0; i < _indexes.size(); i++) { + if (_indexes[i]->id() == buildIdx->id()) { + _indexes[i] = idx; + break; + } } + } else { + _indexes.push_back(idx); } } - // we should sync the selectivity estimates + // we should sync the selectivity estimates TODO fix res = engine->settingsManager()->sync(false); if (res.fail()) { // not critical LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js index be5359a265e1..d9418ca1d98b 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -8,7 +8,7 @@ /// /// DISCLAIMER /// -/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// Copyright 2018 ArangoDB GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. @@ -22,10 +22,9 @@ /// See the License for the specific language governing permissions and /// limitations under the License. /// -/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// Copyright holder is ArangoDB GmbH, Cologne, Germany /// -/// @author Dr. Frank Celler, Lucas Dohmen -/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany +/// @author 2018 Simon Grätzer, Dan Larkin-York //////////////////////////////////////////////////////////////////////////////// const jsunity = require("jsunity"); @@ -104,7 +103,7 @@ function backgroundIndexSuite() { } // create the index on the main thread - c.ensureIndex({type: 'hash', fields: ['value'], unique: false}); + c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true}); // wait for insertion tasks to complete waitForTasks(); @@ -168,7 +167,7 @@ function backgroundIndexSuite() { } // create the index on the main thread - c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); + c.ensureIndex({type: 'hash', fields: ['value'], unique: true, inBackground: true }); // wait for insertion tasks to complete waitForTasks(); @@ -229,7 +228,7 @@ function backgroundIndexSuite() { try { // create the index on the main thread - c.ensureIndex({type: 'hash', fields: ['value'], unique: true}); + c.ensureIndex({type: 'hash', fields: ['value'], unique: true, inBackground: true}); fail(); } catch(err) { assertEqual(errors.ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED.code, err.errorNum, err); From 2ebf591c9c0ba589a5d933d7b5633982fcc42ed9 Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Tue, 11 Dec 2018 09:18:11 -0500 Subject: [PATCH 19/31] Fix merge errors. 
--- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 8 +- arangod/RocksDBEngine/RocksDBBuilderIndex.h | 17 +- arangod/RocksDBEngine/RocksDBCollection.cpp | 149 +----------------- arangod/RocksDBEngine/RocksDBEngine.cpp | 6 +- tests/js/common/shell/shell-index-rocksdb.js | 19 +-- 5 files changed, 32 insertions(+), 167 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 08ac97cbcebb..0179be40205b 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -66,7 +66,7 @@ void RocksDBBuilderIndex::toVelocyPack(VPackBuilder& builder, } /// insert index elements into the specified write batch. -Result RocksDBBuilderIndex::insertInternal(transaction::Methods* trx, RocksDBMethods* mthd, +Result RocksDBBuilderIndex::insertInternal(transaction::Methods& trx, RocksDBMethods* mthd, LocalDocumentId const& documentId, arangodb::velocypack::Slice const& slice, OperationMode mode) { @@ -85,7 +85,7 @@ Result RocksDBBuilderIndex::insertInternal(transaction::Methods* trx, RocksDBMet } /// remove index elements and put it in the specified write batch. -Result RocksDBBuilderIndex::removeInternal(transaction::Methods* trx, RocksDBMethods* mthd, +Result RocksDBBuilderIndex::removeInternal(transaction::Methods& trx, RocksDBMethods* mthd, LocalDocumentId const& documentId, arangodb::velocypack::Slice const& slice, OperationMode mode) { @@ -202,7 +202,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function _lockedDocs.insert(docId.id()); } - res = internal->insertInternal(&trx, &batched, docId, + res = internal->insertInternal(trx, &batched, docId, VPackSlice(it->value().data()), Index::OperationMode::normal); if (res.fail()) { @@ -283,7 +283,7 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, it->Seek(bounds.start()); while (it->Valid() && it->key().compare(upper) < 0) { - res = ridx->insertInternal(&trx, &batched, RocksDBKey::documentId(it->key()), + res = ridx->insertInternal(trx, &batched, RocksDBKey::documentId(it->key()), VPackSlice(it->value().data()), Index::OperationMode::normal); if (res.fail()) { diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.h b/arangod/RocksDBEngine/RocksDBBuilderIndex.h index f8f4255a3d21..dbe9bd916e88 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.h +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.h @@ -66,7 +66,7 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex { return _wrapped->memory(); } - int drop() override { + Result drop() override { return _wrapped->drop(); } @@ -88,13 +88,13 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex { } /// insert index elements into the specified write batch. - Result insertInternal(transaction::Methods* trx, RocksDBMethods*, + Result insertInternal(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId, arangodb::velocypack::Slice const&, OperationMode mode) override; /// remove index elements and put it in the specified write batch. 
- Result removeInternal(transaction::Methods* trx, RocksDBMethods*, + Result removeInternal(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId, arangodb::velocypack::Slice const&, OperationMode mode) override; @@ -119,6 +119,17 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex { /// @param unlock called when collection lock can be released Result fillIndexBackground(std::function const& unlock); + virtual IndexIterator* iteratorForCondition( + transaction::Methods* trx, + ManagedDocumentResult* result, + aql::AstNode const* condNode, + aql::Variable const* var, + IndexIteratorOptions const& opts + ) { + TRI_ASSERT(false); + return nullptr; + } + private: std::shared_ptr _wrapped; diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index 4d1b5059992a..563b521a4efe 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -366,7 +366,6 @@ std::shared_ptr RocksDBCollection::createIndex( } } -<<<<<<< HEAD auto buildIdx = std::make_shared(std::static_pointer_cast(idx)); // Step 3. add index to collection entry (for removal after a crash) @@ -414,27 +413,6 @@ std::shared_ptr RocksDBCollection::createIndex( unlockGuard.fire(); res = buildIdx->fillIndexFast(); // will lock again internally } -======= - res = fillIndexes(trx, idx); - - if (!res.ok()) { - THROW_ARANGO_EXCEPTION(res); - } - - // we need to sync the selectivity estimates - res = engine->settingsManager()->sync(false); - - if (res.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " - << res.errorMessage(); - } - - rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); - - if (!s.ok()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " - << s.ToString(); ->>>>>>> devel } // Step 5. 
cleanup @@ -517,7 +495,6 @@ bool RocksDBCollection::dropIndex(TRI_idx_iid_t iid) { // invalid index id or primary index return true; } -<<<<<<< HEAD std::shared_ptr toRemove; { @@ -526,22 +503,6 @@ bool RocksDBCollection::dropIndex(TRI_idx_iid_t iid) { for (std::shared_ptr& idx : _indexes) { if (iid == idx->id()) { toRemove = std::move(idx); -======= - - size_t i = 0; - WRITE_LOCKER(guard, _indexesLock); - for (std::shared_ptr index : _indexes) { - RocksDBIndex* cindex = static_cast(index.get()); - TRI_ASSERT(cindex != nullptr); - - if (iid == cindex->id()) { - auto rv = cindex->drop().errorNumber(); - - if (rv == TRI_ERROR_NO_ERROR) { - // trigger compaction before deleting the object - cindex->cleanup(); - ->>>>>>> devel _indexes.erase(_indexes.begin() + i); break; } @@ -559,8 +520,8 @@ bool RocksDBCollection::dropIndex(TRI_idx_iid_t iid) { RocksDBIndex* cindex = static_cast(toRemove.get()); TRI_ASSERT(cindex != nullptr); - int res = cindex->drop(); - if (res == TRI_ERROR_NO_ERROR) { + Result res = cindex->drop(); + if (res.ok()) { events::DropIndex("", std::to_string(iid), TRI_ERROR_NO_ERROR); // trigger compaction before deleting the object @@ -580,7 +541,7 @@ bool RocksDBCollection::dropIndex(TRI_idx_iid_t iid) { ) ); } - return res == TRI_ERROR_NO_ERROR; + return res.ok(); } std::unique_ptr RocksDBCollection::getAllIterator(transaction::Methods* trx) const { @@ -1329,110 +1290,6 @@ void RocksDBCollection::addIndex(std::shared_ptr idx) { } } -<<<<<<< HEAD -======= -template -static arangodb::Result fillIndex( - transaction::Methods& trx, - RocksDBIndex* ridx, - std::unique_ptr it, - WriteBatchType& batch, - RocksDBCollection* rcol -) { - auto state = RocksDBTransactionState::toState(&trx); - - // fillindex can be non transactional, we just need to clean up - rocksdb::DB* db = rocksutils::globalRocksDB()->GetRootDB(); - TRI_ASSERT(db != nullptr); - - uint64_t numDocsWritten = 0; - // write batch will be reset every x documents - MethodsType batched(state, &batch); - - arangodb::Result res; - auto cb = [&](LocalDocumentId const& documentId, VPackSlice slice) { - if (res.ok()) { - res = ridx->insertInternal(trx, &batched, documentId, slice, - Index::OperationMode::normal); - - if (res.ok()) { - numDocsWritten++; - } - } - }; - - rocksdb::WriteOptions wo; - - bool hasMore = true; - while (hasMore && res.ok()) { - hasMore = it->nextDocument(cb, 250); - - if (TRI_VOC_COL_STATUS_DELETED == it->collection()->status() - || it->collection()->deleted()) { - res = TRI_ERROR_INTERNAL; - } else if (application_features::ApplicationServer::isStopping()) { - res = TRI_ERROR_SHUTTING_DOWN; - } - - if (res.ok()) { - rocksdb::Status s = db->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - break; - } - } - - batch.Clear(); - } - - // we will need to remove index elements created before an error - // occurred, this needs to happen since we are non transactional - if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); - arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, - true, /*useRangeDel*/numDocsWritten > 25000); - if (res2.fail()) { - LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back " - << "index creation: " << res2.errorMessage(); - } - } - - return res; -} - -/// non-transactional: fill index with existing documents -/// from this collection -arangodb::Result RocksDBCollection::fillIndexes( - transaction::Methods& trx, - std::shared_ptr added -) { - 
TRI_ASSERT(trx.state()->collection( - _logicalCollection.id(), AccessMode::Type::EXCLUSIVE - )); - - std::unique_ptr it(new RocksDBAllIndexIterator( - &_logicalCollection, &trx, primaryIndex() - )); - - RocksDBIndex* ridx = static_cast(added.get()); - - if (ridx->unique()) { - // unique index. we need to keep track of all our changes because we need to avoid - // duplicate index keys. must therefore use a WriteBatchWithIndex - rocksdb::WriteBatchWithIndex batch(ridx->columnFamily()->GetComparator(), 32 * 1024 * 1024); - return fillIndex( - trx, ridx, std::move(it), batch, this); - } else { - // non-unique index. all index keys will be unique anyway because they contain the document id - // we can therefore get away with a cheap WriteBatch - rocksdb::WriteBatch batch(32 * 1024 * 1024); - return fillIndex( - trx, ridx, std::move(it), batch, this); - } - return Result(); -} - ->>>>>>> devel Result RocksDBCollection::insertDocument( arangodb::transaction::Methods* trx, LocalDocumentId const& documentId, VPackSlice const& doc, OperationOptions& options) const { diff --git a/arangod/RocksDBEngine/RocksDBEngine.cpp b/arangod/RocksDBEngine/RocksDBEngine.cpp index d8bd69de2a8f..4031e9324767 100644 --- a/arangod/RocksDBEngine/RocksDBEngine.cpp +++ b/arangod/RocksDBEngine/RocksDBEngine.cpp @@ -1289,6 +1289,7 @@ arangodb::Result RocksDBEngine::dropCollection( // delete indexes, RocksDBIndex::drop() has its own check std::vector> vecShardIndex = coll->getIndexes(); TRI_ASSERT(!vecShardIndex.empty()); + for (auto& index : vecShardIndex) { RocksDBIndex* ridx = static_cast(index.get()); res = RocksDBCollectionMeta::deleteIndexEstimate(db, ridx->objectId()); @@ -1296,8 +1297,9 @@ arangodb::Result RocksDBEngine::dropCollection( LOG_TOPIC(WARN, Logger::ENGINES) << "could not delete index estimate: " << res.errorMessage(); } - - int dropRes = index->drop(); + + auto dropRes = index->drop().errorNumber(); + if (dropRes != TRI_ERROR_NO_ERROR) { // We try to remove all indexed values. // If it does not work they cannot be accessed any more and leaked. 
diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js
index d9418ca1d98b..0b7fd79981d2 100644
--- a/tests/js/common/shell/shell-index-rocksdb.js
+++ b/tests/js/common/shell/shell-index-rocksdb.js
@@ -91,9 +91,9 @@ function backgroundIndexSuite() {
     let n = 9;
     for (let i = 0; i < n; ++i) {
       let command = `let c = require("internal").db._collection("${cn}");
-      let x = 10;
-      while(x-- > 0) {
-        let docs = [];
+        let x = 10;
+        while(x-- > 0) {
+          let docs = [];
           for(let i = 0; i < 1000; i++) {
             docs.push({value:i})
           }
@@ -112,11 +112,11 @@ function backgroundIndexSuite() {
       assertEqual(c.count(), 100000);

       // 100 entries of each value [0,999]
-      /*for (let i = 0; i < 1000; i++) {
+      for (let i = 0; i < 1000; i++) {
         let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1",
                                {'@coll': cn, 'val': i}, {count:true});
         assertEqual(cursor.count(), 100);
-      }*/
+      }

       internal.waitForEstimatorSync(); // make sure estimates are consistent
       let indexes = c.getIndexes(true);
@@ -156,12 +156,7 @@ function backgroundIndexSuite() {
           for(let i = 0; i < 1000; i++) {
             docs.push({value: x++})
           }
-          let res = c.save(docs);
-          res.map((obj) => {
-            if (obj.error) {
-              require('internal').print(JSON.stringify(obj));
-            }
-          });
+          c.save(docs);
         }`;
       tasks.register({ name: "UnitTestsIndexInsert" + i, command: command });
     }
@@ -181,7 +176,7 @@ function backgroundIndexSuite() {
                                {'@coll': cn, 'val': i}, {count:true});
         assertEqual(cursor.count(), 1);
       }
-      assertEqual(c.count(), 50000);*/
+      assertEqual(c.count(), 50000);

       let indexes = c.getIndexes(true);
       for (let i of indexes) {

From 2082f781a42fc2e8b7bbb91a71252785fa966b90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Gra=CC=88tzer?=
Date: Wed, 12 Dec 2018 02:48:01 +0100
Subject: [PATCH 20/31] adding some docs

---
 Documentation/Books/Manual/Indexing/Hash.md   |  24 ++++
 .../Books/Manual/Indexing/IndexBasics.md      | 124 ++++++++++++++----
 .../Books/Manual/Indexing/Skiplist.md         |  25 ++++
 3 files changed, 148 insertions(+), 25 deletions(-)

diff --git a/Documentation/Books/Manual/Indexing/Hash.md b/Documentation/Books/Manual/Indexing/Hash.md
index 25753cda7578..dc4763c064bb 100644
--- a/Documentation/Books/Manual/Indexing/Hash.md
+++ b/Documentation/Books/Manual/Indexing/Hash.md
@@ -118,6 +118,30 @@ details, including the index-identifier, is returned.

 @endDocuBlock ensureHashIndexArray

+Creating Hash Index in Background
+---------------------------------
+
+{% hint 'info' %}
+This section only applies to the *rocksdb* storage engine
+{% endhint %}
+
+Creating new indexes is by default done under an exclusive collection lock. This means
+that the collection (or the respective shards) are not available as long as the index
+is created. This "foreground" index creation can be undesirable, if you have to perform it
+on a live system without a dedicated maintenance window.
+
+Indexes can also be created in "background", without using an exclusive lock during creation.
+The collection remains available, and other CRUD operations can run on the collection while the index is created.
+This can be achieved by using the *inBackground* option.
+
+To create a hash index in the background in *arangosh* just specify `inBackground: true`:
+
+```js
+db.collection.ensureIndex({ type: "hash", fields: [ "value" ], inBackground: true });
+```
+
+For more information see [Creating Indexes in Background](IndexBasics.md#creating-indexes-in-background)
+
 Ensure uniqueness of relations in edge collections
 --------------------------------------------------

diff --git a/Documentation/Books/Manual/Indexing/IndexBasics.md b/Documentation/Books/Manual/Indexing/IndexBasics.md
index 76ec323d6a98..13ecf7d04aff 100644
--- a/Documentation/Books/Manual/Indexing/IndexBasics.md
+++ b/Documentation/Books/Manual/Indexing/IndexBasics.md
@@ -22,6 +22,14 @@ are covered by an edge collection's edge index automatically.

 Using the system attribute `_id` in user-defined indexes is not possible, but
 indexing `_key`, `_rev`, `_from`, and `_to` is.

+Creating new indexes is usually done under an exclusive collection lock. The collection is not
+available as long as the index is created. This "foreground" index creation can be undesirable,
+if you have to perform it on a live system without a dedicated maintenance window.
+
+For potentially long-running index creation operations the _rocksdb_ storage engine also supports
+creating indexes in "background". The collection remains available during the index creation;
+see the section [Creating Indexes in Background](#creating-indexes-in-background) for more information.
+
 ArangoDB provides the following index types:

 Primary Index
@@ -243,31 +251,6 @@ Skiplist indexes support [indexing array values](#indexing-array-values) if the
 attribute name is extended with a [\*]`.

-Persistent Index
-----------------
-
-The persistent index is a sorted index with persistence. The index entries are written to
-disk when documents are stored or updated. That means the index entries do not need to be
-rebuilt from the collection data when the server is restarted or the indexed collection
-is initially loaded. Thus using persistent indexes may reduce collection loading times.
-
-The persistent index type can be used for secondary indexes at the moment. That means the
-persistent index currently cannot be made the only index for a collection, because there
-will always be the in-memory primary index for the collection in addition, and potentially
-more indexes (such as the edges index for an edge collection).
-
-The index implementation is using the RocksDB engine, and it provides logarithmic complexity
-for insert, update, and remove operations. As the persistent index is not an in-memory
-index, it does not store pointers into the primary index as all the in-memory indexes do,
-but instead it stores a document's primary key. To retrieve a document via a persistent
-index via an index value lookup, there will therefore be an additional O(1) lookup into
-the primary index to fetch the actual document.
-
-As the persistent index is sorted, it can be used for point lookups, range queries and sorting
-operations, but only if either all index attributes are provided in a query, or if a leftmost
-prefix of the index attributes is specified.
-
-
 Geo Index
 ---------
@@ -307,6 +290,37 @@ minimum length will be included in the index.

 The fulltext index is used via dedicated functions in AQL or the simple queries, but
 will not be enabled for other types of queries or conditions.
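+
+For example, a fulltext lookup in AQL might look like the following sketch (it
+assumes a fulltext index on the `text` attribute of a collection `myCollection`):
+
+```js
+// find documents whose `text` attribute contains a word starting with "rock"
+db._query(`FOR doc IN FULLTEXT(myCollection, "text", "prefix:rock") RETURN doc`);
+```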
+
+Persistent Index
+----------------
+
+{% hint 'warning' %}
+This index should not be used anymore; instead use the rocksdb storage engine
+with either the *skiplist* or *hash* index.
+{% endhint %}
+
+The persistent index is a sorted index with persistence. The index entries are written to
+disk when documents are stored or updated. That means the index entries do not need to be
+rebuilt from the collection data when the server is restarted or the indexed collection
+is initially loaded. Thus using persistent indexes may reduce collection loading times.
+
+The persistent index type can be used for secondary indexes at the moment. That means the
+persistent index currently cannot be made the only index for a collection, because there
+will always be the in-memory primary index for the collection in addition, and potentially
+more indexes (such as the edges index for an edge collection).
+
+The index implementation is using the RocksDB engine, and it provides logarithmic complexity
+for insert, update, and remove operations. As the persistent index is not an in-memory
+index, it does not store pointers into the primary index as all the in-memory indexes do,
+but instead it stores a document's primary key. To retrieve a document via a persistent
+index via an index value lookup, there will therefore be an additional O(1) lookup into
+the primary index to fetch the actual document.
+
+As the persistent index is sorted, it can be used for point lookups, range queries and sorting
+operations, but only if either all index attributes are provided in a query, or if a leftmost
+prefix of the index attributes is specified.
+
+
 Indexing attributes and sub-attributes
 --------------------------------------
@@ -534,3 +548,63 @@ optimizer may prefer the default edge index over vertex centric indexes based
 on the costs it estimates, even if a vertex centric index might in fact be faster.
 Vertex centric indexes are more likely to be chosen for highly connected graphs
 and with RocksDB storage engine.
+
+
+Creating Indexes in Background
+------------------------------
+
+{% hint 'info' %}
+This section only applies to the *rocksdb* storage engine
+{% endhint %}
+
+Creating new indexes is by default done under an exclusive collection lock. This means
+that the collection (or the respective shards) are not available as long as the index
+is created. This "foreground" index creation can be undesirable, if you have to perform it
+on a live system without a dedicated maintenance window.
+
+Indexes can also be created in "background", without using an exclusive lock during creation.
+The collection remains available, and other CRUD operations can run on the collection while the index is created.
+This can be achieved by using the *inBackground* option.
+
+To create indexes in the background in *arangosh* just specify `inBackground: true`,
+like in the following examples:
+
+```js
+// create the hash index in the background
+db.collection.ensureIndex({ type: "hash", fields: [ "value" ], unique: false, inBackground: true });
+db.collection.ensureIndex({ type: "hash", fields: [ "email" ], unique: true, inBackground: true });
+
+// skiplist indexes also work, of course
+db.collection.ensureIndex({ type: "skiplist", fields: ["abc", "cdef"], unique: true, inBackground: true });
+db.collection.ensureIndex({ type: "skiplist", fields: ["abc", "cdef"], sparse: true, inBackground: true });
+
+// geo indexes are also supported
+db.collection.ensureIndex({ type: "geo", fields: [ "latitude", "longitude"], inBackground: true });
+
+// and so are fulltext indexes
+db.collection.ensureIndex({ type: "fulltext", fields: [ "text" ], minLength: 4, inBackground: true });
+```
+
+### Behaviour
+
+Indexes that are still in the build process will not be visible via the ArangoDB API. Nevertheless it is not
+possible to create the same index twice via the *ensureIndex* API. AQL queries will not use these indexes either
+until the indexes report back as finished. Note that the initial *ensureIndex* call or HTTP request will
+block until the index is completely ready. Existing single-threaded client programs can safely specify the
+*inBackground* option as *true* and continue to work as before.
+
+{% hint 'info' %}
+While you are building an index in the background, you cannot rename or drop the collection.
+These operations will block until the index creation is finished.
+{% endhint %}
+
+An interrupted index build (e.g. due to a server crash) will remove the partially built index.
+In an ArangoDB cluster the index might then be automatically recreated on affected shards.
+
+### Performance
+
+The background index creation might be slower than the "foreground" index creation and require more RAM.
+Under a write-heavy load (specifically many remove, update or replace operations),
+the background index creation needs to keep a list of removed documents in RAM. This might become unsustainable
+if this list grows to tens of millions of entries.
+
+Building an index is always a write-heavy operation (internally), so it is always a good idea to build indexes
+during times with less load.
diff --git a/Documentation/Books/Manual/Indexing/Skiplist.md b/Documentation/Books/Manual/Indexing/Skiplist.md
index a7c78f59d44a..72f2a2537932 100644
--- a/Documentation/Books/Manual/Indexing/Skiplist.md
+++ b/Documentation/Books/Manual/Indexing/Skiplist.md
@@ -185,3 +185,28 @@ and
 { "a" : { "c" : 1, "b" : 1 } }
 ```
 will match.
+
+
+Creating Skiplist Index in Background
+-------------------------------------
+
+{% hint 'info' %}
+This section only applies to the *rocksdb* storage engine
+{% endhint %}
+
+Creating new indexes is by default done under an exclusive collection lock. This means
+that the collection (or the respective shards) are not available as long as the index
+is created. This "foreground" index creation can be undesirable, if you have to perform it
+on a live system without a dedicated maintenance window.
+
+Indexes can also be created in "background", without using an exclusive lock during creation.
+The collection remains available, and other CRUD operations can run on the collection while the index is created.
+This can be achieved by using the *inBackground* option.
+ +To create a Skiplist index in the background in *arangosh* just specify `inBackground: true`: + +```js +db.collection.ensureIndex({ type: "skiplist", fields: [ "value" ], inBackground: true }); +``` + +For more information see [Creating Indexes in Background](IndexBasics.md#creating-indexes-in-background) From 98ddd167c829d93f7bf240e524ec9dfcbfaf5e9d Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Wed, 12 Dec 2018 13:21:42 -0500 Subject: [PATCH 21/31] Some small changes. --- arangod/Indexes/Index.cpp | 28 +++++++++++-------- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 12 +++++--- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/arangod/Indexes/Index.cpp b/arangod/Indexes/Index.cpp index 7d935ee1e04a..193e51fa451d 100644 --- a/arangod/Indexes/Index.cpp +++ b/arangod/Indexes/Index.cpp @@ -133,6 +133,10 @@ void markAsNonNull(arangodb::aql::AstNode const* op, arangodb::aql::AstNode cons } } +bool typeMatch(char const* type, size_t len, char const* expected) { + return (len == ::strlen(expected)) && (::memcmp(type, expected, len) == 0); +} + } // namespace // If the Index is on a coordinator instance the index may not access the @@ -220,41 +224,41 @@ void Index::validateFields(VPackSlice const& slice) { /// @brief return the index type based on a type name Index::IndexType Index::type(char const* type, size_t len) { - if (7 == len && ::memcmp(type, "primary", len) == 0) { + if (::typeMatch(type, len, "primary")) { return TRI_IDX_TYPE_PRIMARY_INDEX; } - if (4 == len && ::memcmp(type, "edge", len) == 0) { + if (::typeMatch(type, len, "edge")) { return TRI_IDX_TYPE_EDGE_INDEX; } - if (4 == len && ::memcmp(type, "hash", len) == 0) { + if (::typeMatch(type, len, "hash")) { return TRI_IDX_TYPE_HASH_INDEX; } - if (8 == len && ::memcmp(type, "skiplist", len) == 0) { + if (::typeMatch(type, len, "skiplist")) { return TRI_IDX_TYPE_SKIPLIST_INDEX; } - if ((10 == len && ::memcmp(type, "persistent", len) == 0) || - (7 == len && ::memcmp(type, "rocksdb", len) == 0)) { + if (::typeMatch(type, len, "persistent") || + ::typeMatch(type, len, "rocksdb")) { return TRI_IDX_TYPE_PERSISTENT_INDEX; } - if (8 == len && ::memcmp(type, "fulltext", len) == 0) { + if (::typeMatch(type, len, "fulltext")) { return TRI_IDX_TYPE_FULLTEXT_INDEX; } - if (3 == len && ::memcmp(type, "geo", len) == 0) { + if (::typeMatch(type, len, "geo")) { return TRI_IDX_TYPE_GEO_INDEX; } - if (4 == len && ::memcmp(type, "geo1", len) == 0) { + if (::typeMatch(type, len, "geo1")) { return TRI_IDX_TYPE_GEO1_INDEX; } - if (4 == len && ::memcmp(type, "geo2", len) == 0) { + if (::typeMatch(type, len, "geo2")) { return TRI_IDX_TYPE_GEO2_INDEX; } #ifdef USE_IRESEARCH std::string const& tmp = arangodb::iresearch::DATA_SOURCE_TYPE.name(); - if (tmp.size() == len && ::memcmp(type, tmp.c_str(), len) == 0) { + if (::typeMatch(type, len, tmp.c_str())) { return TRI_IDX_TYPE_IRESEARCH_LINK; } #endif - if (8 == len && ::strcmp(type, "noaccess") == 0) { + if (::typeMatch(type, len, "noaccess")) { return TRI_IDX_TYPE_NO_ACCESS_INDEX; } diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 0179be40205b..4fe83d9c76fa 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -75,13 +75,15 @@ Result RocksDBBuilderIndex::insertInternal(transaction::Methods& trx, RocksDBMet r.is(TRI_ERROR_LOCK_TIMEOUT) || r.is(TRI_ERROR_DEADLOCK) || r.is(TRI_ERROR_ARANGO_CONFLICT)) { + // these are expected errors; store in builder and suppress 
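+    // (compare_exchange_strong succeeds only for the first thread to observe
+    // a failure here, so only the first error result is stored below)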
bool expected = false; - if (!r.ok() && !_hasError.compare_exchange_strong(expected, true, std::memory_order_release)) { + if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) { std::lock_guard guard(_errorMutex); _errorResult = r; } + return Result(); } - return Result(); + return r; } /// remove index elements and put it in the specified write batch. @@ -105,13 +107,15 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods& trx, RocksDBMet r.is(TRI_ERROR_LOCK_TIMEOUT) || r.is(TRI_ERROR_DEADLOCK) || r.is(TRI_ERROR_ARANGO_CONFLICT)) { + // these are expected errors; store in builder and suppress bool expected = false; - if (!r.ok() && !_hasError.compare_exchange_strong(expected, true, std::memory_order_release)) { + if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) { std::lock_guard guard(_errorMutex); _errorResult = r; } + return Result(); } - return Result(); + return r; } // Background index filler task From ee51fa53ced178ea86b96738cd42dc8fe04665b9 Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Wed, 12 Dec 2018 17:04:32 -0500 Subject: [PATCH 22/31] Fixed removal bug and added test. --- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 15 ++- tests/js/common/shell/shell-index-rocksdb.js | 93 ++++++++++++++++--- 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 4fe83d9c76fa..cf2cc7a89526 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -71,10 +71,7 @@ Result RocksDBBuilderIndex::insertInternal(transaction::Methods& trx, RocksDBMet arangodb::velocypack::Slice const& slice, OperationMode mode) { Result r = _wrapped->insertInternal(trx, mthd, documentId, slice, mode); - if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED) || - r.is(TRI_ERROR_LOCK_TIMEOUT) || - r.is(TRI_ERROR_DEADLOCK) || - r.is(TRI_ERROR_ARANGO_CONFLICT)) { + if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) { // these are expected errors; store in builder and suppress bool expected = false; if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) { @@ -103,10 +100,7 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods& trx, RocksDBMet } Result r = _wrapped->removeInternal(trx, mthd, documentId, slice, mode); - if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED) || - r.is(TRI_ERROR_LOCK_TIMEOUT) || - r.is(TRI_ERROR_DEADLOCK) || - r.is(TRI_ERROR_ARANGO_CONFLICT)) { + if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) { // these are expected errors; store in builder and suppress bool expected = false; if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) { @@ -198,11 +192,14 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function LocalDocumentId docId = RocksDBKey::documentId(it->key()); { + // must acquire both locks here to prevent interleaved operations std::lock_guard guard(_removedDocsMutex); + std::lock_guard guard2(_lockedDocsMutex); if (_removedDocs.find(docId.id()) != _removedDocs.end()) { + _removedDocs.erase(_removedDocs.find(docId.id())); + it->Next(); continue; } - std::lock_guard guard2(_lockedDocsMutex); _lockedDocs.insert(docId.id()); } diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js index 0b7fd79981d2..a4884cb48e25 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -90,7 +90,7 @@ function 
backgroundIndexSuite() { // lets insert the rest via tasks let n = 9; for (let i = 0; i < n; ++i) { - let command = `let c = require("internal").db._collection("${cn}"); + let command = `const c = require("internal").db._collection("${cn}"); let x = 10; while(x-- > 0) { let docs = []; @@ -108,11 +108,9 @@ function backgroundIndexSuite() { // wait for insertion tasks to complete waitForTasks(); - // sanity check + // sanity checks assertEqual(c.count(), 100000); - - // 100 entries of each value [0,999] - for (let i = 0; i < 1000; i++) { + for (let i = 0; i < 1000; i++) { // 100 entries of each value [0,999] let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", {'@coll': cn, 'val': i}, {count:true}); assertEqual(cursor.count(), 100); @@ -133,8 +131,6 @@ function backgroundIndexSuite() { } }, - // if we run this in isolation, it passes, but the count is off otherwise. - // the slow part of the test is the individual sanity checks testInsertParallelUnique: function () { let c = require("internal").db._collection(cn); // first lets add some initial documents @@ -149,7 +145,7 @@ function backgroundIndexSuite() { // lets insert the rest via tasks for (let i = 1; i < 5; ++i) { - let command = `let c = require("internal").db._collection("${cn}"); + let command = `const c = require("internal").db._collection("${cn}"); let x = ${i} * 10000; while(x < ${i + 1} * 10000) { let docs = []; @@ -168,15 +164,12 @@ function backgroundIndexSuite() { waitForTasks(); // sanity checks - const scanDocs = db._query("FOR doc IN @@coll RETURN doc", - {'@coll': cn}, {count:true, optimizer: {rules: ["-use-indexes"]}}).toArray(); - assertEqual(scanDocs.length, 50000); + assertEqual(c.count(), 50000); for (let i = 0; i < 50000; i++) { const cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", {'@coll': cn, 'val': i}, {count:true}); assertEqual(cursor.count(), 1); } - assertEqual(c.count(), 50000); let indexes = c.getIndexes(true); for (let i of indexes) { @@ -206,7 +199,7 @@ function backgroundIndexSuite() { // lets insert the rest via tasks for (let i = 1; i < 5; ++i) { - let command = `let c = require("internal").db._collection("${cn}"); + let command = `const c = require("internal").db._collection("${cn}"); let x = ${i} * 10000; while(x < ${i + 1} * 10000) { let docs = []; @@ -245,7 +238,79 @@ function backgroundIndexSuite() { fail(); } } - } + }, + + testRemoveParallel: function () { + let c = require("internal").db._collection(cn); + // first lets add some initial documents + let x = 0; + while(x < 100000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({_key: "test_" + x, value: x++}); + } + c.save(docs); + } + + assertEqual(c.count(), 100000); + + // lets remove half via tasks + for (let i = 0; i < 10; ++i) { + let command = `const c = require("internal").db._collection("${cn}"); + if (!c) { + throw new Error('could not find collection'); + } + let x = ${i} * 10000; + while(x < ${i} * 10000 + 5000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push("test_" + x++); + } + let removed = false; + while (!removed) { + const res = c.remove(docs); + removed = (res.filter(r => !r.error).length === 0); + } + }`; + tasks.register({ name: "UnitTestsIndexRemove" + i, command: command }); + } + + // create the index on the main thread + c.ensureIndex({type: 'hash', fields: ['value'], inBackground: true }); + + // wait for insertion tasks to complete + waitForTasks(); + + // sanity checks + assertEqual(c.count(), 50000); + for (let i = 0; i < 10; i++) { 
// check for remaining docs via index + for (let x = i * 10000 + 5000; x < (i+1) * 10000; x++) { + const cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", + {'@coll': cn, 'val': x}, {count:true}); + assertEqual(cursor.count(), 1); + } + } + for (let i = 0; i < 10; i++) { // check for removed docs via index + for (let x = i * 10000; x < i * 10000 + 5000; x++) { + const cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", + {'@coll': cn, 'val': x}, {count:true}); + assertEqual(cursor.count(), 0); + } + } + + let indexes = c.getIndexes(true); + for (let i of indexes) { + switch (i.type) { + case 'primary': + break; + case 'hash': + break; + default: + fail(); + } + } + }, + }; } From 422e0ce34f0ac507a355de0d942f8ada45c3dc03 Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Mon, 17 Dec 2018 18:48:57 -0500 Subject: [PATCH 23/31] Added update test. --- tests/js/common/shell/shell-index-rocksdb.js | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/js/common/shell/shell-index-rocksdb.js b/tests/js/common/shell/shell-index-rocksdb.js index a4884cb48e25..778adfe5b15c 100644 --- a/tests/js/common/shell/shell-index-rocksdb.js +++ b/tests/js/common/shell/shell-index-rocksdb.js @@ -311,6 +311,72 @@ function backgroundIndexSuite() { } }, + testUpdateParallel: function () { + let c = require("internal").db._collection(cn); + // first lets add some initial documents + let x = 0; + while(x < 100000) { + let docs = []; + for(let i = 0; i < 1000; i++) { + docs.push({_key: "test_" + x, value: x++}); + } + c.save(docs); + } + + assertEqual(c.count(), 100000); + + // lets update all via tasks + for (let i = 0; i < 10; ++i) { + let command = `const c = require("internal").db._collection("${cn}"); + if (!c) { + throw new Error('could not find collection'); + } + let x = ${i * 10000}; + while(x < ${(i+1) * 10000}) { + let updated = false; + const current = x++; + const key = "test_" + current + const doc = {value: current + 100000}; + while (!updated) { + try { + const res = c.update(key, doc); + updated = true; + } catch (err) {} + } + }`; + tasks.register({ name: "UnitTestsIndexUpdate" + i, command: command }); + } + + // wait for insertion tasks to complete + waitForTasks(); + + // create the index on the main thread + c.ensureIndex({type: 'skiplist', fields: ['value'], inBackground: true }); + + // sanity checks + assertEqual(c.count(), 100000); + // check for new entries via index + const newCursor = db._query("FOR doc IN @@coll FILTER doc.value >= @val RETURN 1", + {'@coll': cn, 'val': 100000}, {count:true}); + assertEqual(newCursor.count(), 100000); + // check for old entries via index + const oldCursor = db._query("FOR doc IN @@coll FILTER doc.value < @val RETURN 1", + {'@coll': cn, 'val': 100000}, {count:true}); + assertEqual(oldCursor.count(), 0); + + let indexes = c.getIndexes(true); + for (let i of indexes) { + switch (i.type) { + case 'primary': + break; + case 'skiplist': + break; + default: + fail(); + } + } + }, + }; } From a34f928e0463741b58ba35864cb54098bf00d82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Tue, 18 Dec 2018 17:32:15 +0100 Subject: [PATCH 24/31] forgot to comment out docs --- Documentation/Books/Manual/Indexing/Hash.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/Books/Manual/Indexing/Hash.md b/Documentation/Books/Manual/Indexing/Hash.md index dc4763c064bb..9f6f20598f91 100644 --- a/Documentation/Books/Manual/Indexing/Hash.md +++ 
b/Documentation/Books/Manual/Indexing/Hash.md @@ -118,7 +118,7 @@ details, including the index-identifier, is returned. @endDocuBlock ensureHashIndexArray -Creating Hash Index in Background + For more information see [Creating Indexes in Background](IndexBasics.md#creating-indexes-in-background) From 021453c62d28217bff8273e9351ef111be7374f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Wed, 19 Dec 2018 01:53:01 +0100 Subject: [PATCH 25/31] fixing some code --- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 14 +++++++++++--- arangod/RocksDBEngine/RocksDBBuilderIndex.h | 2 +- arangod/RocksDBEngine/RocksDBCollection.cpp | 5 ++--- arangod/RocksDBEngine/RocksDBIndexFactory.cpp | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index cd2ead670893..156d721a3b8f 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -88,6 +88,7 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods& trx, RocksDBMet LocalDocumentId const& documentId, arangodb::velocypack::Slice const& slice, OperationMode mode) { + TRI_ASSERT(false); // not enabled { std::lock_guard guard(_removedDocsMutex); _removedDocs.insert(documentId.id()); @@ -331,24 +332,31 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, /// non-transactional: fill index with existing documents /// from this collection arangodb::Result RocksDBBuilderIndex::fillIndexFast() { + Result res; RocksDBCollection* coll = static_cast(_collection.getPhysical()); SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()), _collection, AccessMode::Type::EXCLUSIVE); - Result res = trx.begin(); + res = trx.begin(); if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); } + RocksDBIndex* internal = _wrapped.get(); + TRI_ASSERT(internal != nullptr); if (this->unique()) { // unique index. we need to keep track of all our changes because we need to avoid // duplicate index keys. must therefore use a WriteBatchWithIndex rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); - return ::fillIndexFast(trx, this, coll, batch); + res = ::fillIndexFast(trx, internal, coll, batch); } else { // non-unique index. 
all index keys will be unique anyway because they contain the document id // we can therefore get away with a cheap WriteBatch rocksdb::WriteBatch batch(32 * 1024 * 1024); - return ::fillIndexFast(trx, this, coll, batch); + res = ::fillIndexFast(trx, internal, coll, batch); } + if (res.ok()) { + res = trx.commit(); // required to commit selectivity estimates + } + return res; } diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.h b/arangod/RocksDBEngine/RocksDBBuilderIndex.h index dbe9bd916e88..ea638d38814c 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.h +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.h @@ -125,7 +125,7 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex { aql::AstNode const* condNode, aql::Variable const* var, IndexIteratorOptions const& opts - ) { + ) override { TRI_ASSERT(false); return nullptr; } diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index e79078a4201e..b879f5556436 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -327,7 +327,7 @@ std::shared_ptr RocksDBCollection::createIndex( auto releaseGuard = scopeGuard([&] { vocbase.releaseCollection(&_logicalCollection); }); - res = lockWrite(); // MOVE ?!!! + res = lockWrite(); if (res.fail()) { THROW_ARANGO_EXCEPTION(res); } @@ -413,6 +413,7 @@ std::shared_ptr RocksDBCollection::createIndex( unlockGuard.fire(); }); } else { + _indexes.push_back(idx); unlockGuard.fire(); res = buildIdx->fillIndexFast(); // will lock again internally } @@ -429,8 +430,6 @@ std::shared_ptr RocksDBCollection::createIndex( break; } } - } else { - _indexes.push_back(idx); } } diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp index 1bd64eb727c5..126deda5748d 100644 --- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp +++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp @@ -796,7 +796,7 @@ RocksDBIndexFactory::RocksDBIndexFactory() { static const Geo2IndexFactory geo2IndexFactory; static const SecondaryIndexFactory hashIndexFactory; - static const SecondaryIndexFactory persistentIndexFactory; static const SecondaryIndexFactory skiplistIndexFactory; From f61b5e938927d38fd36299113e3487831f472160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Wed, 19 Dec 2018 10:56:15 +0100 Subject: [PATCH 26/31] fix jslint --- tests/js/common/shell/shell-index-rocksdb-disabled.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/js/common/shell/shell-index-rocksdb-disabled.js b/tests/js/common/shell/shell-index-rocksdb-disabled.js index 778adfe5b15c..aec8935296f6 100644 --- a/tests/js/common/shell/shell-index-rocksdb-disabled.js +++ b/tests/js/common/shell/shell-index-rocksdb-disabled.js @@ -37,8 +37,8 @@ function backgroundIndexSuite() { const cn = "UnitTestsCollectionIdx"; const tasks = require("@arangodb/tasks"); const tasksCompleted = () => { - return 0 == tasks.get().filter((task) => { - return (task.id.match(/^UnitTest/) || task.name.match(/^UnitTest/)) + return 0 === tasks.get().filter((task) => { + return (task.id.match(/^UnitTest/) || task.name.match(/^UnitTest/)); }).length; }; const waitForTasks = () => { @@ -46,7 +46,7 @@ function backgroundIndexSuite() { const start = time(); while (!tasksCompleted()) { if (time() - start > 300) { // wait for 5 minutes maximum - fail("Timeout creating documents after 5 minutes: " + c.count()); + fail("Timeout after 5 minutes"); } require("internal").wait(0.5, false); } 
From b013b153366ac4eb01a70d6f602072d508f13f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Wed, 19 Dec 2018 14:44:06 +0100 Subject: [PATCH 27/31] remove some code --- arangod/RestHandler/RestShutdownHandler.cpp | 3 +- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 21 ++++-- arangod/RocksDBEngine/RocksDBCollection.cpp | 28 ++----- arangod/RocksDBEngine/RocksDBIndexFactory.cpp | 75 ------------------- arangod/RocksDBEngine/RocksDBMethods.h | 3 - .../RocksDBEngine/RocksDBTransactionState.h | 1 - 6 files changed, 25 insertions(+), 106 deletions(-) diff --git a/arangod/RestHandler/RestShutdownHandler.cpp b/arangod/RestHandler/RestShutdownHandler.cpp index d60dd59a4e36..607539e2df9e 100644 --- a/arangod/RestHandler/RestShutdownHandler.cpp +++ b/arangod/RestHandler/RestShutdownHandler.cpp @@ -100,10 +100,11 @@ RestStatus RestShutdownHandler::execute() { } catch (...) { // Ignore the error } + auto self = shared_from_this(); rest::Scheduler* scheduler = SchedulerFeature::SCHEDULER; // don't block the response for workers waiting on this callback // this should allow workers to go into the IDLE state - scheduler->queue(RequestPriority::HIGH, []{ + scheduler->queue(RequestPriority::HIGH, [self] { // Give the server 2 seconds to send the reply: std::this_thread::sleep_for(std::chrono::seconds(2)); // Go down: diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 156d721a3b8f..e77ddc67ec09 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -70,6 +70,7 @@ Result RocksDBBuilderIndex::insertInternal(transaction::Methods& trx, RocksDBMet LocalDocumentId const& documentId, arangodb::velocypack::Slice const& slice, OperationMode mode) { + TRI_ASSERT(false); // not enabled Result r = _wrapped->insertInternal(trx, mthd, documentId, slice, mode); if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) { // these are expected errors; store in builder and suppress @@ -255,6 +256,7 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, RocksDBCollection* coll, WriteBatchType& batch) { auto state = RocksDBTransactionState::toState(&trx); + auto methds = RocksDBTransactionState::toMethods(&trx); arangodb::Result res; // fillindex can be non transactional, we just need to clean up @@ -273,19 +275,25 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, rocksdb::WriteOptions wo; wo.disableWAL = false; // TODO set to true eventually - // we iterator without a snapshot + const rocksdb::Snapshot* snap = rootDB->GetSnapshot(); + auto snapGuard = scopeGuard([&] { + rootDB->ReleaseSnapshot(snap); + }); + rocksdb::ReadOptions ro; + ro.snapshot = snap; ro.prefix_same_as_start = true; ro.iterate_upper_bound = &upper; ro.verify_checksums = false; ro.fill_cache = false; - rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily(); - std::unique_ptr it(rootDB->NewIterator(ro, docCF)); + rocksdb::ColumnFamilyHandle* docCF = RocksDBColumnFamily::documents(); + std::unique_ptr it = methds->NewIterator(ro, docCF); it->Seek(bounds.start()); - while (it->Valid() && it->key().compare(upper) < 0) { - + while (it->Valid()) { + TRI_ASSERT(it->key().compare(upper) < 0); + res = ridx->insertInternal(trx, &batched, RocksDBKey::documentId(it->key()), VPackSlice(it->value().data()), Index::OperationMode::normal); @@ -345,9 +353,10 @@ arangodb::Result RocksDBBuilderIndex::fillIndexFast() { RocksDBIndex* internal = _wrapped.get(); TRI_ASSERT(internal != nullptr); if 
(this->unique()) { + const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator(); // unique index. we need to keep track of all our changes because we need to avoid // duplicate index keys. must therefore use a WriteBatchWithIndex - rocksdb::WriteBatchWithIndex batch(_cf->GetComparator(), 32 * 1024 * 1024); + rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024); res = ::fillIndexFast(trx, internal, coll, batch); } else { // non-unique index. all index keys will be unique anyway because they contain the document id diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index b879f5556436..b4c3012d7634 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -276,7 +276,6 @@ void RocksDBCollection::prepareIndexes( ); } - bool droppedIndex = false; for (std::shared_ptr& idx : indexes) { addIndex(std::move(idx)); } @@ -285,27 +284,16 @@ void RocksDBCollection::prepareIndexes( (TRI_COL_TYPE_EDGE == _logicalCollection.type() && (_indexes[1]->type() != Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX || _indexes[2]->type() != Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX))) { - std::string msg = "got invalid indexes for collection '" - + _logicalCollection.name() + "'"; - LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << msg; - + + std::string msg = "got invalid indexes for collection '" + + _logicalCollection.name() + "'"; + LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << msg; #ifdef ARANGODB_ENABLE_MAINTAINER_MODE - for (auto it : _indexes) { - LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << "- " << it.get(); - } -#endif - - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, msg); + for (auto it : _indexes) { + LOG_TOPIC(ERR, arangodb::Logger::ENGINES) << "- " << it.get(); } - - if (droppedIndex) { - auto builder = _logicalCollection.toVelocyPackIgnore({"path", "statusString"}, true, true); - // log this event in the WAL and in the collection meta-data - auto engine = static_cast(EngineSelectorFeature::ENGINE); - engine->writeCreateCollectionMarker(_logicalCollection.vocbase().id(), - _logicalCollection.id(), - builder.slice(), - RocksDBLogValue::Empty()); +#endif + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, msg); } TRI_ASSERT(!_indexes.empty()); diff --git a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp index 126deda5748d..5a68f38b2c99 100644 --- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp +++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp @@ -662,81 +662,6 @@ struct SecondaryIndexFactory: public DefaultIndexFactory { return EnhanceJsonIndexVPack(definition, normalized, isCreation); } }; -/* -struct PersistentIndexFactory: public DefaultIndexFactory { - PersistentIndexFactory(std::string const& type): DefaultIndexFactory(type) {} - - virtual arangodb::Result instantiate( - std::shared_ptr& index, - arangodb::LogicalCollection& collection, - arangodb::velocypack::Slice const& definition, - TRI_idx_iid_t id, - bool isClusterConstructor - ) const override { - index = std::make_shared( - id, collection, definition - ); - - return arangodb::Result(); - } - - virtual arangodb::Result normalize( - arangodb::velocypack::Builder& normalized, - arangodb::velocypack::Slice definition, - bool isCreation - ) const override { - TRI_ASSERT(normalized.isOpenObject()); - normalized.add( - arangodb::StaticStrings::IndexType, - arangodb::velocypack::Value(arangodb::Index::oldtypeName( - arangodb::Index::TRI_IDX_TYPE_PERSISTENT_INDEX - )) - ); - - if 
(isCreation - && !ServerState::instance()->isCoordinator() - && !definition.hasKey("objectId")) { - normalized.add( - "objectId", - arangodb::velocypack::Value(std::to_string(TRI_NewTickServer())) - ); - } - - return EnhanceJsonIndexVPack(definition, normalized, isCreation); - } -}; - -struct SkiplistIndexFactory: public DefaultIndexFactory { - SkiplistIndexFactory(std::string const& type): DefaultIndexFactory(type) {} - - virtual arangodb::Result instantiate(std::shared_ptr& index, - arangodb::LogicalCollection& collection, - arangodb::velocypack::Slice const& definition, - TRI_idx_iid_t id, - bool isClusterConstructor - ) const override { - index = std::make_shared(id, collection, definition); - return arangodb::Result(); - } - - virtual arangodb::Result normalize(arangodb::velocypack::Builder& normalized, - arangodb::velocypack::Slice definition, - bool isCreation) const override { - TRI_ASSERT(normalized.isOpenObject()); - normalized.add(arangodb::StaticStrings::IndexType, - arangodb::velocypack::Value(arangodb::Index::oldtypeName(arangodb::Index::TRI_IDX_TYPE_SKIPLIST_INDEX))); - - if (isCreation - && !ServerState::instance()->isCoordinator() - && !definition.hasKey("objectId")) { - normalized.add("objectId", - arangodb::velocypack::Value(std::to_string(TRI_NewTickServer()))); - } - - return EnhanceJsonIndexVPack(definition, normalized, isCreation); - } -}; -*/ struct PrimaryIndexFactory: public DefaultIndexFactory { PrimaryIndexFactory() diff --git a/arangod/RocksDBEngine/RocksDBMethods.h b/arangod/RocksDBEngine/RocksDBMethods.h index 188283431721..e00ab2ae81ef 100644 --- a/arangod/RocksDBEngine/RocksDBMethods.h +++ b/arangod/RocksDBEngine/RocksDBMethods.h @@ -74,9 +74,6 @@ class RocksDBMethods { /// @brief current sequence number rocksdb::SequenceNumber sequenceNumber(); - - /// @brief read options for use with iterators - rocksdb::ReadOptions readOptions(); /// @brief read options for use with iterators rocksdb::ReadOptions iteratorReadOptions(); diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.h b/arangod/RocksDBEngine/RocksDBTransactionState.h index 82e72b656b23..49ab743e5dc2 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.h +++ b/arangod/RocksDBEngine/RocksDBTransactionState.h @@ -74,7 +74,6 @@ class RocksDBTransactionState final : public TransactionState { friend class RocksDBTrxUntrackedMethods; friend class RocksDBBatchedMethods; friend class RocksDBBatchedWithIndexMethods; - friend class RocksDBSubTrxMethods; public: RocksDBTransactionState( From 82f4b6e382bed825b59f3dbda9f526a963257cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 20 Dec 2018 15:12:39 +0100 Subject: [PATCH 28/31] fix reporting of unfinished indexes --- arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 105 ++++++++++++------ arangod/RocksDBEngine/RocksDBCollection.cpp | 12 +- .../RocksDBTransactionCollection.h | 21 +++- 3 files changed, 91 insertions(+), 47 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index e77ddc67ec09..5bb1e3463b73 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -26,6 +26,7 @@ #include "RocksDBEngine/RocksDBCollection.h" #include "RocksDBEngine/RocksDBCommon.h" #include "RocksDBEngine/RocksDBMethods.h" +#include "RocksDBEngine/RocksDBTransactionCollection.h" #include "RocksDBEngine/RocksDBTransactionState.h" #include "StorageEngine/EngineSelectorFeature.h" #include "Transaction/StandaloneContext.h" @@ 
-46,7 +47,7 @@ using namespace arangodb::rocksutils; RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr const& wp) : RocksDBIndex(wp->id(), wp->collection(), wp->fields(), wp->unique(), - wp->sparse(), wp->columnFamily(), 0, false), + wp->sparse(), wp->columnFamily(), wp->objectId(), /*useCache*/false), _wrapped(wp), _hasError(false) { TRI_ASSERT(_wrapped); } @@ -114,6 +115,28 @@ Result RocksDBBuilderIndex::removeInternal(transaction::Methods& trx, RocksDBMet return r; } +namespace { + struct BuilderTrx : public arangodb::transaction::Methods { + BuilderTrx(std::shared_ptr const& transactionContext, + LogicalDataSource const& collection, + AccessMode::Type type) : + transaction::Methods(transactionContext), + _cid(collection.id()) { + // add the (sole) data-source + addCollection(collection.id(), collection.name(), type); + addHint(transaction::Hints::Hint::NO_DLD); + } + + /// @brief get the underlying transaction collection + RocksDBTransactionCollection* resolveTrxCollection() { + return static_cast(trxCollection(_cid)); + } + + private: + TRI_voc_cid_t _cid; +}; +} + // Background index filler task // FIXME simon: not used right now because rollbacks are not correctly handled yet arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function const& unlock) { @@ -139,6 +162,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function uint64_t numDocsWritten = 0; auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId()); + rocksdb::Slice upper(bounds.end()); // exclusive upper bound rocksdb::Status s; rocksdb::WriteOptions wo; @@ -162,6 +186,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function std::unique_ptr it(rootDB->NewIterator(ro, docCF)); unlock(); // release indexes write lock + // FIXME use BuilderTrx SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()), _collection, AccessMode::Type::WRITE); res = trx.begin(); @@ -184,6 +209,7 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function RocksDBIndex* internal = _wrapped.get(); TRI_ASSERT(internal != nullptr); + // FIXME make selectivity estimates batch wise it->Seek(bounds.start()); while (it->Valid() && it->key().compare(upper) < 0) { @@ -249,15 +275,23 @@ arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function return res; } -// fast mode assuming exclusive access +// fast mode assuming exclusive access locked from outside template -static arangodb::Result fillIndexFast(transaction::Methods& trx, - RocksDBIndex* ridx, - RocksDBCollection* coll, +static arangodb::Result fillIndexFast(RocksDBIndex& ridx, + LogicalCollection& coll, WriteBatchType& batch) { + Result res; + ::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), + coll, AccessMode::Type::READ); + res = trx.begin(); + if (!res.ok()) { + THROW_ARANGO_EXCEPTION(res); + } + + RocksDBCollection* rcoll = static_cast(coll.getPhysical()); auto state = RocksDBTransactionState::toState(&trx); auto methds = RocksDBTransactionState::toMethods(&trx); - arangodb::Result res; + RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection(); // fillindex can be non transactional, we just need to clean up RocksDBEngine* engine = rocksutils::globalRocksEngine(); @@ -268,7 +302,7 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, // write batch will be reset every x documents MethodsType batched(state, &batch); - auto bounds = RocksDBKeyBounds::CollectionDocuments(coll->objectId()); + auto bounds = 
RocksDBKeyBounds::CollectionDocuments(rcoll->objectId()); rocksdb::Slice upper(bounds.end()); rocksdb::Status s; @@ -290,42 +324,55 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, rocksdb::ColumnFamilyHandle* docCF = RocksDBColumnFamily::documents(); std::unique_ptr it = methds->NewIterator(ro, docCF); + auto commitLambda = [&] { + if (batch.GetWriteBatch()->Count() > 0) { + s = rootDB->Write(wo, batch.GetWriteBatch()); + if (!s.ok()) { + res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + } + } + batch.Clear(); + + auto ops = trxColl->stealTrackedOperations(); + if (!ops.empty()) { + TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1); + auto it = ops.begin(); + ridx.estimator()->bufferUpdates(it->first, std::move(it->second.inserts), + std::move(it->second.removals)); + } + }; + it->Seek(bounds.start()); while (it->Valid()) { TRI_ASSERT(it->key().compare(upper) < 0); - res = ridx->insertInternal(trx, &batched, RocksDBKey::documentId(it->key()), - VPackSlice(it->value().data()), - Index::OperationMode::normal); + res = ridx.insertInternal(trx, &batched, RocksDBKey::documentId(it->key()), + VPackSlice(it->value().data()), + Index::OperationMode::normal); if (res.fail()) { break; } numDocsWritten++; if (numDocsWritten % 200 == 0) { // commit buffered writes - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); + commitLambda(); + if (res.fail()) { break; } - batch.Clear(); } it->Next(); } - if (res.ok() && batch.GetWriteBatch()->Count() > 0) { // - s = rootDB->Write(wo, batch.GetWriteBatch()); - if (!s.ok()) { - res = rocksutils::convertStatus(s, rocksutils::StatusHint::index); - } + if (res.ok()) { + commitLambda(); } batch.Clear(); // we will need to remove index elements created before an error // occurred, this needs to happen since we are non transactional if (res.fail()) { - RocksDBKeyBounds bounds = ridx->getBounds(); + RocksDBKeyBounds bounds = ridx.getBounds(); arangodb::Result res2 = rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, true, /*useRangeDel*/numDocsWritten > 25000); if (res2.fail()) { @@ -340,16 +387,6 @@ static arangodb::Result fillIndexFast(transaction::Methods& trx, /// non-transactional: fill index with existing documents /// from this collection arangodb::Result RocksDBBuilderIndex::fillIndexFast() { - Result res; - - RocksDBCollection* coll = static_cast(_collection.getPhysical()); - SingleCollectionTransaction trx(transaction::StandaloneContext::Create(_collection.vocbase()), - _collection, AccessMode::Type::EXCLUSIVE); - res = trx.begin(); - if (!res.ok()) { - THROW_ARANGO_EXCEPTION(res); - } - RocksDBIndex* internal = _wrapped.get(); TRI_ASSERT(internal != nullptr); if (this->unique()) { @@ -357,15 +394,11 @@ arangodb::Result RocksDBBuilderIndex::fillIndexFast() { // unique index. we need to keep track of all our changes because we need to avoid // duplicate index keys. must therefore use a WriteBatchWithIndex rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024); - res = ::fillIndexFast(trx, internal, coll, batch); + return ::fillIndexFast(*internal, _collection, batch); } else { // non-unique index. 
all index keys will be unique anyway because they contain the document id // we can therefore get away with a cheap WriteBatch rocksdb::WriteBatch batch(32 * 1024 * 1024); - res = ::fillIndexFast(trx, internal, coll, batch); + return ::fillIndexFast(*internal, _collection, batch); } - if (res.ok()) { - res = trx.commit(); // required to commit selectivity estimates - } - return res; } diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index b4c3012d7634..f5767da34cc6 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -321,7 +321,7 @@ std::shared_ptr RocksDBCollection::createIndex( } WRITE_LOCKER(indexGuard, _indexesLock); auto unlockGuard = scopeGuard([&] { - indexGuard.unlock(); + indexGuard.unlock(); // unlock in reverse order this->unlockWrite(); }); @@ -398,11 +398,10 @@ std::shared_ptr RocksDBCollection::createIndex( if (inBackground) { // allow concurrent inserts into index _indexes.emplace_back(buildIdx); res = buildIdx->fillIndexBackground([&] { - unlockGuard.fire(); + unlockGuard.fire(); // will be called at appropriate time }); } else { - _indexes.push_back(idx); - unlockGuard.fire(); + indexGuard.unlock(); // do not block maintenance reporting in cluster res = buildIdx->fillIndexFast(); // will lock again internally } } @@ -418,9 +417,11 @@ std::shared_ptr RocksDBCollection::createIndex( break; } } + } else { + _indexes.push_back(idx); } } - + // we should sync the selectivity estimates TODO fix res = engine->settingsManager()->sync(false); if (res.fail()) { // not critical @@ -457,6 +458,7 @@ std::shared_ptr RocksDBCollection::createIndex( } } + unlockGuard.fire(); // may have already been fired if (res.fail()) { // We could not create the index. Better abort // Remove the Index in the local list again. 
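Patch 28's commitLambda turns the fill into a sequence of small commits: buffer index entries into the batch, flush every 200 documents, then drain the tracked operations into the index's selectivity estimator. Reduced to plain RocksDB (the function name and the pass-through key handling are placeholders; only the 200-document cadence is taken from the patch), the flush loop looks roughly like:

#include <cstdint>
#include <memory>
#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

// Scan [lower, upper) and write one derived entry per document, committing
// the buffered writes every 200 documents so the batch stays small.
rocksdb::Status scanAndFill(rocksdb::DB* db, rocksdb::Slice lower,
                            rocksdb::Slice upper) {
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper;  // bounded iteration, as in the patch
  rocksdb::WriteBatch batch;
  rocksdb::WriteOptions wo;
  uint64_t numDocsWritten = 0;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek(lower); it->Valid(); it->Next()) {
    batch.Put(it->key(), it->value());  // stand-in for deriving an index entry
    if (++numDocsWritten % 200 == 0) {  // commit buffered writes
      rocksdb::Status s = db->Write(wo, &batch);
      if (!s.ok()) return s;
      batch.Clear();
    }
  }
  rocksdb::Status s = db->Write(wo, &batch);  // flush the remainder
  return s.ok() ? it->status() : s;
}

Flushing in fixed-size chunks bounds the batch's memory use and gives natural points at which the buffered estimate updates can be handed off.
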
diff --git a/arangod/RocksDBEngine/RocksDBTransactionCollection.h b/arangod/RocksDBEngine/RocksDBTransactionCollection.h index 9fff50936489..0b56f446f1e4 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionCollection.h +++ b/arangod/RocksDBEngine/RocksDBTransactionCollection.h @@ -94,6 +94,20 @@ class RocksDBTransactionCollection final : public TransactionCollection { /// @brief Every index can track hashes removed from this index /// Used to update the estimate after the trx commited void trackIndexRemove(uint64_t idxObjectId, uint64_t hash); + + /// @brief tracked index operations + struct IndexOperations { + std::vector inserts; + std::vector removals; + }; + typedef std::unordered_map OperationsMap; + + /// @brief steal the tracked operations from the map + OperationsMap stealTrackedOperations() { + OperationsMap empty; + empty.swap(empty); + return empty; + } private: /// @brief request a lock for a collection @@ -114,14 +128,9 @@ class RocksDBTransactionCollection final : public TransactionCollection { uint64_t _numRemoves; bool _usageLocked; - struct IndexOperations { - std::vector inserts; - std::vector removals; - }; - /// @brief A list where all indexes with estimates can store their operations /// Will be applied to the inserter on commit and not applied on abort - std::unordered_map _trackedIndexOperations; + OperationsMap _trackedIndexOperations; }; } From 2f1cc80ff593a8d62c2a6e56079808965cf554a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Fri, 21 Dec 2018 01:39:23 +0100 Subject: [PATCH 29/31] fixing fillIndex for iresearch --- arangod/IResearch/IResearchView.cpp | 2 ++ arangod/RocksDBEngine/RocksDBBuilderIndex.cpp | 7 ++++++- arangod/RocksDBEngine/RocksDBTransactionCollection.h | 2 +- arangod/VocBase/vocbase.cpp | 4 +++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arangod/IResearch/IResearchView.cpp b/arangod/IResearch/IResearchView.cpp index 7c38f819221e..a5fbe62d5b48 100644 --- a/arangod/IResearch/IResearchView.cpp +++ b/arangod/IResearch/IResearchView.cpp @@ -927,7 +927,9 @@ arangodb::Result IResearchView::properties( return res; } +#if USE_PLAN_CACHE arangodb::aql::PlanCache::instance()->invalidate(&vocbase()); +#endif arangodb::aql::QueryCache::instance()->invalidate(&vocbase()); return arangodb::ServerState::instance()->isSingleServer() diff --git a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp index 5bb1e3463b73..b6f40c84919b 100644 --- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp @@ -282,7 +282,8 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, WriteBatchType& batch) { Result res; ::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), - coll, AccessMode::Type::READ); + coll, AccessMode::Type::EXCLUSIVE); + trx.addHint(transaction::Hints::Hint::LOCK_NEVER); // already locked res = trx.begin(); if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); @@ -369,6 +370,10 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, } batch.Clear(); + if (res.ok()) { // required so iresearch commits + res = trx.commit(); + } + // we will need to remove index elements created before an error // occurred, this needs to happen since we are non transactional if (res.fail()) { diff --git a/arangod/RocksDBEngine/RocksDBTransactionCollection.h b/arangod/RocksDBEngine/RocksDBTransactionCollection.h index 0b56f446f1e4..7f60735c6830 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionCollection.h +++ 
b/arangod/RocksDBEngine/RocksDBTransactionCollection.h @@ -105,7 +105,7 @@ class RocksDBTransactionCollection final : public TransactionCollection { /// @brief steal the tracked operations from the map OperationsMap stealTrackedOperations() { OperationsMap empty; - empty.swap(empty); + _trackedIndexOperations.swap(empty); return empty; } diff --git a/arangod/VocBase/vocbase.cpp b/arangod/VocBase/vocbase.cpp index 7023c50b72bd..2b341df512c6 100644 --- a/arangod/VocBase/vocbase.cpp +++ b/arangod/VocBase/vocbase.cpp @@ -1750,7 +1750,9 @@ arangodb::Result TRI_vocbase_t::dropView( } // invalidate all entries in the plan and query cache now +#if USE_PLAN_CACHE arangodb::aql::PlanCache::instance()->invalidate(this); +#endif arangodb::aql::QueryCache::instance()->invalidate(this); unregisterView(*view); @@ -2234,4 +2236,4 @@ TRI_voc_rid_t TRI_StringToRid(char const* p, size_t len, bool& isOld, // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- \ No newline at end of file +// ----------------------------------------------------------------------------- From 371c738414ac93cd326def0144aacec43f7ec7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Fri, 21 Dec 2018 14:24:28 +0100 Subject: [PATCH 30/31] revert a change --- arangod/VocBase/LogicalCollection.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arangod/VocBase/LogicalCollection.cpp b/arangod/VocBase/LogicalCollection.cpp index 3f72f624b5aa..c4d7d06fd63f 100644 --- a/arangod/VocBase/LogicalCollection.cpp +++ b/arangod/VocBase/LogicalCollection.cpp @@ -642,13 +642,14 @@ arangodb::Result LogicalCollection::appendVelocyPack( // Indexes result.add(VPackValue("indexes")); auto flags = Index::makeFlags(); - auto filter = [&](arangodb::Index const* idx) { // hide hidden indexes + // FIXME simon: hide links here, but increase chance of ASAN errors + /* auto filter = [&](arangodb::Index const* idx) { // hide hidden indexes return (forPersistence || !idx->isHidden()); - }; + };*/ if (forPersistence) { flags = Index::makeFlags(Index::Serialize::Internals); } - getIndexesVPack(result, flags, filter); + getIndexesVPack(result, flags/*, filter*/); // Cluster Specific result.add(StaticStrings::IsSmart, VPackValue(_isSmart)); From 3f46816649ee2470bedfb181c3da0c234a921903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Fri, 21 Dec 2018 18:07:54 +0100 Subject: [PATCH 31/31] fixing a deadlock --- arangod/IResearch/IResearchLink.cpp | 6 +- arangod/RocksDBEngine/RocksDBCollection.cpp | 94 +++++++++++---------- 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/arangod/IResearch/IResearchLink.cpp b/arangod/IResearch/IResearchLink.cpp index bb647139280f..a72fdd46831d 100644 --- a/arangod/IResearch/IResearchLink.cpp +++ b/arangod/IResearch/IResearchLink.cpp @@ -674,9 +674,9 @@ arangodb::Result IResearchLink::init( } } } else if (arangodb::ServerState::instance()->isDBServer()) { // db-server link - auto* engine = arangodb::ClusterInfo::instance(); + auto* ci = arangodb::ClusterInfo::instance(); - if (!engine) { + if (!ci) { return arangodb::Result( TRI_ERROR_INTERNAL, std::string("failure to get storage engine while initializing arangosearch link '") + std::to_string(_id) + "'" @@ -693,7 +693,7 @@ arangodb::Result IResearchLink::init( } } - auto logicalView = engine->getView(vocbase.name(), viewId); // valid to call ClusterInfo (initialized in 
ClusterFeature::prepare()) even from Databasefeature::start() + auto logicalView = ci->getView(vocbase.name(), viewId); // valid to call ClusterInfo (initialized in ClusterFeature::prepare()) even from Databasefeature::start() // if there is no logicalView present yet then skip this step if (logicalView) { diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index f5767da34cc6..a08ebbf273a7 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -262,20 +262,20 @@ void RocksDBCollection::open(bool /*ignoreErrors*/) { void RocksDBCollection::prepareIndexes( arangodb::velocypack::Slice indexesSlice) { - WRITE_LOCKER(guard, _indexesLock); TRI_ASSERT(indexesSlice.isArray()); StorageEngine* engine = EngineSelectorFeature::ENGINE; std::vector> indexes; - - if (indexesSlice.length() == 0 && _indexes.empty()) { - engine->indexFactory().fillSystemIndexes(_logicalCollection, indexes); - } else { - engine->indexFactory().prepareIndexes( - _logicalCollection, indexesSlice, indexes - ); + { + READ_LOCKER(guard, _indexesLock); // link creation needs read-lock too + if (indexesSlice.length() == 0 && _indexes.empty()) { + engine->indexFactory().fillSystemIndexes(_logicalCollection, indexes); + } else { + engine->indexFactory().prepareIndexes(_logicalCollection, indexesSlice, indexes); + } } - + + WRITE_LOCKER(guard, _indexesLock); for (std::shared_ptr& idx : indexes) { addIndex(std::move(idx)); } @@ -319,17 +319,19 @@ std::shared_ptr RocksDBCollection::createIndex( if (res.fail()) { THROW_ARANGO_EXCEPTION(res); } - WRITE_LOCKER(indexGuard, _indexesLock); +// WRITE_LOCKER(indexGuard, _indexesLock); auto unlockGuard = scopeGuard([&] { - indexGuard.unlock(); // unlock in reverse order +// indexGuard.unlock(); // unlock in reverse order this->unlockWrite(); }); - - // Step 1. Check for matching index - std::shared_ptr idx = findIndex(info, _indexes); - if (idx) { - created = false; // We already have this index. - return idx; + + std::shared_ptr idx; + { // Step 1. Check for matching index + WRITE_LOCKER(guard, _indexesLock); + if ((idx = findIndex(info, _indexes)) != nullptr) { + created = false; // We already have this index. + return idx; + } } RocksDBEngine* engine = static_cast(EngineSelectorFeature::ENGINE); @@ -348,14 +350,16 @@ std::shared_ptr RocksDBCollection::createIndex( TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX); - for (auto const& other : _indexes) { // conflicting index exists - if (other->id() == idx->id()) { - return other; // index already exists + { + READ_LOCKER(guard, _indexesLock); + for (auto const& other : _indexes) { // conflicting index exists + if (other->id() == idx->id()) { + return other; // index already exists + } } } auto buildIdx = std::make_shared(std::static_pointer_cast(idx)); - // Step 3. add index to collection entry (for removal after a crash) if (!engine->inRecovery()) { // manually modify collection entry, other methods need lock RocksDBKey key; // read collection info from database @@ -401,7 +405,6 @@ std::shared_ptr RocksDBCollection::createIndex( unlockGuard.fire(); // will be called at appropriate time }); } else { - indexGuard.unlock(); // do not block maintenance reporting in cluster res = buildIdx->fillIndexFast(); // will lock again internally } } @@ -409,7 +412,7 @@ std::shared_ptr RocksDBCollection::createIndex( // Step 5. 
cleanup if (res.ok()) { { - WRITE_LOCKER(indexGuard, _indexesLock); + WRITE_LOCKER(guard, _indexesLock); if (inBackground) { // swap in actual index for (size_t i = 0; i < _indexes.size(); i++) { if (_indexes[i]->id() == buildIdx->id()) { @@ -422,19 +425,19 @@ std::shared_ptr RocksDBCollection::createIndex( } } - // we should sync the selectivity estimates TODO fix - res = engine->settingsManager()->sync(false); - if (res.fail()) { // not critical - LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " - << res.errorMessage(); - res.reset(); - } - - rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); - if (!s.ok()) { // not critical - LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " - << s.ToString(); - } +// // we should sync the selectivity estimates TODO fix +// res = engine->settingsManager()->sync(false); +// if (res.fail()) { // not critical +// LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: " +// << res.errorMessage(); +// res.reset(); +// } +// +// rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true); +// if (!s.ok()) { // not critical +// LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: " +// << s.ToString(); +// } #if USE_PLAN_CACHE arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase()); @@ -460,19 +463,18 @@ std::shared_ptr RocksDBCollection::createIndex( unlockGuard.fire(); // may have already been fired if (res.fail()) { - // We could not create the index. Better abort - // Remove the Index in the local list again. - size_t i = 0; - WRITE_LOCKER(guard, _indexesLock); - for (auto index : _indexes) { - if (index == idx) { - _indexes.erase(_indexes.begin() + i); - break; + { // We could not create the index. Better abort + WRITE_LOCKER(guard, _indexesLock); + auto it = _indexes.begin(); + while (it != _indexes.end()) { + if ((*it)->id() == idx->id()) { + _indexes.erase(it); + break; + } + it++; } - ++i; } idx->drop(); - THROW_ARANGO_EXCEPTION(res); }
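
The deadlock fix in this last patch comes down to never holding _indexesLock across a long-running operation: lookups take a read lock, mutations of _indexes take a short write lock, and the index fill itself runs with no lock held. A reduced sketch of that locking discipline (std::shared_mutex standing in for ArangoDB's READ_LOCKER/WRITE_LOCKER macros; all types here are placeholders):

#include <cstdint>
#include <memory>
#include <shared_mutex>
#include <vector>

struct Index { uint64_t id; };

class CollectionSketch {
  std::shared_mutex _indexesLock;
  std::vector<std::shared_ptr<Index>> _indexes;

 public:
  // readers (e.g. cluster maintenance reporting) only need a shared lock
  std::shared_ptr<Index> find(uint64_t id) {
    std::shared_lock<std::shared_mutex> guard(_indexesLock);
    for (auto const& idx : _indexes) {
      if (idx->id == id) return idx;
    }
    return nullptr;
  }

  // failure cleanup: erase by id under a short exclusive section,
  // matching the erase-by-id loop in the patch
  void removeFailed(uint64_t id) {
    std::unique_lock<std::shared_mutex> guard(_indexesLock);
    for (auto it = _indexes.begin(); it != _indexes.end(); ++it) {
      if ((*it)->id == id) { _indexes.erase(it); break; }
    }
  }
  // The long index fill itself runs with no _indexesLock held, which is
  // what breaks the lock-order cycle with maintenance threads.
};
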