diff --git a/CHANGELOG b/CHANGELOG index ff7377494530..1b818f5aa59d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,18 @@ v3.7.13 (XXXX-XX-XX) -------------------- +* Fixed ES-863: reloading of users within the Cluster. + If a Coordinator is asked to reload its users (e.g. by the UserManager in + Foxx, it is also possible to do via API, but this is internal and on purpose + not documented, so unlikely that it is used), in concurrency with user + management updates there is a chance that the reload is not correctly + performed on this coordinator. It may have missed the last update locally, + causing one user to have an older state. It will be fixed on the next + modification of any other users/permissions. Unfortunately this bug can + cascade and when hit again, the coordinator can now be off by two updates. + In DC2DC this situation is more likely to happen on the target datacenter, + causing this datacenter to have other users/permissions than the source one. + * Fix BTS-446: When finding a not yet fully initialized agency, do not immediately fatal exit. Keep trying for (very generous) 5 minutes. diff --git a/arangod/Auth/UserManager.cpp b/arangod/Auth/UserManager.cpp index a1744ec77faa..e1561d0052fd 100644 --- a/arangod/Auth/UserManager.cpp +++ b/arangod/Auth/UserManager.cpp @@ -87,10 +87,7 @@ auth::UserManager::UserManager(application_features::ApplicationServer& server) : _server(server), _globalVersion(1), _internalVersion(0) {} #else auth::UserManager::UserManager(application_features::ApplicationServer& server) - : _server(server), - _globalVersion(1), - _internalVersion(0), - _authHandler(nullptr) {} + : _server(server), _globalVersion(1), _internalVersion(0), _authHandler(nullptr) {} auth::UserManager::UserManager(application_features::ApplicationServer& server, std::unique_ptr handler) @@ -129,7 +126,8 @@ static std::shared_ptr QueryAllUsers(application_features::Applica if (vocbase == nullptr) { LOG_TOPIC("b8c47", DEBUG, arangodb::Logger::AUTHENTICATION) << "system database is unknown"; - THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "system database is unknown"); + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, + "system database is unknown"); } // we cannot set this execution context, otherwise the transaction @@ -138,8 +136,8 @@ static std::shared_ptr QueryAllUsers(application_features::Applica std::string const queryStr("FOR user IN _users RETURN user"); auto emptyBuilder = std::make_shared(); arangodb::aql::Query query(transaction::StandaloneContext::Create(*vocbase), - arangodb::aql::QueryString(queryStr), - emptyBuilder, emptyBuilder); + arangodb::aql::QueryString(queryStr), emptyBuilder, + emptyBuilder); query.queryOptions().cache = false; query.queryOptions().ttl = 30; @@ -156,7 +154,8 @@ static std::shared_ptr QueryAllUsers(application_features::Applica THROW_ARANGO_EXCEPTION(TRI_ERROR_REQUEST_CANCELED); } THROW_ARANGO_EXCEPTION_MESSAGE(queryResult.result.errorNumber(), - "Error executing user query: " + queryResult.result.errorMessage()); + "Error executing user query: " + + queryResult.result.errorMessage()); } VPackSlice usersSlice = queryResult.data->slice(); @@ -200,6 +199,13 @@ void auth::UserManager::loadFromDB() { return; } + TRI_IF_FAILURE("UserManager::performDBLookup") { + // Used in GTest. It is used to identify + // if the UserManager would have updated it's + // cache in a specific situation. + THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); + } + try { std::shared_ptr builder = QueryAllUsers(_server); if (builder) { @@ -325,7 +331,7 @@ Result auth::UserManager::storeUserInternal(auth::User const& entry, bool replac // user was outdated, we should trigger a reload triggerLocalReload(); LOG_TOPIC("cf922", DEBUG, Logger::AUTHENTICATION) - << "Cannot update user : '" << res.errorMessage() << "'"; + << "Cannot update user : '" << res.errorMessage() << "'"; } } return res; @@ -343,7 +349,8 @@ void auth::UserManager::createRootUser() { WRITE_LOCKER(writeGuard, _userCacheLock); // must be second UserMap::iterator const& it = _userCache.find("root"); if (it != _userCache.end()) { - LOG_TOPIC("bbc97", TRACE, Logger::AUTHENTICATION) << "\"root\" already exists"; + LOG_TOPIC("bbc97", TRACE, Logger::AUTHENTICATION) + << "\"root\" already exists"; return; } TRI_ASSERT(_userCache.empty()); @@ -368,7 +375,8 @@ void auth::UserManager::createRootUser() { << "unable to create user \"root\": " << ex.what(); } catch (...) { // No action - LOG_TOPIC("268eb", ERR, Logger::AUTHENTICATION) << "unable to create user \"root\""; + LOG_TOPIC("268eb", ERR, Logger::AUTHENTICATION) + << "unable to create user \"root\""; } triggerGlobalReload(); @@ -490,20 +498,20 @@ Result auth::UserManager::enumerateUsers(std::function&& func } } } - + bool triggerUpdate = !toUpdate.empty(); - + Result res; do { auto it = toUpdate.begin(); - while(it != toUpdate.end()) { + while (it != toUpdate.end()) { WRITE_LOCKER(writeGuard, _userCacheLock); - res = storeUserInternal(*it, /*replace*/true); - + res = storeUserInternal(*it, /*replace*/ true); + if (res.is(TRI_ERROR_ARANGO_CONFLICT) && retryOnConflict) { res.reset(); writeGuard.unlock(); - loadFromDB(); // should be noop iff nothing changed + loadFromDB(); // should be noop iff nothing changed writeGuard.lock(); UserMap::iterator it2 = _userCache.find(it->username()); if (it2 != _userCache.end()) { @@ -715,7 +723,8 @@ bool auth::UserManager::checkPassword(std::string const& username, std::string c AuthenticationFeature* af = AuthenticationFeature::instance(); if (it != _userCache.end() && (it->second.source() == auth::Source::Local)) { if (af != nullptr && !af->localAuthentication()) { - LOG_TOPIC("d3220", DEBUG, Logger::AUTHENTICATION) << "Local users are forbidden"; + LOG_TOPIC("d3220", DEBUG, Logger::AUTHENTICATION) + << "Local users are forbidden"; return false; } auth::User const& user = it->second; @@ -799,6 +808,7 @@ auth::Level auth::UserManager::collectionAuthLevel(std::string const& user, return level; } +#ifdef ARANGODB_USE_GOOGLE_TESTS /// Only used for testing void auth::UserManager::setAuthInfo(auth::UserMap const& newMap) { MUTEX_LOCKER(guard, _loadFromDBLock); // must be first @@ -806,3 +816,4 @@ void auth::UserManager::setAuthInfo(auth::UserMap const& newMap) { _userCache = newMap; _internalVersion.store(_globalVersion.load()); } +#endif diff --git a/arangod/Auth/UserManager.h b/arangod/Auth/UserManager.h index 656e45e72bed..bc70f6993d5e 100644 --- a/arangod/Auth/UserManager.h +++ b/arangod/Auth/UserManager.h @@ -80,7 +80,7 @@ class UserManager { /// @brief reload user cache and token caches void triggerLocalReload() { - _globalVersion.fetch_add(1, std::memory_order_release); + _internalVersion.store(0, std::memory_order_release); } /// @brief used for caching @@ -104,8 +104,7 @@ class UserManager { bool active, velocypack::Slice extras); /// Enumerate list of all users - Result enumerateUsers(std::function&&, - bool retryOnConflict); + Result enumerateUsers(std::function&&, bool retryOnConflict); /// Update specific user Result updateUser(std::string const& user, UserCallback&&); /// Access user without modifying it @@ -135,9 +134,11 @@ class UserManager { auth::Level collectionAuthLevel(std::string const& username, std::string const& dbname, std::string const& coll, bool configured = false); +#ifdef ARANGODB_USE_GOOGLE_TESTS /// Overwrite internally cached permissions, only use /// for testing purposes void setAuthInfo(auth::UserMap const& userEntryMap); +#endif #ifdef USE_ENTERPRISE diff --git a/tests/Auth/UserManagerClusterTest.cpp b/tests/Auth/UserManagerClusterTest.cpp new file mode 100644 index 000000000000..e3f943e0efd2 --- /dev/null +++ b/tests/Auth/UserManagerClusterTest.cpp @@ -0,0 +1,240 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2021-2021 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Michael Hackstein +//////////////////////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +#include "Mocks/Servers.h" + +#include "Auth/UserManager.h" +#include "GeneralServer/AuthenticationFeature.h" + +namespace arangodb { +namespace tests { +namespace auth_info_test { + +namespace { +static char const* FailureOnLoadDB = "UserManager::performDBLookup"; +} + +class UserManagerClusterTest : public ::testing::Test { + public: + mocks::MockCoordinator _server{}; + + protected: + auth::UserManager* userManager() { + auto um = _server.getFeature().userManager(); + TRI_ASSERT(um != nullptr); + return um; + } + + void simulateOneHeartbeat() { + /* + * NOTE: Sorry i gave up on this. Something that + * requires the heartbeat is absolutely untestable + * it does everything at once, and you need to setup + * a complete functioning world to somehow execute it. + * Furthermore it has tons of undesired side-effects. + * + * All i wanted to do, is let the heartbeat detect + * the UserVersion and inject it the way it does into + * the UserManager. + */ + } + + uint64_t getAgencyUserVersion() { + // Simply read the UserVersion from the Agency. + // This is copy pasted from HeartbeatThread. + auto& cache = _server.getFeature().agencyCache(); + auto [acb, idx] = cache.read( + std::vector{AgencyCommHelper::path("Sync/UserVersion")}); + auto result = acb->slice(); + VPackSlice slice = result[0].get(std::vector( + {AgencyCommHelper::path(), "Sync", "UserVersion"})); + TRI_ASSERT(slice.isInteger()); + // there is a UserVersion, and it has to be an UINT + return slice.getUInt(); + } +}; + +#ifdef ARANGODB_ENABLE_FAILURE_TESTS +TEST_F(UserManagerClusterTest, regression_forgotten_update) { + /* The following order of events did lead to a missing update: + * 1. um->triggerLocalReload(); + * 2. um->triggerGlobalReload(); + * 3. heartbeat + * 4. um->loadFromDB(); + * + * 1. and 2. moved internal versions forward two times + * 3. heartbeat resets one of the movings + * 4. Does not perform the actual load, as the heartbeat reset indicates everything is okay. + */ + + TRI_AddFailurePointDebugging(FailureOnLoadDB); + auto guard = arangodb::scopeGuard( + []() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); }); + + auto um = userManager(); + // If for some reason this EXPECT ever triggers, we can + // inject either the AgencyValue into the UserManager + // or vice-versa. This is just an assertion that we + // expect everything to start at default (1). + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + um->triggerLocalReload(); + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + um->triggerGlobalReload(); + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + /* + * This is the correct test here, the heartbeat has + * a side-effect, but that is simply untestable in the + * current design. So the only thing i could + * fall back to is to assert that the UserVersions in the agency + * do stay aligned. + * + * this->simulateOneHeartbeat(); + */ + // This needs to trigger a reload from DB + try { + um->userExists("unknown user"); + FAIL(); + } catch (arangodb::basics::Exception const& e) { + // This Execption indicates that we got past the version + // checks and would contact DBServer now. + // This is not under test here, we only want to test that we load + // the plan + EXPECT_EQ(e.code(), TRI_ERROR_DEBUG); + } catch (...) { + FAIL(); + } +} + +TEST_F(UserManagerClusterTest, cacheRevalidationShouldKeepVersionsInLine) { + TRI_AddFailurePointDebugging(FailureOnLoadDB); + auto guard = arangodb::scopeGuard( + []() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); }); + + auto um = userManager(); + // If for some reason this EXPECT ever triggers, we can + // inject either the AgencyValue into the UserManager + // or vice-versa. This is just an assertion that we + // expect everything to start at default (1). + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + try { + // This needs to trigger a reload from DB + // Internally it will do local, global, and loadFromDB. + um->triggerCacheRevalidation(); + FAIL(); + } catch (arangodb::basics::Exception const& e) { + // This Execption indicates that we got past the version + // checks and would contact DBServer now. + // This is not under test here, we only want to test that we load + // the plan + EXPECT_EQ(e.code(), TRI_ERROR_DEBUG); + } catch (...) { + FAIL(); + } + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); +} + +TEST_F(UserManagerClusterTest, triggerLocalReloadShouldNotUpdateClusterVersion) { + TRI_AddFailurePointDebugging(FailureOnLoadDB); + auto guard = arangodb::scopeGuard( + []() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); }); + + auto um = userManager(); + // If for some reason this EXPECT ever triggers, we can + // inject either the AgencyValue into the UserManager + // or vice-versa. This is just an assertion that we + // expect everything to start at default (1). + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + uint64_t versionBefore = getAgencyUserVersion(); + + um->triggerLocalReload(); + EXPECT_EQ(versionBefore, getAgencyUserVersion()); + + /* + * This is the correct test here, see above + * + * this->simulateOneHeartbeat(); + */ + // This needs to trigger a reload from DB + try { + um->userExists("unknown user"); + FAIL(); + } catch (arangodb::basics::Exception const& e) { + // This Execption indicates that we got past the version + // checks and would contact DBServer now. + // This is not under test here, we only want to test that we load + // the plan + EXPECT_EQ(e.code(), TRI_ERROR_DEBUG); + } catch (...) { + FAIL(); + } +} + +TEST_F(UserManagerClusterTest, triggerGlobalReloadShouldUpdateClusterVersion) { + TRI_AddFailurePointDebugging(FailureOnLoadDB); + auto guard = arangodb::scopeGuard( + []() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); }); + + auto um = userManager(); + // If for some reason this EXPECT ever triggers, we can + // inject either the AgencyValue into the UserManager + // or vice-versa. This is just an assertion that we + // expect everything to start at default (1). + EXPECT_EQ(um->globalVersion(), getAgencyUserVersion()); + + uint64_t versionBefore = getAgencyUserVersion(); + + um->triggerGlobalReload(); + // The version in the Agency needs to be increased + EXPECT_LT(versionBefore, getAgencyUserVersion()); + + /* + * This is the correct test here, see above + * + * this->simulateOneHeartbeat(); + */ + // This needs to trigger a reload from DB + try { + um->userExists("unknown user"); + FAIL(); + } catch (arangodb::basics::Exception const& e) { + // This Execption indicates that we got past the version + // checks and would contact DBServer now. + // This is not under test here, we only want to test that we load + // the plan + EXPECT_EQ(e.code(), TRI_ERROR_DEBUG); + } catch (...) { + FAIL(); + } +} + +#endif + +} // namespace auth_info_test +} // namespace tests +} // namespace arangodb diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6a648b8a5588..608ae4e60110 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -92,6 +92,7 @@ set(ARANGODB_TESTS_SOURCES Aql/WaitingExecutionBlockMock.cpp AsyncAgencyComm/AsyncAgencyCommTest.cpp Auth/UserManagerTest.cpp + Auth/UserManagerClusterTest.cpp Basics/conversions-test.cpp Basics/csv-test.cpp Basics/datetime.cpp