8000 Bug fix/user manager version race (#14382) · arangodb/arangodb@911c1cb · GitHub
[go: up one dir, main page]

Skip to content

Commit 911c1cb

Browse files
authored
Bug fix/user manager version race (#14382)
Fixes ES-863
1 parent 7f9c774 commit 911c1cb

File tree

5 files changed

+263
-4
lines changed

5 files changed

+263
-4
lines changed

CHANGELOG

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@ devel
33
* Fix DEVSUP-753: now it is safe to call visit on exhausted disjunction
44
iterator.
55

6+
* Fixed reloading of users within the Cluster.
7+
If a Coordinator is asked to reload its users (e.g. by the UserManager in
8+
Foxx, it is also possible to do via API, but this is internal and on purpose
9+
not documented, so unlikely that it is used), in concurrency with user
10+
management updates there is a chance that the reload is not correctly performed
11+
on this coordinator. It may have missed the last update locally, causing
12+
one user to have an older state. It will be fixed on the next modification
13+
of any other users/permissions. Unfortunately this bug can cascade and when
14+
hit again, the coordinator can now be off by two updates.
15+
In DC2DC this situation is more likely to happen on the target datacenter,
16+
causing this datacenter to have other users/permissions than the source one.
17+
618
* Slightly improve specific warning messages for better readability.
719

820
* Add 3 AQL functions: DECAY_GAUSS, DECAY_EXP and DECAY_LINEAR.

arangod/Auth/UserManager.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,13 @@ void auth::UserManager::loadFromDB() {
209209
return;
210210
}
211211

212+
TRI_IF_FAILURE("UserManager::performDBLookup") {
213+
// Used in GTest. It is used to identify
214+
// if the UserManager would have updated it's
215+
// cache in a specific situation.
216+
THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
217+
}
218+
212219
try {
213220
std::shared_ptr<VPackBuilder> builder = QueryAllUsers(_server);
214221
if (builder) {

arangod/Auth/UserManager.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class UserManager {
7979

8080
/// @brief reload user cache and token caches
8181
void triggerLocalReload() {
82-
_globalVersion.fetch_add(1, std::memory_order_release);
82+
_internalVersion.store(0, std::memory_order_release);
8383
}
8484

8585
/// @brief used for caching
@@ -103,8 +103,7 @@ class UserManager {
103103
bool active, velocypack::Slice extras);
104104

105105
/// Enumerate list of all users
106-
Result enumerateUsers(std::function<bool(auth::User&)>&&,
107-
bool retryOnConflict);
106+
Result enumerateUsers(std::function<bool(auth::User&)>&&, bool retryOnConflict);
108107
/// Update specific user
109108
Result updateUser(std::string const& user, UserCallback&&);
110109
/// Access user without modifying it
@@ -133,7 +132,7 @@ class UserManager {
133132
std::string const& dbname, bool configured = false);
134133
auth::Level collectionAuthLevel(std::string const& username, std::string const& dbname,
135134
std::string const& coll, bool configured = false);
136-
135+
137136
#ifdef ARANGODB_USE_GOOGLE_TESTS
138137
/// Overwrite internally cached permissions, only use
139138
/// for testing purposes

tests/Auth/UserManagerClusterTest.cpp

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
/// DISCLAIMER
3+
///
4+
/// Copyright 2021-2021 ArangoDB GmbH, Cologne, Germany
5+
///
6+
/// Licensed under the Apache License, Version 2.0 (the "License");
7+
/// you may not use this file except in compliance with the License.
8+
/// You may obtain a copy of the License at
9+
///
10+
/// http://www.apache.org/licenses/LICENSE-2.0
11+
///
12+
/// Unless required by applicable law or agreed to in writing, software
13+
/// distributed under the License is distributed on an "AS IS" BASIS,
14+
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
/// See the License for the specific language governing permissions and
16 1E79 +
/// limitations under the License.
17+
///
18+
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
///
20+
/// @author Michael Hackstein
21+
////////////////////////////////////////////////////////////////////////////////
22+
23+
#include "gtest/gtest.h"
24+
25+
#include "Mocks/Servers.h"
26+
27+
#include "Auth/UserManager.h"
28+
#include "GeneralServer/AuthenticationFeature.h"
29+
30+
namespace arangodb {
31+
namespace tests {
32+
namespace auth_info_test {
33+
34+
namespace {
35+
static char const* FailureOnLoadDB = "UserManager::performDBLookup";
36+
}
37+
38+
class UserManagerClusterTest : public ::testing::Test {
39+
public:
40+
mocks::MockCoordinator _server{};
41+
42+
protected:
43+
auth::UserManager* userManager() {
44+
auto um = _server.getFeature<AuthenticationFeature>().userManager();
45+
TRI_ASSERT(um != nullptr);
46+
return um;
47+
}
48+
49+
void simulateOneHeartbeat() {
50+
/*
51+
* NOTE: Sorry i gave up on this. Something that
52+
* requires the heartbeat is absolutely untestable
53+
* it does everything at once, and you need to setup
54+
* a complete functioning world to somehow execute it.
55+
* Furthermore it has tons of undesired side-effects.
56+
*
57+
* All i wanted to do, is let the heartbeat detect
58+
* the UserVersion and inject it the way it does into
59+
* the UserManager.
60+
*/
61+
}
62+
63+
uint64_t getAgencyUserVersion() {
64+
// Simply read the UserVersion from the Agency.
65+
// This is copy pasted from HeartbeatThread.
66+
auto& cache = _server.getFeature<ClusterFeature>().agencyCache();
67+
auto [acb, idx] = cache.read(
68+
std::vector<std::string>{AgencyCommHelper::path("Sync/UserVersion")});
69+
auto result = acb->slice();
70+
VPackSlice slice = result[0].get(std::vector<std::string>(
71+
{AgencyCommHelper::path(), "Sync", "UserVersion"}));
72+
TRI_ASSERT(slice.isInteger());
73+
// there is a UserVersion, and it has to be an UINT
74+
return slice.getUInt();
75+
}
76+
};
77+
78+
#ifdef ARANGODB_ENABLE_FAILURE_TESTS
79+
TEST_F(UserManagerClusterTest, regression_forgotten_update) {
80+
/* The following order of events did lead to a missing update:
81+
* 1. um->triggerLocalReload();
82+
* 2. um->triggerGlobalReload();
83+
* 3. heartbeat
84+
* 4. um->loadFromDB();
85+
*
86+
* 1. and 2. moved internal versions forward two times
87+
* 3. heartbeat resets one of the movings
88+
* 4. Does not perform the actual load, as the heartbeat reset indicates everything is okay.
89+
*/
90+
91+
TRI_AddFailurePointDebugging(FailureOnLoadDB);
92+
auto guard = arangodb::scopeGuard(
93+
[]() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); });
94+
95+
auto um = userManager();
96+
// If for some reason this EXPECT ever triggers, we can
97+
// inject either the AgencyValue into the UserManager
98+
// or vice-versa. This is just an assertion that we
99+
// expect everything to start at default (1).
100+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
101+
102+
um->triggerLocalReload();
103+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
104+
105+
um->triggerGlobalReload();
106+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
107+
108+
/*
109+
* This is the correct test here, the heartbeat has
110+
* a side-effect, but that is simply untestable in the
111+
* current design. So the only thing i could
112+
* fall back to is to assert that the UserVersions in the agency
113+
* do stay aligned.
114+
*
115+
* this->simulateOneHeartbeat();
116+
*/
117+
// This needs to trigger a reload from DB
118+
try {
119+
um->userExists("unknown user");
120+
FAIL();
121+
} catch (arangodb::basics::Exception const& e) {
122+
// This Execption indicates that we got past the version
123+
// checks and would contact DBServer now.
124+
// This is not under test here, we only want to test that we load
125+
// the plan
126+
EXPECT_EQ(e.code(), TRI_ERROR_DEBUG);
127+
} catch (...) {
128+
FAIL();
129+
}
130+
}
131+
132+
TEST_F(UserManagerClusterTest, cacheRevalidationShouldKeepVersionsInLine) {
133+
TRI_AddFailurePointDebugging(FailureOnLoadDB);
134+
auto guard = arangodb::scopeGuard(
135+
[]() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); });
136+
137+
auto um = userManager();
138+
// If for some reason this EXPECT ever triggers, we can
139+
// inject either the AgencyValue into the UserManager
140+
// or vice-versa. This is just an assertion that we
141+
// expect everything to start at default (1).
142+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
143+
144+
try {
145+
// This needs to trigger a reload from DB
146+
// Internally it will do local, global, and loadFromDB.
147+
um->triggerCacheRevalidation();
148+
FAIL();
149+
} catch (arangodb::basics::Exception const& e) {
150+
// This Execption indicates that we got past the version
151+
// checks and would contact DBServer now.
152+
// This is not under test here, we only want to test that we load
153+
// the plan
154+
EXPECT_EQ(e.code(), TRI_ERROR_DEBUG);
155+
} catch (...) {
156+
FAIL();
157+
}
158+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
159+
}
160+
161+
TEST_F(UserManagerClusterTest, triggerLocalReloadShouldNotUpdateClusterVersion) {
162+
TRI_AddFailurePointDebugging(FailureOnLoadDB);
163+
auto guard = arangodb::scopeGuard(
164+
[]() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); });
165+
166+
auto um = userManager();
167+
// If for some reason this EXPECT ever triggers, we can
168+
// inject either the AgencyValue into the UserManager
169+
// or vice-versa. This is just an assertion that we
170+
// expect everything to start at default (1).
171+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
172+
173+
uint64_t versionBefore = getAgencyUserVersion();
174+
175+
um->triggerLocalReload();
176+
EXPECT_EQ(versionBefore, getAgencyUserVersion());
177+
178+
/*
179+
* This is the correct test here, see above
180+
*
181+
* this->simulateOneHeartbeat();
182+
*/
183+
// This needs to trigger a reload from DB
184+
try {
185+
um->userExists("unknown user");
186+
FAIL();
187+
} catch (arangodb::basics::Exception const& e) {
188+
// This Execption indicates that we got past the version
189+
// checks and would contact DBServer now.
190+
// This is not under test here, we only want to test that we load
191+
// the plan
192+
EXPECT_EQ(e.code(), TRI_ERROR_DEBUG);
193+
} catch (...) {
194+
FAIL();
195+
}
196+
}
197+
198+
TEST_F(UserManagerClusterTest, triggerGlobalReloadShouldUpdateClusterVersion) {
199+
TRI_AddFailurePointDebugging(FailureOnLoadDB);
200+
auto guard = arangodb::scopeGuard(
201+
[]() { TRI_RemoveFailurePointDebugging(FailureOnLoadDB); });
202+
203+
auto um = userManager();
204+
// If for some reason this EXPECT ever triggers, we can
205+
// inject either the AgencyValue into the UserManager
206+
// or vice-versa. This is just an assertion that we
207+
// expect everything to start at default (1).
208+
EXPECT_EQ(um->globalVersion(), getAgencyUserVersion());
209+
210+
uint64_t versionBefore = getAgencyUserVersion();
211+
212+
um->triggerGlobalReload();
213+
// The version in the Agency needs to be increased
214+
EXPECT_LT(versionBefore, getAgencyUserVersion());
215+
216+
/*
217+
* This is the correct test here, see above
218+
*
219+
* this->simulateOneHeartbeat();
220+
*/
221+
// This needs to trigger a reload from DB
222+
try {
223+
um->userExists("unknown user");
224+
FAIL();
225+
} catch (arangodb::basics::Exception const& e) {
226+
// This Execption indicates that we got past the version
227+
// checks and would contact DBServer now.
228+
// This is not under test here, we only want to test that we load
229+
// the plan
230+
EXPECT_EQ(e.code(), TRI_ERROR_DEBUG);
231+
} catch (...) {
232+
FAIL();
233+
}
234+
}
235+
236+
#endif
237+
238+
} // namespace auth_info_test
239+
} // namespace tests
240+
} // namespace arangodb

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ set(ARANGODB_TESTS_SOURCES
160160
Aql/DecaysFunctionTest.cpp
161161
AsyncAgencyComm/AsyncAgencyCommTest.cpp
162162
Auth/UserManagerTest.cpp
163+
Auth/UserManagerClusterTest.cpp
163164
Basics/conversions-test.cpp
164165
Basics/csv-test.cpp
165166
Basics/datetime.cpp

0 commit comments

Comments
 (0)
0