chore: implement oom/ood processing component by DanielleMaywood · Pull Request #16436 · coder/coder · GitHub
[go: up one dir, main page]

Skip to content

chore: implement oom/ood processing component #16436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
6c6240b
chore: add workspace reached resource threshold notification
DanielleMaywood Jan 24, 2025
b3081de
chore: split out into two notifications
DanielleMaywood Jan 30, 2025
a9c8676
chore: update golden files
DanielleMaywood Jan 30, 2025
1a84f96
chore: begin impl of processing logic for oom/ood
DanielleMaywood Jan 29, 2025
78ede46
chore: appease the linter for now
DanielleMaywood Jan 29, 2025
0d2b970
chore: use latest changes to #247, start debounce logic
DanielleMaywood Jan 30, 2025
0df2fd5
chore: add more tests
DanielleMaywood Jan 30, 2025
854d81a
chore: remove mock db for workspace monitor agentapi test
DanielleMaywood Jan 30, 2025
9d9d7b4
chore: remove todo comment
DanielleMaywood Jan 30, 2025
4c21ce7
Merge branch 'main' into dm-internal-247
DanielleMaywood Feb 4, 2025
944fdb5
chore: rewrite ood notification
DanielleMaywood Feb 4, 2025
6444176
chore: updaten golden file
DanielleMaywood Feb 4, 2025
bc87268
chore: silly me
DanielleMaywood Feb 4, 2025
d2265f6
chore: rename test
DanielleMaywood Feb 4, 2025
82a9852
Merge branch 'dm-internal-247' into dm-internal-248
DanielleMaywood Feb 4, 2025
62621d4
chore: add more tests, fix broken sql query
DanielleMaywood Feb 4, 2025
81f43d3
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 4, 2025
7522b37
chore: update to match main
DanielleMaywood Feb 4, 2025
69c4f42
chore: run 'make gen'
DanielleMaywood Feb 4, 2025
44ebf65
chore: run 'make fmt'
DanielleMaywood Feb 5, 2025
714e743
chore: remove cruft
DanielleMaywood Feb 5, 2025
7cf5212
chore: align interface
DanielleMaywood Feb 5, 2025
d08e713
chore: add another test
DanielleMaywood Feb 6, 2025
ed42eae
chore: improve volume monitor test
DanielleMaywood Feb 6, 2025
1b0d0d2
chore: rename fields
DanielleMaywood Feb 11, 2025
51b16c6
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 11, 2025
4e43bab
chore: align with other branch
DanielleMaywood Feb 11, 2025
4e144ae
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 12, 2025
da25ecc
chore: bump migration number
DanielleMaywood Feb 12, 2025
fe1e805
chore: add test and align better
DanielleMaywood Feb 12, 2025
abbd522
chore: appease linter
DanielleMaywood Feb 12, 2025
456989e
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 14, 2025
1550cc6
chore: update rbac
DanielleMaywood Feb 14, 2025
7998f89
chore: handle missing datapoints
DanielleMaywood Feb 14, 2025
bda8f29
chore: add tests for unknown state on memory monitor
DanielleMaywood Feb 14, 2025
9d662a3
chore: add tests for missing datapoints in volume monitors
DanielleMaywood Feb 14, 2025
bff48dc
chore: add default debounce of 5 minutes
DanielleMaywood Feb 14, 2025
c343a70
chore: implement feedback
DanielleMaywood Feb 14, 2025
babc48f
chore: feedback
DanielleMaywood Feb 17, 2025
01ca549
chore: feedback
DanielleMaywood Feb 17, 2025
a975810
chore: forgot to run the linter
DanielleMaywood Feb 17, 2025
ee35d85
chore: use percentages for alert config
DanielleMaywood Feb 17, 2025
d2fa8df
Merge branch 'main' into dm-internal-248
DanielleMaywood Feb 17, 2025
27d78d1
chore: fmt and bump migration number
DanielleMaywood Feb 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
chore: feedback
  • Loading branch information
DanielleMaywood committed Feb 17, 2025
commit babc48f8cfd6faf3a73e3af3c3d7d40c4a43b7f5
112 changes: 63 additions & 49 deletions coderd/agentapi/resources_monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,32 +125,39 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
return xerrors.Errorf("update workspace monitor: %w", err)
}

if shouldNotify {
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
if !shouldNotify {
return nil
}

_, err = a.NotificationsEnqueuer.EnqueueWithData(
// nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfMemory,
map[string]string{
"workspace": workspace.Name,
"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
},
map[string]any{
// NOTE(DanielleMaywood):
// We are injecting a timestamp to circumvent the notification
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-memory",
)
if err != nil {
return xerrors.Errorf("notify workspace OOM: %w", err)
}
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}

_, err = a.NotificationsEnqueuer.EnqueueWithData(
// nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfMemory,
map[string]string{
"workspace": workspace.Name,
"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
},
map[string]any{
// NOTE(DanielleMaywood):
// When notifications are enqueued, they are checked to be
// unique within a single day. This means that if we attempt
// to send two OOM notifications for the same workspace on
// the same day, the enqueuer will prevent us from sending
// a second one. We inject a timestamp to make the
// notifications appear different enough to circumvent this
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-memory",
)
if err != nil {
return xerrors.Errorf("notify workspace OOM: %w", err)
}

return nil
Expand Down Expand Up @@ -209,31 +216,38 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
}
}

if len(outOfDiskVolumes) != 0 {
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
if len(outOfDiskVolumes) == 0 {
return nil
}

if _, err := a.NotificationsEnqueuer.EnqueueWithData(
// nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfDisk,
map[string]string{
"workspace": workspace.Name,
},
map[string]any{
"volumes": outOfDiskVolumes,
// NOTE(DanielleMaywood):
// We are injecting a timestamp to circumvent the notification
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-volumes",
); err != nil {
return xerrors.Errorf("notify workspace OOD: %w", err)
}
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}

if _, err := a.NotificationsEnqueuer.EnqueueWithData(
// nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfDisk,
map[string]string{
"workspace": workspace.Name,
},
map[string]any{
"volumes": outOfDiskVolumes,
// NOTE(DanielleMaywood):
// When notifications are enqueued, they are checked to be
// unique within a single day. This means that if we attempt
// to send two OOD notifications for the same workspace on
// the same day, the enqueuer will prevent us from sending
// a second one. We inject a timestamp to make the
// notifications appear different enough to circumvent this
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-volumes",
); err != nil {
return xerrors.Errorf("notify workspace OOD: %w", err)
}

return nil
Expand Down
3 changes: 3 additions & 0 deletions coderd/agentapi/resourcesmonitor/resources_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ func NextState(c Config, oldState database.WorkspaceAgentMonitorState, states []
return database.WorkspaceAgentMonitorStateNOK
}

// We do not explicitly handle StateUnknown because it could have
// been either StateOK or StateNOK if collection didn't fail. As
// it could be either, our best bet is to ignore it.
nokCount, okCount := 0, 0
for _, state := range states {
switch state {
Expand Down
6 changes: 6 additions & 0 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -9649,6 +9649,9 @@ func (q *FakeQuerier) UpdateMemoryResourceMonitor(_ context.Context, arg databas
return err
}

q.mutex.Lock()
defer q.mutex.Unlock()

for i, monitor := range q.workspaceAgentMemoryResourceMonitors {
if monitor.AgentID != arg.AgentID {
continue
Expand Down Expand Up @@ -10448,6 +10451,9 @@ func (q *FakeQuerier) UpdateVolumeResourceMonitor(_ context.Context, arg databas
return err
}

q.mutex.Lock()
defer q.mutex.Unlock()

for i, monitor := range q.workspaceAgentVolumeResourceMonitors {
if monitor.AgentID != arg.AgentID || monitor.Path != arg.Path {
continue
Expand Down
Loading
0