diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 7a7734b6f8093..90257c26dd580 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -27,6 +27,7 @@ const ( MetricDesiredGauge = namespace + "desired" MetricRunningGauge = namespace + "running" MetricEligibleGauge = namespace + "eligible" + MetricPresetHardLimitedGauge = namespace + "preset_hard_limited" MetricLastUpdatedGauge = namespace + "metrics_last_updated" ) @@ -82,6 +83,12 @@ var ( labels, nil, ) + presetHardLimitedDesc = prometheus.NewDesc( + MetricPresetHardLimitedGauge, + "Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.", + labels, + nil, + ) lastUpdateDesc = prometheus.NewDesc( MetricLastUpdatedGauge, "The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", @@ -104,17 +111,22 @@ type MetricsCollector struct { replacementsCounter map[replacementKey]float64 replacementsCounterMu sync.Mutex + + isPresetHardLimited map[hardLimitedPresetKey]bool + isPresetHardLimitedMu sync.Mutex } var _ prometheus.Collector = new(MetricsCollector) func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { log := logger.Named("prebuilds_metrics_collector") + return &MetricsCollector{ database: db, logger: log, snapshotter: snapshotter, replacementsCounter: make(map[replacementKey]float64), + isPresetHardLimited: make(map[hardLimitedPresetKey]bool), } } @@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- desiredPrebuildsDesc descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc + descCh <- presetHardLimitedDesc descCh <- lastUpdateDesc } @@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) } + mc.isPresetHardLimitedMu.Lock() + for key, isHardLimited := range mc.isPresetHardLimited { + var val float64 + if isHardLimited { + val = 1 + } + + metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName) + } + mc.isPresetHardLimitedMu.Unlock() + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) } @@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres // cause an issue (or indeed if either would), so we just track the replacement. mc.replacementsCounter[key]++ } + +type hardLimitedPresetKey struct { + orgName, templateName, presetName string +} + +func (k hardLimitedPresetKey) String() string { + return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName) +} + +// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus. +func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) { + mc.isPresetHardLimitedMu.Lock() + defer mc.isPresetHardLimitedMu.Unlock() + + key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName} + + if isHardLimited { + mc.isPresetHardLimited[key] = true + } else { + delete(mc.isPresetHardLimited, key) + } +} diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 7796e43777951..603c0da3588a6 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres slog.F("preset_name", ps.Preset.Name), ) - // If the preset was previously hard-limited, log it and exit early. - if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited { - logger.Warn(ctx, "skipping hard limited preset") - return nil - } + // Report a preset as hard-limited only if all the following conditions are met: + // - The preset is marked as hard-limited + // - The preset is using the active version of its template, and the template has not been deleted + // + // The second condition is important because a hard-limited preset that has become outdated is no longer relevant. + // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it + // as hard-limited to the admin. + reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted + c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited) // If the preset reached the hard failure limit for the first time during this iteration: // - Mark it as hard-limited in the database // - Send notifications to template admins - if ps.IsHardLimited { - logger.Warn(ctx, "skipping hard limited preset") + // - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed. + if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited { + logger.Warn(ctx, "preset is hard limited, notifying template admins") err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{ Status: database.PrebuildStatusHardLimited, @@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres err = c.notifyPrebuildFailureLimitReached(ctx, ps) if err != nil { logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err)) - return nil } - - return nil } state := ps.CalculateState() @@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres actions.Create = desired } + // If preset is hard-limited, and it's a create operation, log it and exit early. + // Creation operation is disallowed for hard-limited preset. + if ps.IsHardLimited && actions.Create > 0 { + logger.Warn(ctx, "skipping hard limited preset for create operation") + return nil + } + var multiErr multierror.Error for range actions.Create { diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index f52a77ca500b9..b7f5974197150 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -696,7 +696,8 @@ func TestSkippingHardLimitedPresets(t *testing.T) { ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) fakeEnqueuer := newFakeEnqueuer() - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer) + registry := prometheus.NewRegistry() + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer) // Template admin to receive a notification. templateAdmin := dbgen.User(t, db, database.User{ @@ -732,6 +733,17 @@ func TestSkippingHardLimitedPresets(t *testing.T) { workspaceCount := len(workspaces) require.Equal(t, 1, workspaceCount) + // Verify initial state: metric is not set - meaning preset is not hard limited. + require.NoError(t, controller.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) + // We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called. // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond. clock.Advance(time.Nanosecond).MustWait(ctx) @@ -755,6 +767,16 @@ func TestSkippingHardLimitedPresets(t *testing.T) { // When hard limit is not reached, a new workspace should be created. require.Equal(t, 2, len(workspaces)) require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus) + + // When hard limit is not reached, metric is not set. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) return } @@ -775,6 +797,252 @@ func TestSkippingHardLimitedPresets(t *testing.T) { return true }) require.Len(t, matching, 1) + + // When hard limit is reached, metric is set to 1. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.NotNil(t, metric) + require.NotNil(t, metric.GetGauge()) + require.EqualValues(t, 1, metric.GetGauge().GetValue()) + }) + } +} + +func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + // Test cases verify the behavior of prebuild creation depending on configured failure limits. + testCases := []struct { + name string + hardLimit int64 + createNewTemplateVersion bool + deleteTemplate bool + }{ + { + // hard limit is hit - but we allow deletion of prebuilt workspace because it's outdated (new template version was created) + name: "new template version is created", + hardLimit: 1, + createNewTemplateVersion: true, + deleteTemplate: false, + }, + { + // hard limit is hit - but we allow deletion of prebuilt workspace because template is deleted + name: "template is deleted", + hardLimit: 1, + createNewTemplateVersion: false, + deleteTemplate: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{ + FailureHardLimit: serpent.Int64(tc.hardLimit), + ReconciliationBackoffInterval: 0, + } + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + fakeEnqueuer := newFakeEnqueuer() + registry := prometheus.NewRegistry() + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer) + + // Template admin to receive a notification. + templateAdmin := dbgen.User(t, db, database.User{ + RBACRoles: []string{codersdk.RoleTemplateAdmin}, + }) + + // Set up test environment with a template, version, and preset. + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, false) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 2, uuid.New().String()) + + // Create a successful prebuilt workspace. + successfulWorkspace, _ := setupTestDBPrebuild( + t, + clock, + db, + pubSub, + database.WorkspaceTransitionStart, + database.ProvisionerJobStatusSucceeded, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + // Make sure that prebuilt workspaces created in such order: [successful, failed]. + clock.Advance(time.Second).MustWait(ctx) + + // Create a failed prebuilt workspace that counts toward the hard failure limit. + setupTestDBPrebuild( + t, + clock, + db, + pubSub, + database.WorkspaceTransitionStart, + database.ProvisionerJobStatusFailed, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + getJobStatusMap := func(workspaces []database.WorkspaceTable) map[database.ProvisionerJobStatus]int { + jobStatusMap := make(map[database.ProvisionerJobStatus]int) + for _, workspace := range workspaces { + workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{ + WorkspaceID: workspace.ID, + }) + require.NoError(t, err) + + for _, workspaceBuild := range workspaceBuilds { + job, err := db.GetProvisionerJobByID(ctx, workspaceBuild.JobID) + require.NoError(t, err) + jobStatusMap[job.JobStatus]++ + } + } + return jobStatusMap + } + + // Verify initial state: two workspaces exist, one successful, one failed. + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + jobStatusMap := getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 2) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + + // Verify initial state: metric is not set - meaning preset is not hard limited. + require.NoError(t, controller.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) + + // We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called. + // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond. + clock.Advance(time.Nanosecond).MustWait(ctx) + + // Trigger reconciliation to attempt creating a new prebuild. + // The outcome depends on whether the hard limit has been reached. + require.NoError(t, controller.ReconcileAll(ctx)) + + // These two additional calls to ReconcileAll should not trigger any notifications. + // A notification is only sent once. + require.NoError(t, controller.ReconcileAll(ctx)) + require.NoError(t, controller.ReconcileAll(ctx)) + + // Verify the final state after reconciliation. + // When hard limit is reached, no new workspace should be created. + workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + jobStatusMap = getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 2) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + + updatedPreset, err := db.GetPresetByID(ctx, preset.ID) + require.NoError(t, err) + require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus) + + // When hard limit is reached, a notification should be sent. + matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool { + if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") { + return false + } + + if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") { + return false + } + + return true + }) + require.Len(t, matching, 1) + + // When hard limit is reached, metric is set to 1. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.NotNil(t, metric) + require.NotNil(t, metric.GetGauge()) + require.EqualValues(t, 1, metric.GetGauge().GetValue()) + + if tc.createNewTemplateVersion { + // Create a new template version and mark it as active + // This marks the template version that we care about as inactive + setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + } + + if tc.deleteTemplate { + require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{ + ID: template.ID, + Deleted: true, + UpdatedAt: dbtime.Now(), + })) + } + + // Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted. + require.NoError(t, controller.ReconcileAll(ctx)) + + workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + + jobStatusMap = getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 3) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + // Pending job should be the job that deletes successful, but outdated prebuilt workspace. + // Prebuilt workspace MUST be deleted, despite the fact that preset is marked as hard limited. + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusPending]) + + workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{ + WorkspaceID: successfulWorkspace.ID, + }) + require.NoError(t, err) + require.Equal(t, 2, len(workspaceBuilds)) + // Make sure that successfully created, but outdated prebuilt workspace was scheduled for deletion. + require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition) + require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition) + + // Metric is deleted after preset became outdated. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) }) } }