8000 refactor: replace startup script logs EOF with starting/ready time (#… · coder/coder@8dac035 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8dac035

Browse files
authored
refactor: replace startup script logs EOF with starting/ready time (#8082)
This commit reverts some of the changes in #8029 and implements an alternative method of keeping track of when the startup script has ended and there will be no more logs. This is achieved by adding new agent fields for tracking when the agent enters the "starting" and "ready"/"start_error" lifecycle states. The timestamps simplify logic since we don't need understand if the current state is before or after the state we're interested in. They can also be used to show data like how long the startup script took to execute. This also allowed us to remove the EOF field from the logs as the implementation was problematic when we returned the EOF log entry in the response since requesting _after_ that ID would give no logs and the API would thus lose track of EOF.
1 parent b1d1b63 commit 8dac035

29 files changed

+462
-540
lines changed

agent/agent.go

Lines changed: 73 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"net/http"
1515
"net/netip"
1616
"os"
17+
"os/exec"
1718
"os/user"
1819
"path/filepath"
1920
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
126127
tempDir: options.TempDir,
127128
lifecycleUpdate: make(chan struct{}, 1),
128129
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
130+
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
129131
ignorePorts: options.IgnorePorts,
130132
connStatsChan: make(chan *agentsdk.Stats, 1),
131133
sshMaxTimeout: options.SSHMaxTimeout,
@@ -170,7 +172,7 @@ type agent struct {
170172
lifecycleUpdate chan struct{}
171173
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
172174
lifecycleMu sync.RWMutex // Protects following.
173-
lifecycleState codersdk.WorkspaceAgentLifecycle
175+
lifecycleStates []agentsdk.PostLifecycleRequest
174176

175177
network *tailnet.Conn
176178
connStatsChan chan *agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
414416
}
415417
}
416418

417-
// reportLifecycleLoop reports the current lifecycle state once.
418-
// Only the latest state is reported, intermediate states may be
419-
// lost if the agent can't communicate with the API.
419+
// reportLifecycleLoop reports the current lifecycle state once. All state
420+
// changes are reported in order.
420421
func (a *agent) reportLifecycleLoop(ctx context.Context) {
421-
var lastReported codersdk.WorkspaceAgentLifecycle
422+
lastReportedIndex := 0 // Start off with the created state without reporting it.
422423
for {
423424
select {
424425
case <-a.lifecycleUpdate:
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
428429

429430
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
430431
a.lifecycleMu.RLock()
431-
state := a.lifecycleState
432+
lastIndex := len(a.lifecycleStates) - 1
433+
report := a.lifecycleStates[lastReportedIndex]
434+
if len(a.lifecycleStates) > lastReportedIndex+1 {
435+
report = a.lifecycleStates[lastReportedIndex+1]
436+
}
432437
a.lifecycleMu.RUnlock()
433438

434-
if state == lastReported {
439+
if lastIndex == lastReportedIndex {
435440
break
436441
}
437442

438-
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state))
443+
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))
439444

440-
err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
441-
State: state,
442-
})
445+
err := a.client.PostLifecycle(ctx, report)
443446
if err == nil {
444-
lastReported = state
447+
lastReportedIndex++
445448
select {
446-
case a.lifecycleReported <- state:
449+
case a.lifecycleReported <- report.State:
447450
case <-a.lifecycleReported:
448-
a.lifecycleReported <- state
451+
a.lifecycleReported <- report.State
452+
}
453+
if lastReportedIndex < lastIndex {
454+
// Keep reporting until we've sent all messages, we can't
455+
// rely on the channel triggering us before the backlog is
456+
// consumed.
457+
continue
449458
}
450459
break
451460
}
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
461470
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
462471
// The state is only updated if it's a valid state transition.
463472
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
473+
report := agentsdk.PostLifecycleRequest{
474+
State: state,
475+
ChangedAt: database.Now(),
476+
}
477+
464478
a.lifecycleMu.Lock()
465-
lastState := a.lifecycleState
466-
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
467-
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
479+
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
480+
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
481+
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
468482
a.lifecycleMu.Unlock()
469483
return
470484
}
471-
a.lifecycleState = state
472-
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
485+
a.lifecycleStates = append(a.lifecycleStates, report)
486+
a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
473487
a.lifecycleMu.Unlock()
474488

475489
select {
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
534548

535549
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
536550
scriptDone := make(chan error, 1)
537-
scriptStart := time.Now()
538551
err = a.trackConnGoroutine(func() {
539552
defer close(scriptDone)
540553
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
556569
select {
557570
case err = <-scriptDone:
558571
case <-timeout:
559-
a.logger.Warn(ctx, "startup script timed out")
572+
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
560573
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
561574
err = <-scriptDone // The script can still complete after a timeout.
562575
}
563-
if errors.Is(err, context.Canceled) {
564-
return
565-
}
566-
// Only log if there was a startup script.
567-
if manifest.StartupScript != "" {
568-
execTime := time.Since(scriptStart)
569-
if err != nil {
570-
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
571-
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
572-
} else {
573-
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
576+
if err != nil {
577+
if errors.Is(err, context.Canceled) {
578+
return
574579
}
580+
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
575581
}
576582
a.setLifecycle(ctx, lifecycleState)
577583
}()
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
830836
return a.runScript(ctx, "shutdown", script)
831837
}
832838

833-
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
839+
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
834840
if script == "" {
835841
return nil
836842
}
837843

838-
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
844+
logger := a.logger.With(slog.F("lifecycle", lifecycle))
845+
846+
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
839847
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
840848
if err != nil {
841849
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
842850
}
843851
defer func() {
844-
_ = fileWriter.Close()
852+
err := fileWriter.Close()
853+
if err != nil {
854+
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
855+
}
845856
}()
846857

858+
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
859+
if err != nil {
860+
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
861+
}
862+
cmd := cmdPty.AsExec()
863+
847864
var writer io.Writer = fileWriter
848865
if lifecycle == "startup" {
849866
// Create pipes for startup logs reader and writer
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
854871
writer = io.MultiWriter(fileWriter, logsWriter)
855872
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
856873
if err != nil {
857-
return xerrors.Errorf("track script logs: %w", err)
874+
return xerrors.Errorf("track %s script logs: %w", lifecycle, err)
858875
}
859876
defer func() {
860877
_ = logsWriter.Close()
861878
<-flushedLogs
862879
}()
863880
}
864881

865-
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
866-
if err != nil {
867-
return xerrors.Errorf("create command: %w", err)
868-
}
869-
cmd := cmdPty.AsExec()
870882
cmd.Stdout = writer
871883
cmd.Stderr = writer
884+
885+
start := time.Now()
886+
defer func() {
887+
end := time.Now()
888+
execTime := end.Sub(start)
889+
exitCode := 0
890+
if err != nil {
891+
exitCode = 255 // Unknown status.
892+
var exitError *exec.ExitError
893+
if xerrors.As(err, &exitError) {
894+
exitCode = exitError.ExitCode()
895+
}
896+
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
897+
} else {
898+
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
899+
}
900+
}()
901+
872902
err = cmd.Run()
873903
if err != nil {
874904
// cmd.Run does not return a context canceled error, it returns "signal: killed".
875905
if ctx.Err() != nil {
876906
return ctx.Err()
877907
}
878908

879-
return xerrors.Errorf("run: %w", err)
909+
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
880910
}
881911
return nil
882912
}
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
929959
if errors.As(err, &sdkErr) {
930960
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
931961
a.logger.Warn(ctx, "startup logs too large, dropping logs")
932-
// Always send the EOF even if logs overflow.
933-
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF {
934-
logsToSend = logsToSend[len(logsToSend)-1:]
935-
continue
936-
}
937962
break
938963
}
939964
}
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
9861011
if err := scanner.Err(); err != nil {
9871012
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
9881013
}
989-
queueLog(agentsdk.StartupLog{
990-
CreatedAt: database.Now(),
991-
Output: "",
992-
EOF: true,
993-
})
9941014
defer close(logsFinished)
9951015
logsFlushed.L.Lock()
9961016
for {
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
13921412
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
13931413
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
13941414
scriptDone := make(chan error, 1)
1395-
scriptStart := time.Now()
13961415
go func() {
13971416
defer close(scriptDone)
13981417
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
14111430
select {
14121431
case err = <-scriptDone:
14131432
case <-timeout:
1414-
a.logger.Warn(ctx, "shutdown script timed out")
1433+
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
14151434
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
14161435
err = <-scriptDone // The script can still complete after a timeout.
14171436
}
1418-
execTime := time.Since(scriptStart)
14191437
if err != nil {
1420-
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
14211438
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
1422-
} else {
1423-
a.logger.Info(ctx, "shutdown script completed", slog.F("exe F438 cution_time", execTime))
14241439
}
14251440
}
14261441

agent/agent_test.go

Lines changed: 9 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -905,10 +905,8 @@ func TestAgent_StartupScript(t *testing.T) {
905905
return len(got) > 0 && got[len(got)-1] == codersdk.WorkspaceAgentLifecycleReady
906906
}, testutil.WaitShort, testutil.IntervalMedium)
907907

908-
require.Len(t, client.getStartupLogs(), 2)
908+
require.Len(t, client.getStartupLogs(), 1)
909909
require.Equal(t, output, client.getStartupLogs()[0].Output)
910-
require.False(t, client.getStartupLogs()[0].EOF)
911-
require.True(t, client.getStartupLogs()[1].EOF)
912910
})
913911
// This ensures that even when coderd sends back that the startup
914912
// script has written too many lines it will still succeed!
@@ -1115,15 +1113,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11151113
got = client.getLifecycleStates()
11161114
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
11171115
}, testutil.WaitShort, testutil.IntervalMedium)
1118-
switch len(got) {
1119-
case 1:
1120-
// This can happen if lifecycle state updates are
1121-
// too fast, only the latest one is reported.
1122-
require.Equal(t, want[1:], got)
1123-
default:
1124-
// This is the expected case.
1125-
require.Equal(t, want, got)
1126-
}
1116+
1117+
require.Equal(t, want, got)
11271118
})
11281119

11291120
t.Run("StartError", func(t *testing.T) {
@@ -1144,15 +1135,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11441135
got = client.getLifecycleStates()
11451136
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
11461137
}, 10000 testutil.WaitShort, testutil.IntervalMedium)
1147-
switch len(got) {
1148-
case 1:
1149-
// This can happen if lifecycle state updates are
1150-
// too fast, only the latest one is reported.
1151-
require.Equal(t, want[1:], got)
1152-
default:
1153-
// This is the expected case.
1154-
require.Equal(t, want, got)
1155-
}
1138+
1139+
require.Equal(t, want, got)
11561140
})
11571141

11581142
t.Run("Ready", func(t *testing.T) {
@@ -1173,15 +1157,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11731157
got = client.getLifecycleStates()
11741158
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
11751159
}, testutil.WaitShort, testutil.IntervalMedium)
1176-
switch len(got) {
1177-
case 1:
1178-
// This can happen if lifecycle state updates are
1179-
// too fast, only the latest one is reported.
1180-
require.Equal(t, want[1:], got)
1181-
default:
1182-
// This is the expected case.
1183-
require.Equal(t, want, got)
1184-
}
1160+
1161+
require.Equal(t, want, got)
11851162
})
11861163

11871164
t.Run("ShuttingDown", func(t *testing.T) {
@@ -1258,15 +1235,7 @@ func TestAgent_Lifecycle(t *testing.T) {
12581235
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
12591236
}, testutil.WaitShort, testutil.IntervalMedium)
12601237

1261-
switch len(got) {
1262-
case 1:
1263-
// This can happen if lifecycle state updates are
1264-
// too fast, only the latest one is reported.
1265-
require.Equal(t, want[1:], got)
1266-
default:
1267-
// This is the expected case.
1268-
require.Equal(t, want, got)
1269-
}
1238+
require.Equal(t, want, got)
12701239
})
12711240

12721241
t.Run("ShutdownError", func(t *testing.T) {
@@ -1305,15 +1274,7 @@ func TestAgent_Lifecycle(t *testing.T) {
13051274
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
13061275
}, testutil.WaitShort, testutil.IntervalMedium)
13071276

1308-
switch len(got) {
1309-
case 1:
1310-
// This can happen if lifecycle state updates are
1311-
// too fast, only the latest one is reported.
1312-
require.Equal(t, want[1:], got)
1313-
default:
1314-
// This is the expected case.
1315-
require.Equal(t, want, got)
1316-
}
1277+
require.Equal(t, want, got)
13171278
})
13181279

13191280
t.Run("ShutdownScriptOnce", func(t *testing.T) {

0 commit comments

Comments
 (0)
0