-
Notifications
You must be signed in to change notification settings - Fork 938
refactor: replace startup script logs EOF with starting/ready time #8082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
73f19d6
43a0f87
8ff8c02
b4f78eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ import ( | |
"net/http" | ||
"net/netip" | ||
"os" | ||
"os/exec" | ||
"os/user" | ||
"path/filepath" | ||
"sort" | ||
|
@@ -126,6 +127,7 @@ func New(options Options) Agent { | |
tempDir: options.TempDir, | ||
lifecycleUpdate: make(chan struct{}, 1), | ||
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1), | ||
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}}, | ||
ignorePorts: options.IgnorePorts, | ||
connStatsChan: make(chan *agentsdk.Stats, 1), | ||
sshMaxTimeout: options.SSHMaxTimeout, | ||
|
@@ -170,7 +172,7 @@ type agent struct { | |
lifecycleUpdate chan struct{} | ||
lifecycleReported chan codersdk.WorkspaceAgentLifecycle | ||
lifecycleMu sync.RWMutex // Protects following. | ||
lifecycleState codersdk.WorkspaceAgentLifecycle | ||
lifecycleStates []agentsdk.PostLifecycleRequest | ||
|
||
network *tailnet.Conn | ||
connStatsChan chan *agentsdk.Stats | ||
|
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { | |
} | ||
} | ||
|
||
// reportLifecycleLoop reports the current lifecycle state once. | ||
// Only the latest state is reported, intermediate states may be | ||
// lost if the agent can't communicate with the API. | ||
// reportLifecycleLoop reports the current lifecycle state once. All state | ||
// changes are reported in order. | ||
func (a *agent) reportLifecycleLoop(ctx context.Context) { | ||
var lastReported codersdk.WorkspaceAgentLifecycle | ||
lastReportedIndex := 0 // Start off with the created state without reporting it. | ||
for { | ||
select { | ||
case <-a.lifecycleUpdate: | ||
|
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) { | |
|
||
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { | ||
a.lifecycleMu.RLock() | ||
state := a.lifecycleState | ||
lastIndex := len(a.lifecycleStates) - 1 | ||
report := a.lifecycleStates[lastReportedIndex] | ||
if len(a.lifecycleStates) > lastReportedIndex+1 { | ||
report = a.lifecycleStates[lastReportedIndex+1] | ||
} | ||
a.lifecycleMu.RUnlock() | ||
|
||
if state == lastReported { | ||
if lastIndex == lastReportedIndex { | ||
break | ||
} | ||
|
||
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state)) | ||
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report)) | ||
|
||
err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{ | ||
State: state, | ||
}) | ||
err := a.client.PostLifecycle(ctx, report) | ||
if err == nil { | ||
lastReported = state | ||
lastReportedIndex++ | ||
select { | ||
case a.lifecycleReported <- state: | ||
case a.lifecycleReported <- report.State: | ||
case <-a.lifecycleReported: | ||
a.lifecycleReported <- state | ||
a.lifecycleReported <- report.State | ||
} | ||
if lastReportedIndex < lastIndex { | ||
// Keep reporting until we've sent all messages, we can't | ||
// rely on the channel triggering us before the backlog is | ||
// consumed. | ||
continue | ||
} | ||
break | ||
} | ||
|
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) { | |
// setLifecycle sets the lifecycle state and notifies the lifecycle loop. | ||
// The state is only updated if it's a valid state transition. | ||
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) { | ||
report := agentsdk.PostLifecycleRequest{ | ||
State: state, | ||
ChangedAt: database.Now(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Intentionally used Thoughts @kylecarbs? For all intents and purposes, this shouldn't matter since the DB fields are There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
|
||
a.lifecycleMu.Lock() | ||
lastState := a.lifecycleState | ||
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) { | ||
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state)) | ||
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1] | ||
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) { | ||
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report)) | ||
a.lifecycleMu.Unlock() | ||
return | ||
} | ||
a.lifecycleState = state | ||
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState)) | ||
a.lifecycleStates = append(a.lifecycleStates, report) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Review: Guaranteed to not grow unboundedly ( |
||
a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport)) | ||
a.lifecycleMu.Unlock() | ||
|
||
select { | ||
|
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error { | |
|
||
lifecycleState := codersdk.WorkspaceAgentLifecycleReady | ||
scriptDone := make(chan error, 1) | ||
scriptStart := time.Now() | ||
err = a.trackConnGoroutine(func() { | ||
defer close(scriptDone) | ||
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript) | ||
|
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error { | |
select { | ||
case err = <-scriptDone: | ||
case <-timeout: | ||
a.logger.Warn(ctx, "startup script timed out") | ||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout)) | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout) | ||
err = <-scriptDone // The script can still complete after a timeout. | ||
} | ||
if errors.Is(err, context.Canceled) { | ||
return | ||
} | ||
// Only log if there was a startup script. | ||
if manifest.StartupScript != "" { | ||
execTime := time.Since(scriptStart) | ||
if err != nil { | ||
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err)) | ||
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError | ||
} else { | ||
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime)) | ||
if err != nil { | ||
if errors.Is(err, context.Canceled) { | ||
return | ||
} | ||
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError | ||
} | ||
a.setLifecycle(ctx, lifecycleState) | ||
}() | ||
|
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error { | |
return a.runScript(ctx, "shutdown", script) | ||
} | ||
|
||
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error { | ||
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) { | ||
if script == "" { | ||
return nil | ||
} | ||
|
||
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script)) | ||
logger := a.logger.With(slog.F("lifecycle", lifecycle)) | ||
|
||
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script)) | ||
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600) | ||
if err != nil { | ||
return xerrors.Errorf("open %s script log file: %w", lifecycle, err) | ||
} | ||
defer func() { | ||
_ = fileWriter.Close() | ||
err := fileWriter.Close() | ||
if err != nil { | ||
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err)) | ||
} | ||
}() | ||
|
||
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil) | ||
if err != nil { | ||
return xerrors.Errorf("%s script: create command: %w", lifecycle, err) | ||
} | ||
cmd := cmdPty.AsExec() | ||
|
||
var writer io.Writer = fileWriter | ||
if lifecycle == "startup" { | ||
// Create pipes for startup logs reader and writer | ||
|
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error { | |
writer = io.MultiWriter(fileWriter, logsWriter) | ||
flushedLogs, err := a.trackScriptLogs(ctx, logsReader) | ||
if err != nil { | ||
return xerrors.Errorf("track script logs: %w", err) | ||
return xerrors.Errorf("track %s script logs: %w", lifecycle, err) | ||
} | ||
defer func() { | ||
_ = logsWriter.Close() | ||
<-flushedLogs | ||
}() | ||
} | ||
|
||
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil) | ||
if err != nil { | ||
return xerrors.Errorf("create command: %w", err) | ||
} | ||
cmd := cmdPty.AsExec() | ||
cmd.Stdout = writer | ||
cmd.Stderr = writer | ||
|
||
start := time.Now() | ||
defer func() { | ||
end := time.Now() | ||
execTime := end.Sub(start) | ||
exitCode := 0 | ||
if err != nil { | ||
exitCode = 255 // Unknown status. | ||
var exitError *exec.ExitError | ||
if xerrors.As(err, &exitError) { | ||
exitCode = exitError.ExitCode() | ||
} | ||
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err)) | ||
} else { | ||
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode)) | ||
} | ||
}() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Review: Small cleanup to unify logging between startup and shutdown scripts. |
||
|
||
err = cmd.Run() | ||
if err != nil { | ||
// cmd.Run does not return a context canceled error, it returns "signal: killed". | ||
if ctx.Err() != nil { | ||
return ctx.Err() | ||
} | ||
|
||
return xerrors.Errorf("run: %w", err) | ||
return xerrors.Errorf("%s script: run: %w", lifecycle, err) | ||
} | ||
return nil | ||
} | ||
|
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str | |
if errors.As(err, &sdkErr) { | ||
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge { | ||
a.logger.Warn(ctx, "startup logs too large, dropping logs") | ||
// Always send the EOF even if logs overflow. | ||
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF { | ||
logsToSend = logsToSend[len(logsToSend)-1:] | ||
continue | ||
} | ||
break | ||
} | ||
} | ||
|
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str | |
if err := scanner.Err(); err != nil { | ||
a.logger.Error(ctx, "scan startup logs", slog.Error(err)) | ||
} | ||
queueLog(agentsdk.StartupLog{ | ||
CreatedAt: database.Now(), | ||
Output: "", | ||
EOF: true, | ||
}) | ||
defer close(logsFinished) | ||
logsFlushed.L.Lock() | ||
for { | ||
|
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error { | |
lifecycleState := codersdk.WorkspaceAgentLifecycleOff | ||
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" { | ||
scriptDone := make(chan error, 1) | ||
scriptStart := time.Now() | ||
go func() { | ||
defer close(scriptDone) | ||
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript) | ||
|
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error { | |
select { | ||
case err = <-scriptDone: | ||
case <-timeout: | ||
a.logger.Warn(ctx, "shutdown script timed out") | ||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout)) | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout) | ||
err = <-scriptDone // The script can still complete after a timeout. | ||
} | ||
execTime := time.Since(scriptStart) | ||
if err != nil { | ||
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err)) | ||
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError | ||
} else { | ||
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime)) | ||
} | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Review: Originally, this was a weird/performance choice to only submit the latest status which simply resulted in needing to document the behavior increasing the complexity. We now change it so that the agent always reports all states (still non-blocking), we achieve this by adding the timestamp for the event in the payload.