8000 refactor: replace startup script logs EOF with starting/ready time by mafredri · Pull Request #8082 · coder/coder · GitHub
[go: up one dir, main page]

Skip to content

refactor: replace startup script logs EOF with starting/ready time #8082

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 73 additions & 58 deletions agent/agent.go
F438
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"net/http"
"net/netip"
"os"
"os/exec"
"os/user"
"path/filepath"
"sort"
Expand Down Expand Up @@ -126,6 +127,7 @@ func New(options Options) Agent {
tempDir: options.TempDir,
lifecycleUpdate: make(chan struct{}, 1),
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
ignorePorts: options.IgnorePorts,
connStatsChan: make(chan *agentsdk.Stats, 1),
sshMaxTimeout: options.SSHMaxTimeout,
Expand Down Expand Up @@ -170,7 +172,7 @@ type agent struct {
lifecycleUpdate chan struct{}
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
lifecycleMu sync.RWMutex // Protects following.
lifecycleState codersdk.WorkspaceAgentLifecycle
lifecycleStates []agentsdk.PostLifecycleRequest

network *tailnet.Conn
connStatsChan chan *agentsdk.Stats
Expand Down Expand Up @@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
}
}

// reportLifecycleLoop reports the current lifecycle state once.
// Only the latest state is reported, intermediate states may be
// lost if the agent can't communicate with the API.
// reportLifecycleLoop reports the current lifecycle state once. All state
// changes are reported in order.
Copy link
Member Author
@mafredri mafredri Jun 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review: Originally, this was a weird/performance choice to only submit the latest status which simply resulted in needing to document the behavior increasing the complexity. We now change it so that the agent always reports all states (still non-blocking), we achieve this by adding the timestamp for the event in the payload.

func (a *agent) reportLifecycleLoop(ctx context.Context) {
var lastReported codersdk.WorkspaceAgentLifecycle
lastReportedIndex := 0 // Start off with the created state without reporting it.
for {
select {
case <-a.lifecycleUpdate:
Expand All @@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {

for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
a.lifecycleMu.RLock()
state := a.lifecycleState
lastIndex := len(a.lifecycleStates) - 1
report := a.lifecycleStates[lastReportedIndex]
if len(a.lifecycleStates) > lastReportedIndex+1 {
report = a.lifecycleStates[lastReportedIndex+1]
}
a.lifecycleMu.RUnlock()

if state == lastReported {
if lastIndex == lastReportedIndex {
break
}

a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state))
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))

err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
State: state,
})
err := a.client.PostLifecycle(ctx, report)
if err == nil {
lastReported = state
lastReportedIndex++
select {
case a.lifecycleReported <- state:
case a.lifecycleReported <- report.State:
case <-a.lifecycleReported:
a.lifecycleReported <- state
a.lifecycleReported <- report.State
}
if lastReportedIndex < lastIndex {
// Keep reporting until we've sent all messages, we can't
// rely on the channel triggering us before the backlog is
// consumed.
continue
}
break
}
Expand All @@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
// The state is only updated if it's a valid state transition.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
report := agentsdk.PostLifecycleRequest{
State: state,
ChangedAt: database.Now(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

database.Now() or time.Now()?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentionally used database.Now() here for consistency since it's a value that will be stored in the DB. Startup logs also use database.Now() but we do seem to have some mixed use in the agent and some other places may be wrong?

Thoughts @kylecarbs? For all intents and purposes, this shouldn't matter since the DB fields are timestamptz, what's the motivation for using database.Now() that always uses UTC? Logging purposes?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the motivation for using database.Now() that always uses UTC? Logging purposes?

  • It might be a good candidate for the linter rule.

}

a.lifecycleMu.Lock()
lastState := a.lifecycleState
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
a.lifecycleMu.Unlock()
return
}
a.lifecycleState = state
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
a.lifecycleStates = append(a.lifecycleStates, report)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review: Guaranteed to not grow unboundedly (len(a.lifecycleStates) <= enum entries).

a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
a.lifecycleMu.Unlock()

select {
Expand Down Expand Up @@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {

lifecycleState := codersdk.WorkspaceAgentLifecycleReady
scriptDone := make(chan error, 1)
scriptStart := time.Now()
err = a.trackConnGoroutine(func() {
defer close(scriptDone)
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
Expand All @@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "startup script timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
if errors.Is(err, context.Canceled) {
return
}
// Only log if there was a startup script.
if manifest.StartupScript != "" {
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
} else {
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
if err != nil {
if errors.Is(err, context.Canceled) {
return
}
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
}
a.setLifecycle(ctx, lifecycleState)
}()
Expand Down Expand Up @@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
return a.runScript(ctx, "shutdown", script)
}

func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
if script == "" {
return nil
}

a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
logger := a.logger.With(slog.F("lifecycle", lifecycle))

logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
}
defer func() {
_ = fileWriter.Close()
err := fileWriter.Close()
if err != nil {
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
}
}()

cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
}
cmd := cmdPty.AsExec()

var writer io.Writer = fileWriter
if lifecycle == "startup" {
// Create pipes for startup logs reader and writer
Expand All @@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
writer = io.MultiWriter(fileWriter, logsWriter)
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
if err != nil {
return xerrors.Errorf("track script logs: %w", err)
return xerrors.Errorf("track %s script logs: %w", lifecycle, err)
}
defer func() {
_ = logsWriter.Close()
<-flushedLogs
}()
}

cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("create command: %w", err)
}
cmd := cmdPty.AsExec()
cmd.Stdout = writer
cmd.Stderr = writer

start := time.Now()
defer func() {
end := time.Now()
execTime := end.Sub(start)
exitCode := 0
if err != nil {
exitCode = 255 // Unknown status.
var exitError *exec.ExitError
if xerrors.As(err, &exitError) {
exitCode = exitError.ExitCode()
}
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
} else {
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
}
}()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review: Small cleanup to unify logging between startup and shutdown scripts.


err = cmd.Run()
if err != nil {
// cmd.Run does not return a context canceled error, it returns "signal: killed".
if ctx.Err() != nil {
return ctx.Err()
}

return xerrors.Errorf("run: %w", err)
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
}
return nil
}
Expand Down Expand Up @@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if errors.As(err, &sdkErr) {
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
a.logger.Warn(ctx, "startup logs too large, dropping logs")
// Always send the EOF even if logs overflow.
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF {
logsToSend = logsToSend[len(logsToSend)-1:]
continue
}
break
}
}
Expand Down Expand Up @@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if err := scanner.Err(); err != nil {
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
}
queueLog(agentsdk.StartupLog{
CreatedAt: database.Now(),
Output: "",
EOF: true,
})
defer close(logsFinished)
logsFlushed.L.Lock()
for {
Expand Down Expand Up @@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
scriptDone := make(chan error, 1)
scriptStart := time.Now()
go func() {
defer close(scriptDone)
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
Expand All @@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "shutdown script timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
} else {
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
}
}

Expand Down
57 changes: 9 additions & 48 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -905,10 +905,8 @@ func TestAgent_StartupScript(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == codersdk.WorkspaceAgentLifecycleReady
}, testutil.WaitShort, testutil.IntervalMedium)

require.Len(t, client.getStartupLogs(), 2)
require.Len(t, client.getStartupLogs(), 1)
require.Equal(t, output, client.getStartupLogs()[0].Output)
require.False(t, client.getStartupLogs()[0].EOF)
require.True(t, client.getStartupLogs()[1].EOF)
})
// This ensures that even when coderd sends back that the startup
// script has written too many lines it will still succeed!
Expand Down Expand Up @@ -1115,15 +1113,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("StartError", func(t *testing.T) {
Expand All @@ -1144,15 +1135,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("Ready", func(t *testing.T) {
Expand All @@ -1173,15 +1157,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("ShuttingDown", func(t *testing.T) {
Expand Down Expand Up @@ -1258,15 +1235,7 @@ func TestAgent_Lifecycle(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)

switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}
require.Equal(t, want, got)
})

t.Run("ShutdownError", func(t *testing.T) {
Expand Down Expand Up @@ -1305,15 +1274,7 @@ func TestAgent_Lifecycle(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)

switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}
require.Equal(t, want, got)
})

t.Run("ShutdownScriptOnce", func(t *testing.T) {
Expand Down
Loading
0