8000 fix(agent): start devcontainers through agentcontainers package (#18471) · coder/coder@c4e4fe8 · GitHub
[go: up one dir, main page]

Skip to content

Commit c4e4fe8

Browse files
fix(agent): start devcontainers through agentcontainers package (#18471)
Fixes coder/internal#706 Context for the implementation here coder/internal#706 (comment) Synchronously starts dev containers defined in terraform with our `DevcontainerCLI` abstraction, instead of piggybacking off of our `agentscripts` package. This gives us more control over logs, instead of being reliant on packages which may or may not exist in the user-provided image.
1 parent f6d9765 commit c4e4fe8

File tree

12 files changed

+304
-480
lines changed

12 files changed

+304
-480
lines changed

agent/agent.go

Lines changed: 77 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ type Options struct {
9191
Execer agentexec.Execer
9292
Devcontainers bool
9393
DevcontainerAPIOptions []agentcontainers.Option // Enable Devcontainers for these to be effective.
94+
Clock quartz.Clock
9495
}
9596

9697
type Client interface {
@@ -144,6 +145,9 @@ func New(options Options) Agent {
144145
if options.PortCacheDuration == 0 {
145146
options.PortCacheDuration = 1 * time.Second
146147
}
148+
if options.Clock == nil {
149+
options.Clock = quartz.NewReal()
150+
}
147151

148152
prometheusRegistry := options.PrometheusRegistry
149153
if prometheusRegistry == nil {
@@ -157,6 +161,7 @@ func New(options Options) Agent {
157161
hardCtx, hardCancel := context.WithCancel(context.Background())
158162
gracefulCtx, gracefulCancel := context.WithCancel(hardCtx)
159163
a := &agent{
164+
clock: options.Clock,
160165
tailnetListenPort: options.TailnetListenPort,
161166
reconnectingPTYTimeout: options.ReconnectingPTYTimeout,
162167
logger: options.Logger,
@@ -204,6 +209,7 @@ func New(options Options) Agent {
204209
}
205210

206211
type agent struct {
212+
clock quartz.Clock
207213
logger slog.Logger
208214
client Client
209215
exchangeToken func(ctx context.Context) (string, error)
@@ -273,7 +279,7 @@ type agent struct {
273279

274280
devcontainers bool
275281
containerAPIOptions []agentcontainers.Option
276-
containerAPI atomic.Pointer[agentcontainers.API] // Set by apiHandler.
282+
containerAPI *agentcontainers.API
277283
}
278284

279285
func (a *agent) TailnetConn() *tailnet.Conn {
@@ -330,6 +336,19 @@ func (a *agent) init() {
330336
// will not report anywhere.
331337
a.scriptRunner.RegisterMetrics(a.prometheusRegistry)
332338

339+
if a.devcontainers {
340+
containerAPIOpts := []agentcontainers.Option{
341+
agentcontainers.WithExecer(a.execer),
342+
agentcontainers.WithCommandEnv(a.sshServer.CommandEnv),
343+
agentcontainers.WithScriptLogger(func(logSourceID uuid.UUID) agentcontainers.ScriptLogger {
344+
return a.logSender.GetScriptLogger(logSourceID)
345+
}),
346+
}
347+
containerAPIOpts = append(containerAPIOpts, a.containerAPIOptions...)
348+
349+
a.containerAPI = agentcontainers.NewAPI(a.logger.Named("containers"), containerAPIOpts...)
350+
}
351+
333352
a.reconnectingPTYServer = reconnectingpty.NewServer(
334353
a.logger.Named("reconnecting-pty"),
335354
a.sshServer,
@@ -1141,15 +1160,18 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
11411160
}
11421161

11431162
var (
1144-
scripts = manifest.Scripts
1145-
scriptRunnerOpts []agentscripts.InitOption
1163+
scripts = manifest.Scripts
1164+
scriptRunnerOpts []agentscripts.InitOption
1165+
devcontainerScripts map[uuid.UUID]codersdk.WorkspaceAgentScript
11461166
)
11471167
if a.devcontainers {
1148-
var dcScripts []codersdk.WorkspaceAgentScript
1149-
scripts, dcScripts = agentcontainers.ExtractAndInitializeDevcontainerScripts(manifest.Devcontainers, scripts)
1150-
// See ExtractAndInitializeDevcontainerScripts for motivation
1151-
// behind running dcScripts as post start scripts.
1152-
scriptRunnerOpts = append(scriptRunnerOpts, agentscripts.WithPostStartScripts(dcScripts...))
1168+
a.containerAPI.Init(
1169+
agentcontainers.WithManifestInfo(manifest.OwnerName, manifest.WorkspaceName),
1170+
agentcontainers.WithDevcontainers(manifest.Devcontainers, scripts),
1171+
agentcontainers.WithSubAgentClient(agentcontainers.NewSubAgentClientFromAPI(a.logger, aAPI)),
1172+
)
1173+
1174+
scripts, devcontainerScripts = agentcontainers.ExtractDevcontainerScripts(manifest.Devcontainers, scripts)
11531175
}
11541176
err = a.scriptRunner.Init(scripts, aAPI.ScriptCompleted, scriptRunnerOpts...)
11551177
if err != nil {
@@ -1168,7 +1190,12 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
11681190
// finished (both start and post start). For instance, an
11691191
// autostarted devcontainer will be included in this time.
11701192
err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
1171-
err = errors.Join(err, a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecutePostStartScripts))
1193+
1194+
for _, dc := range manifest.Devcontainers {
1195+
cErr := a.createDevcontainer(ctx, aAPI, dc, devcontainerScripts[dc.ID])
1196+
err = errors.Join(err, cErr)
1197+
}
1198+
11721199
dur := time.Since(start).Seconds()
11731200
if err != nil {
11741201
a.logger.Warn(ctx, "startup script(s) failed", slog.Error(err))
@@ -1187,14 +1214,6 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
11871214
}
11881215
a.metrics.startupScriptSeconds.WithLabelValues(label).Set(dur)
11891216
a.scriptRunner.StartCron()
1190-
1191-
// If the container API is enabled, trigger an immediate refresh
1192-
// for quick sub agent injection.
1193-
if cAPI := a.containerAPI.Load(); cAPI != nil {
1194-
if err := cAPI.RefreshContainers(ctx); err != nil {
1195-
a.logger.Error(ctx, "failed to refresh containers", slog.Error(err))
1196-
}
1197-
}
11981217
})
11991218
if err != nil {
12001219
return xerrors.Errorf("track conn goroutine: %w", err)
@@ -1204,6 +1223,38 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
12041223
}
12051224
}
12061225

1226+
func (a *agent) createDevcontainer(
1227+
ctx context.Context,
1228+
aAPI proto.DRPCAgentClient26,
1229+
dc codersdk.WorkspaceAgentDevcontainer,
1230+
script codersdk.WorkspaceAgentScript,
1231+
) (err error) {
1232+
var (
1233+
exitCode = int32(0)
1234+
startTime = a.clock.Now()
1235+
status = proto.Timing_OK
1236+
)
1237+
if err = a.containerAPI.CreateDevcontainer(dc.WorkspaceFolder, dc.ConfigPath); err != nil {
1238+
exitCode = 1
1239+
status = proto.Timing_EXIT_FAILURE
1240+
}
1241+
endTime := a.clock.Now()
1242+
1243+
if _, scriptErr := aAPI.ScriptCompleted(ctx, &proto.WorkspaceAgentScriptCompletedRequest{
1244+
Timing: &proto.Timing{
1245+
ScriptId: script.ID[:],
1246+
Start: timestamppb.New(startTime),
1247+
End: timestamppb.New(endTime),
1248+
ExitCode: exitCode,
1249+
Stage: proto.Timing_START,
1250+
Status: status,
1251+
},
1252+
}); scriptErr != nil {
1253+
a.logger.Warn(ctx, "reporting script completed failed", slog.Error(scriptErr))
1254+
}
1255+
return err
1256+
}
1257+
12071258
// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates
12081259
// the tailnet using the information in the manifest
12091260
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient26) error {
@@ -1227,7 +1278,6 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
12271278
// agent API.
12281279
network, err = a.createTailnet(
12291280
a.gracefulCtx,
1230-
aAPI,
12311281
manifest.AgentID,
12321282
manifest.DERPMap,
12331283
manifest.DERPForceWebSockets,
@@ -1262,9 +1312,9 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
12621312
network.SetBlockEndpoints(manifest.DisableDirectConnections)
12631313

12641314
// Update the subagent client if the container API is available.
1265-
if cAPI := a.containerAPI.Load(); cAPI != nil {
1315+
if a.containerAPI != nil {
12661316
client := agentcontainers.NewSubAgentClientFromAPI(a.logger, aAPI)
1267-
cAPI.UpdateSubAgentClient(client)
1317+
a.containerAPI.UpdateSubAgentClient(client)
12681318
}
12691319
}
12701320
return nil
@@ -1382,7 +1432,6 @@ func (a *agent) trackGoroutine(fn func()) error {
13821432

13831433
func (a *agent) createTailnet(
13841434
ctx context.Context,
1385-
aAPI proto.DRPCAgentClient26,
13861435
agentID uuid.UUID,
13871436
derpMap *tailcfg.DERPMap,
13881437
derpForceWebSockets, disableDirectConnections bool,
@@ -1515,10 +1564,7 @@ func (a *agent) createTailnet(
15151564
}()
15161565
if err = a.trackGoroutine(func() {
15171566
defer apiListener.Close()
1518-
apiHandler, closeAPIHAndler := a.apiHandler(aAPI)
1519-
defer func() {
1520-
_ = closeAPIHAndler()
1521-
}()
1567+
apiHandler := a.apiHandler()
15221568
server := &http.Server{
15231569
BaseContext: func(net.Listener) context.Context { return ctx },
15241570
Handler: apiHandler,
@@ -1532,7 +1578,6 @@ func (a *agent) createTailnet(
15321578
case <-ctx.Done():
15331579
case <-a.hardCtx.Done():
15341580
}
1535-
_ = closeAPIHAndler()
15361581
_ = server.Close()
15371582
}()
15381583

@@ -1871,6 +1916,12 @@ func (a *agent) Close() error {
18711916
a.logger.Error(a.hardCtx, "script runner close", slog.Error(err))
18721917
}
18731918

1919+
if a.containerAPI != nil {
1920+
if err := a.containerAPI.Close(); err != nil {
1921+
a.logger.Error(a.hardCtx, "container API close", slog.Error(err))
1922+
}
1923+
}
1924+
18741925
// Wait for the graceful shutdown to complete, but don't wait forever so
18751926
// that we don't break user expectations.
18761927
go func() {

agent/agentcontainers/api.go

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ func WithDevcontainers(devcontainers []codersdk.WorkspaceAgentDevcontainer, scri
207207
api.devcontainerNames = make(map[string]bool, len(devcontainers))
208208
api.devcontainerLogSourceIDs = make(map[string]uuid.UUID)
209209
for _, dc := range devcontainers {
210+
if dc.Status == "" {
211+
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStarting
212+
}
213+
210214
api.knownDevcontainers[dc.WorkspaceFolder] = dc
211215
api.devcontainerNames[dc.Name] = true
212216
for _, script := range scripts {
@@ -265,8 +269,6 @@ func NewAPI(logger slog.Logger, options ...Option) *API {
265269
api := &API{
266270
ctx: ctx,
267271
cancel: cancel,
268-
watcherDone: make(chan struct{}),
269-
updaterDone: make(chan struct{}),
270272
initialUpdateDone: make(chan struct{}),
271273
updateTrigger: make(chan chan error),
272274
updateInterval: defaultUpdateInterval,
@@ -315,10 +317,28 @@ func NewAPI(logger slog.Logger, options ...Option) *API {
315317
api.subAgentClient.Store(&c)
316318
}
317319

320+
return api
321+
}
322+
323+
// Init applies a final set of options to the API and then
324+
// begins the watcherLoop and updaterLoop. This function
325+
// must only be called once.
326+
func (api *API) Init(opts ...Option) {
327+
api.mu.Lock()
328+
defer api.mu.Unlock()
329+
if api.closed {
330+
return
331+
}
332+
333+
for _, opt := range opts {
334+
opt(api)
335+
}
336+
337+
api.watcherDone = make(chan struct{})
338+
api.updaterDone = make(chan struct{})
339+
318340
go api.watcherLoop()
319341
go api.updaterLoop()
320-
321-
return api
322342
}
323343

324344
func (api *API) watcherLoop() {
@@ -909,8 +929,9 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques
909929
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStarting
910930
dc.Container = nil
911931
api.knownDevcontainers[dc.WorkspaceFolder] = dc
912-
api.asyncWg.Add(1)
913-
go api.recreateDevcontainer(dc, configPath)
932+
go func() {
933+
_ = api.CreateDevcontainer(dc.WorkspaceFolder, configPath, WithRemoveExistingContainer())
934+
}()
914935

915936
api.mu.Unlock()
916937

@@ -920,15 +941,29 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques
920941
})
921942
}
922943

923-
// recreateDevcontainer should run in its own goroutine and is responsible for
944+
// createDevcontainer should run in its own goroutine and is responsible for
924945
// recreating a devcontainer based on the provided devcontainer configuration.
925946
// It updates the devcontainer status and logs the process. The configPath is
926947
// passed as a parameter for the odd chance that the container being recreated
927948
// has a different config file than the one stored in the devcontainer state.
928949
// The devcontainer state must be set to starting and the asyncWg must be
929950
// incremented before calling this function.
930-
func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, configPath string) {
951+
func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...DevcontainerCLIUpOptions) error {
952+
api.mu.Lock()
953+
if api.closed {
954+
api.mu.Unlock()
955+
return nil
956+
}
957+
958+
dc, found := api.knownDevcontainers[workspaceFolder]
959+
if !found {
960+
api.mu.Unlock()
961+
return xerrors.Errorf("devcontainer not found")
962+
}
963+
964+
api.asyncWg.Add(1)
931965
defer api.asyncWg.Done()
966+
api.mu.Unlock()
932967

933968
var (
934969
err error
@@ -969,12 +1004,15 @@ func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, con
9691004

9701005
logger.Debug(ctx, "starting devcontainer recreation")
9711006

972-
_, err = api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, WithUpOutput(infoW, errW), WithRemoveExistingContainer())
1007+
upOptions := []DevcontainerCLIUpOptions{WithUpOutput(infoW, errW)}
1008+
upOptions = append(upOptions, opts...)
1009+
1010+
_, err = api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
9731011
if err != nil {
9741012
// No need to log if the API is closing (context canceled), as this
9751013
// is expected behavior when the API is shutting down.
9761014
if !errors.Is(err, context.Canceled) {
977-
logger.Error(ctx, "devcontainer recreation failed", slog.Error(err))
1015+
logger.Error(ctx, "devcontainer creation failed", slog.Error(err))
9781016
}
9791017

9801018
api.mu.Lock()
@@ -983,10 +1021,11 @@ func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, con
9831021
api.knownDevcontainers[dc.WorkspaceFolder] = dc
9841022
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
9851023
api.mu.Unlock()
986-
return
1024+
1025+
return xerrors.Errorf("start devcontainer: %w", err)
9871026
}
9881027

989-
logger.Info(ctx, "devcontainer recreated successfully")
1028+
logger.Info(ctx, "devcontainer created successfully")
9901029

9911030
api.mu.Lock()
9921031
dc = api.knownDevcontainers[dc.WorkspaceFolder]
@@ -1009,8 +1048,11 @@ func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, con
10091048
// Ensure an immediate refresh to accurately reflect the
10101049
// devcontainer state after recreation.
10111050
if err := api.RefreshContainers(ctx); err != nil {
1012-
logger.Error(ctx, "failed to trigger immediate refresh after devcontainer recreation", slog.Error(err))
1051+
logger.Error(ctx, "failed to trigger immediate refresh after devcontainer creation", slog.Error(err))
1052+
return xerrors.Errorf("refresh containers: %w", err)
10131053
}
1054+
1055+
return nil
10141056
}
10151057

10161058
// markDevcontainerDirty finds the devcontainer with the given config file path
@@ -1609,8 +1651,12 @@ func (api *API) Close() error {
16091651
err := api.watcher.Close()
16101652

16111653
// Wait for loops to finish.
1612-
<-api.watcherDone
1613-
<-api.updaterDone
1654+
if api.watcherDone != nil {
1655+
<-api.watcherDone
1656+
}
1657+
if api.updaterDone != nil {
1658+
<-api.updaterDone
1659+
}
16141660

16151661
// Wait for all async tasks to complete.
16161662
api.asyncWg.Wait()

0 commit comments

Comments
 (0)
0