feat: enable agent connection reports by default, remove flag (cherry-pick #16778) by matifali · Pull Request #16809 · coder/coder · GitHub
[go: up one dir, main page]

Skip to content

feat: enable agent connection reports by default, remove flag (cherry-pick #16778) #16809

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
172e523
feat(agent): wire up agentssh server to allow exec into container (#1…
johnstcn Feb 26, 2025
38c0e8a
fix(agent/agentssh): ensure RSA key generation always produces valid …
ThomasK33 Feb 26, 2025
c5a265f
feat(cli): add experimental rpty command (#16700)
johnstcn Feb 26, 2025
a2cc1b8
fix: display premium banner on audit page when license inactive (#16713)
mtojek Feb 26, 2025
2971957
ci: also restart tagged provisioner deployment (#16716)
johnstcn Feb 26, 2025
f1b357d
feat: support session audit log (#16703)
BrunoQuaresma Feb 26, 2025
b94d2cb
fix(coderd): handle deletes and links for new agent/app audit resourc…
mafredri Feb 26, 2025
7c035a4
fix: remove provisioners from deployment sidebar (#16717)
BrunoQuaresma Feb 26, 2025
7cd6e9c
fix: return provisioners in desc order and add limit to cli (#16720)
mafredri Feb 26, 2025
5295902
docs: clarified prometheus integration behavior (#16724)
michaelvp411 Feb 26, 2025
1cb864b
fix: allow viewOrgRoles for custom roles page (#16722)
jaaydenh Feb 26, 2025
81ef9e9
docs: document new feature stages (#16719)
EdwardAngert Feb 26, 2025
2aa749a
chore(cli): fix test flake caused by agent connect race (#16725)
johnstcn Feb 26, 2025
6b69635
chore: warn user without permissions to view org members (#16721)
jaaydenh Feb 26, 2025
5cdc13b
docs: fix broken links in feature-stages (#16727)
EdwardAngert Feb 26, 2025
b3d6755
docs: copy edit early access section in feature-stages doc (#16730)
EdwardAngert Feb 27, 2025
95363c9
fix(enterprise/coderd): remove useless provisioner daemon id from req…
johnstcn Feb 27, 2025
6dd51f9
chore: test metricscache on postgres (#16711)
DanielleMaywood Feb 27, 2025
4ba5a8a
feat(agent): add connection reporting for SSH and reconnecting PTY (#…
mafredri Feb 27, 2025
cccdf1e
feat: implement WorkspaceCreationBan org role (#16686)
Emyrk Feb 27, 2025
464fccd
chore: create collapsible summary component (#16705)
jaaydenh Feb 27, 2025
bf5b002
fix: add org role read permissions to site wide template admins and a…
jaaydenh Feb 27, 2025
91a4a98
chore: add an unassign action for roles (#16728)
aslilac Feb 27, 2025
0ea0601
fix: handle undefined job while updating build progress (#16732)
mtojek Feb 27, 2025
7e33902
chore: use org-scoped roles for organization groups and members e2e t…
aslilac Feb 27, 2025
b23e05b
fix(vpn): fail early if wintun.dll is not present (#16707)
deansheather Feb 28, 2025
3997eee
chore: update tailscale (#16737)
deansheather Feb 28, 2025
64fec8b
feat: include winres metadata in Windows binaries (#16706)
deansheather Feb 28, 2025
ec44f06
feat(cli): allow SSH command to connect to running container (#16726)
johnstcn Feb 28, 2025
6889ad2
fix(agent/agentcontainers): remove empty warning if no containers exi…
johnstcn Feb 28, 2025
e27953d
fix(site): add a beta badge for presets (#16751)
SasSwart Feb 28, 2025
930816f
fix: locate Terraform entrypoint file (#16753)
mtojek Feb 28, 2025
4216e28
fix: editor: fallback to default entrypoint (#16757)
mtojek Feb 28, 2025
fc2815c
docs: fix anchor and repo links (#16555)
guspan-tanadi Mar 2, 2025
ca23abe
feat(provisioner): add support for workspace_owner_rbac_roles (#16407)
nxf5025 Mar 2, 2025
d0e2060
feat(agent): add second SSH listener on port 22 (#16627)
ThomasK33 Mar 3, 2025
c074f77
feat: add notifications inbox db (#16599)
defelmnq Mar 3, 2025
a5842e5
docs: document default GitHub OAuth2 configuration and device flow (#…
hugodutka Mar 3, 2025
9c5d496
docs: suggest disabling the default GitHub OAuth2 provider on k8s (#1…
hugodutka Mar 3, 2025
0f4f6bd
docs: describe default sign up behavior with GitHub (#16765)
hugodutka Mar 3, 2025
88f0131
fix: use dbtime in dbmem query to fix flake (#16773)
ethanndickson Mar 3, 2025
04c3396
refactor: replace `golang.org/x/exp/slices` with `slices` (#16772)
Juneezee Mar 3, 2025
ca23abc
chore(cli): fix test flake in TestSSH_Container/NotFound (#16771)
johnstcn Mar 3, 2025
7dc05cc
feat: enable agent connection reports by default, remove flag
mafredri Mar 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat(agent): add connection reporting for SSH and reconnecting PTY (#16652)

Updates #15139
  • Loading branch information
mafredri authored Feb 27, 2025
commit 4ba5a8a2ba8ec5a03c7b2360797806aeb3158bff
158 changes: 158 additions & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"hash/fnv"
"io"
"net"
"net/http"
"net/netip"
"os"
Expand All @@ -28,6 +29,7 @@ import (
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"google.golang.org/protobuf/types/known/timestamppb"
"tailscale.com/net/speedtest"
"tailscale.com/tailcfg"
"tailscale.com/types/netlogtype"
Expand Down Expand Up @@ -90,6 +92,7 @@ type Options struct {
ContainerLister agentcontainers.Lister

ExperimentalContainersEnabled bool
ExperimentalConnectionReports bool
}

type Client interface {
Expand Down Expand Up @@ -177,6 +180,7 @@ func New(options Options) Agent {
lifecycleUpdate: make(chan struct{}, 1),
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
reportConnectionsUpdate: make(chan struct{}, 1),
ignorePorts: options.IgnorePorts,
portCacheDuration: options.PortCacheDuration,
reportMetadataInterval: options.ReportMetadataInterval,
Expand All @@ -192,6 +196,7 @@ func New(options Options) Agent {
lister: options.ContainerLister,

experimentalDevcontainersEnabled: options.ExperimentalContainersEnabled,
experimentalConnectionReports: options.ExperimentalConnectionReports,
}
// Initially, we have a closed channel, reflecting the fact that we are not initially connected.
// Each time we connect we replace the channel (while holding the closeMutex) with a new one
Expand Down Expand Up @@ -252,6 +257,10 @@ type agent struct {
lifecycleStates []agentsdk.PostLifecycleRequest
lifecycleLastReportedIndex int // Keeps track of the last lifecycle state we successfully reported.

reportConnectionsUpdate chan struct{}
reportConnectionsMu sync.Mutex
reportConnections []*proto.ReportConnectionRequest

network *tailnet.Conn
statsReporter *statsReporter
logSender *agentsdk.LogSender
Expand All @@ -264,6 +273,7 @@ type agent struct {
lister agentcontainers.Lister

experimentalDevcontainersEnabled bool
experimentalConnectionReports bool
}

func (a *agent) TailnetConn() *tailnet.Conn {
Expand All @@ -279,6 +289,24 @@ func (a *agent) init() {
UpdateEnv: a.updateCommandEnv,
WorkingDirectory: func() string { return a.manifest.Load().Directory },
BlockFileTransfer: a.blockFileTransfer,
ReportConnection: func(id uuid.UUID, magicType agentssh.MagicSessionType, ip string) func(code int, reason string) {
var connectionType proto.Connection_Type
switch magicType {
case agentssh.MagicSessionTypeSSH:
connectionType = proto.Connection_SSH
case agentssh.MagicSessionTypeVSCode:
connectionType = proto.Connection_VSCODE
case agentssh.MagicSessionTypeJetBrains:
connectionType = proto.Connection_JETBRAINS
case agentssh.MagicSessionTypeUnknown:
connectionType = proto.Connection_TYPE_UNSPECIFIED
default:
a.logger.Error(a.hardCtx, "unhandled magic session type when reporting connection", slog.F("magic_type", magicType))
connectionType = proto.Connection_TYPE_UNSPECIFIED
}

return a.reportConnection(id, connectionType, ip)
},
})
if err != nil {
panic(err)
Expand All @@ -301,6 +329,9 @@ func (a *agent) init() {
a.reconnectingPTYServer = reconnectingpty.NewServer(
a.logger.Named("reconnecting-pty"),
a.sshServer,
func(id uuid.UUID, ip string) func(code int, reason string) {
return a.reportConnection(id, proto.Connection_RECONNECTING_PTY, ip)
},
a.metrics.connectionsTotal, a.metrics.reconnectingPTYErrors,
a.reconnectingPTYTimeout,
func(s *reconnectingpty.Server) {
Expand Down Expand Up @@ -713,6 +744,129 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
}
}

// reportConnectionsLoop sends the buffered connection reports over the agent
// API (aAPI) for auditing.
//
// The loop sleeps until reportConnectionsUpdate is signaled, then sends the
// buffered reports one at a time, oldest first, until the buffer is empty.
// A report is only removed from the buffer after it was sent successfully.
//
// It returns ctx.Err() when ctx is done while waiting for a signal, or a
// wrapped error when sending a report fails. In the failure case the update
// signal is re-armed so that the unsent report is retried as soon as the loop
// runs again, rather than waiting for the next connect/disconnect event.
// NOTE(review): presumably the loop is started again once the agent API
// connection is re-established — confirm against startAgentAPI.
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
	for {
		select {
		case <-a.reportConnectionsUpdate:
		case <-ctx.Done():
			return ctx.Err()
		}

		for {
			a.reportConnectionsMu.Lock()
			if len(a.reportConnections) == 0 {
				a.reportConnectionsMu.Unlock()
				break
			}
			payload := a.reportConnections[0]
			// Release lock while we send the payload, this is safe
			// since producers only append to the slice (assumes this
			// loop is the only place that removes entries).
			a.reportConnectionsMu.Unlock()

			logger := a.logger.With(slog.F("payload", payload))
			logger.Debug(ctx, "reporting connection")
			_, err := aAPI.ReportConnection(ctx, payload)
			if err != nil {
				// The signal that woke us up has been consumed but the
				// payload is still buffered. Re-arm the signal (without
				// blocking if one is already pending) so the backlog is
				// not stranded until an unrelated connection event.
				select {
				case a.reportConnectionsUpdate <- struct{}{}:
				default:
				}
				return xerrors.Errorf("failed to report connection: %w", err)
			}

			logger.Debug(ctx, "successfully reported connection")

			// Remove the payload we sent.
			a.reportConnectionsMu.Lock()
			a.reportConnections[0] = nil // Release the pointer from the underlying array.
			a.reportConnections = a.reportConnections[1:]
			a.reportConnectionsMu.Unlock()
		}
	}
}

// reportConnectionBufferLimit caps how many connection reports are kept in
// the buffer so that it cannot grow without bound. Hitting the cap is only
// expected when the agent has been disconnected from coderd for a long time
// or is being flooded with connections.
//
// At roughly 150 bytes per connection report the full buffer is about 300KB
// of memory, which seems acceptable. Not using the proto struct directly
// would be one way to reduce this if it ever becomes necessary.
const reportConnectionBufferLimit = 2048

// reportConnection buffers a CONNECT report for the connection identified by
// id and returns a callback that buffers the matching DISCONNECT report.
//
// When experimentalConnectionReports is off, nothing is buffered and the
// returned callback is a no-op. If ip is in host:port form the port is
// stripped first; when it cannot be split the error is logged and ip is used
// unchanged. A report that would push the buffer past
// reportConnectionBufferLimit is dropped with a warning. Dropping the connect
// does not prevent the disconnect from being attempted later.
//
// The returned callback takes the status code and reason that are attached to
// the DISCONNECT report.
func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_Type, ip string) (disconnected func(code int, reason string)) {
	// If the experiment hasn't been enabled, we don't report connections.
	if !a.experimentalConnectionReports {
		return func(int, string) {} // Noop.
	}

	// Remove the port from the IP because ports are not supported in coderd.
	host, _, splitErr := net.SplitHostPort(ip)
	if splitErr == nil {
		// Best effort.
		ip = host
	} else {
		a.logger.Error(a.hardCtx, "split host and port for connection report failed", slog.F("ip", ip), slog.Error(splitErr))
	}

	// enqueue appends the report produced by build to the buffer and wakes up
	// the reporting loop, or logs dropMsg when the buffer is full. build is
	// only invoked while the lock is held and only if the report is kept, so
	// the timestamp reflects the moment the report was buffered.
	enqueue := func(dropMsg string, build func() *proto.ReportConnectionRequest) {
		a.reportConnectionsMu.Lock()
		defer a.reportConnectionsMu.Unlock()

		if len(a.reportConnections) >= reportConnectionBufferLimit {
			a.logger.Warn(a.hardCtx, dropMsg,
				slog.F("limit", reportConnectionBufferLimit),
				slog.F("connection_id", id),
				slog.F("connection_type", connectionType),
				slog.F("ip", ip),
			)
			return
		}

		a.reportConnections = append(a.reportConnections, build())

		// Non-blocking wake-up: a pending signal already covers this report.
		select {
		case a.reportConnectionsUpdate <- struct{}{}:
		default:
		}
	}

	enqueue("connection report buffer limit reached, dropping connect", func() *proto.ReportConnectionRequest {
		// StatusCode and Reason keep their zero values (0 and nil) on connect.
		return &proto.ReportConnectionRequest{
			Connection: &proto.Connection{
				Id:        id[:],
				Action:    proto.Connection_CONNECT,
				Type:      connectionType,
				Timestamp: timestamppb.New(time.Now()),
				Ip:        ip,
			},
		}
	})

	return func(code int, reason string) {
		enqueue("connection report buffer limit reached, dropping disconnect", func() *proto.ReportConnectionRequest {
			return &proto.ReportConnectionRequest{
				Connection: &proto.Connection{
					Id:         id[:],
					Action:     proto.Connection_DISCONNECT,
					Type:       connectionType,
					Timestamp:  timestamppb.New(time.Now()),
					Ip:         ip,
					StatusCode: int32(code), //nolint:gosec
					Reason:     &reason,
				},
			}
		})
	}
}

// fetchServiceBannerLoop fetches the service banner on an interval. It will
// not be fetched immediately; the expectation is that it is primed elsewhere
// (and must be done before the session actually starts).
Expand Down Expand Up @@ -823,6 +977,10 @@ func (a *agent) run() (retErr error) {
return resourcesmonitor.Start(ctx)
})

// Connection reports are part of auditing, we should keep sending them via
// gracefulShutdownBehaviorRemain.
connMan.startAgentAPI("report connections", gracefulShutdownBehaviorRemain, a.reportConnectionsLoop)

// channels to sync goroutines below
// handle manifest
// |
Expand Down
Loading
Loading
0