diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index 1e772cded92fb..4cb4940a03775 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -8208,6 +8208,40 @@ const docTemplate = `{ "ProvisionerStorageMethodFile" ] }, + "codersdk.ProxyHealthReport": { + "type": "object", + "properties": { + "errors": { + "description": "Errors are problems that prevent the workspace proxy from being healthy", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "description": "Warnings do not prevent the workspace proxy from being healthy, but\nshould be addressed.", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "codersdk.ProxyHealthStatus": { + "type": "string", + "enum": [ + "reachable", + "unreachable", + "unhealthy", + "unregistered" + ], + "x-enum-varnames": [ + "ProxyReachable", + "ProxyUnreachable", + "ProxyUnhealthy", + "ProxyUnregistered" + ] + }, "codersdk.PutExtendWorkspaceRequest": { "type": "object", "required": [ @@ -9701,6 +9735,14 @@ const docTemplate = `{ "name": { "type": "string" }, + "status": { + "description": "Status is the latest status check of the proxy. This will be empty for deleted\nproxies. 
This value can be used to determine if a workspace proxy is healthy\nand ready to use.", + "allOf": [ + { + "$ref": "#/definitions/codersdk.WorkspaceProxyStatus" + } + ] + }, "updated_at": { "type": "string", "format": "date-time" @@ -9715,6 +9757,26 @@ const docTemplate = `{ } } }, + "codersdk.WorkspaceProxyStatus": { + "type": "object", + "properties": { + "checked_at": { + "type": "string", + "format": "date-time" + }, + "report": { + "description": "Report provides more information about the health of the workspace proxy.", + "allOf": [ + { + "$ref": "#/definitions/codersdk.ProxyHealthReport" + } + ] + }, + "status": { + "$ref": "#/definitions/codersdk.ProxyHealthStatus" + } + } + }, "codersdk.WorkspaceQuota": { "type": "object", "properties": { diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index fd277d4f23eef..558360d0c0cc6 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -7354,6 +7354,35 @@ "enum": ["file"], "x-enum-varnames": ["ProvisionerStorageMethodFile"] }, + "codersdk.ProxyHealthReport": { + "type": "object", + "properties": { + "errors": { + "description": "Errors are problems that prevent the workspace proxy from being healthy", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "description": "Warnings do not prevent the workspace proxy from being healthy, but\nshould be addressed.", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "codersdk.ProxyHealthStatus": { + "type": "string", + "enum": ["reachable", "unreachable", "unhealthy", "unregistered"], + "x-enum-varnames": [ + "ProxyReachable", + "ProxyUnreachable", + "ProxyUnhealthy", + "ProxyUnregistered" + ] + }, "codersdk.PutExtendWorkspaceRequest": { "type": "object", "required": ["deadline"], @@ -8764,6 +8793,14 @@ "name": { "type": "string" }, + "status": { + "description": "Status is the latest status check of the proxy. This will be empty for deleted\nproxies. 
This value can be used to determine if a workspace proxy is healthy\nand ready to use.", + "allOf": [ + { + "$ref": "#/definitions/codersdk.WorkspaceProxyStatus" + } + ] + }, "updated_at": { "type": "string", "format": "date-time" @@ -8778,6 +8815,26 @@ } } }, + "codersdk.WorkspaceProxyStatus": { + "type": "object", + "properties": { + "checked_at": { + "type": "string", + "format": "date-time" + }, + "report": { + "description": "Report provides more information about the health of the workspace proxy.", + "allOf": [ + { + "$ref": "#/definitions/codersdk.ProxyHealthReport" + } + ] + }, + "status": { + "$ref": "#/definitions/codersdk.ProxyHealthStatus" + } + } + }, "codersdk.WorkspaceQuota": { "type": "object", "properties": { diff --git a/codersdk/workspaceproxy.go b/codersdk/workspaceproxy.go index 9a902a6b17835..57f180b4e7aff 100644 --- a/codersdk/workspaceproxy.go +++ b/codersdk/workspaceproxy.go @@ -12,17 +12,55 @@ import ( "github.com/google/uuid" ) +type ProxyHealthStatus string + +const ( + // ProxyReachable means the proxy access url is reachable and returns a healthy + // status code. + ProxyReachable ProxyHealthStatus = "reachable" + // ProxyUnreachable means the proxy access url is not responding. + ProxyUnreachable ProxyHealthStatus = "unreachable" + // ProxyUnhealthy means the proxy access url is responding, but there is some + // problem with the proxy. This problem may or may not be preventing functionality. + ProxyUnhealthy ProxyHealthStatus = "unhealthy" + // ProxyUnregistered means the proxy has not registered a url yet. This means + // the proxy was created with the cli, but has not yet been started. + ProxyUnregistered ProxyHealthStatus = "unregistered" +) + +type WorkspaceProxyStatus struct { + Status ProxyHealthStatus `json:"status" table:"status"` + // Report provides more information about the health of the workspace proxy. 
// ProxyHealthReport is a report of the health of the workspace proxy.
// A healthy report will have no errors. Warnings are not fatal.
//
// The JSON keys are pinned with struct tags so the encoded payload uses the
// lowercase "errors" / "warnings" keys that the API schema documents, rather
// than falling back to the exported Go field names. Decoding stays
// case-insensitive, so payloads written with the old keys still load.
type ProxyHealthReport struct {
	// Errors are problems that prevent the workspace proxy from being healthy
	Errors []string `json:"errors"`
	// Warnings do not prevent the workspace proxy from being healthy, but
	// should be addressed.
	Warnings []string `json:"warnings"`
}
+ Status WorkspaceProxyStatus `json:"status,omitempty" table:"status"` } type CreateWorkspaceProxyRequest struct { diff --git a/docs/api/enterprise.md b/docs/api/enterprise.md index 41bb8b8816673..fbee85b9970f1 100644 --- a/docs/api/enterprise.md +++ b/docs/api/enterprise.md @@ -1185,6 +1185,14 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies \ "icon": "string", "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08", "name": "string", + "status": { + "checked_at": "2019-08-24T14:15:22Z", + "report": { + "errors": ["string"], + "warnings": ["string"] + }, + "status": "reachable" + }, "updated_at": "2019-08-24T14:15:22Z", "url": "string", "wildcard_hostname": "string" @@ -1202,17 +1210,32 @@ curl -X GET http://coder-server:8080/api/v2/workspaceproxies \ Status Code **200** -| Name | Type | Required | Restrictions | Description | -| --------------------- | ----------------- | -------- | ------------ | -------------------------------------------------------------------------------------- | -| `[array item]` | array | false | | | -| `» created_at` | string(date-time) | false | | | -| `» deleted` | boolean | false | | | -| `» icon` | string | false | | | -| `» id` | string(uuid) | false | | | -| `» name` | string | false | | | -| `» updated_at` | string(date-time) | false | | | -| `» url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com | -| `» wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com | +| Name | Type | Required | Restrictions | Description | +| --------------------- | ------------------------------------------------------------------------ | -------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `[array item]` | array | false | | | +| `» created_at` | string(date-time) | false | | | +| `» 
deleted` | boolean | false | | | +| `» icon` | string | false | | | +| `» id` | string(uuid) | false | | | +| `» name` | string | false | | | +| `» status` | [codersdk.WorkspaceProxyStatus](schemas.md#codersdkworkspaceproxystatus) | false | | Status is the latest status check of the proxy. This will be empty for deleted proxies. This value can be used to determine if a workspace proxy is healthy and ready to use. | +| `»» checked_at` | string(date-time) | false | | | +| `»» report` | [codersdk.ProxyHealthReport](schemas.md#codersdkproxyhealthreport) | false | | Report provides more information about the health of the workspace proxy. | +| `»»» errors` | array | false | | Errors are problems that prevent the workspace proxy from being healthy | +| `»»» warnings` | array | false | | Warnings do not prevent the workspace proxy from being healthy, but should be addressed. | +| `»» status` | [codersdk.ProxyHealthStatus](schemas.md#codersdkproxyhealthstatus) | false | | | +| `» updated_at` | string(date-time) | false | | | +| `» url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com | +| `» wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com | + +#### Enumerated Values + +| Property | Value | +| -------- | -------------- | +| `status` | `reachable` | +| `status` | `unreachable` | +| `status` | `unhealthy` | +| `status` | `unregistered` | To perform this operation, you must be authenticated. [Learn more](authentication.md). 
@@ -1257,6 +1280,14 @@ curl -X POST http://coder-server:8080/api/v2/workspaceproxies \ "icon": "string", "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08", "name": "string", + "status": { + "checked_at": "2019-08-24T14:15:22Z", + "report": { + "errors": ["string"], + "warnings": ["string"] + }, + "status": "reachable" + }, "updated_at": "2019-08-24T14:15:22Z", "url": "string", "wildcard_hostname": "string" diff --git a/docs/api/schemas.md b/docs/api/schemas.md index 74d37cd758df3..d297b31169065 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -3382,6 +3382,39 @@ Parameter represents a set value for the scope. | ------ | | `file` | +## codersdk.ProxyHealthReport + +```json +{ + "errors": ["string"], + "warnings": ["string"] +} +``` + +### Properties + +| Name | Type | Required | Restrictions | Description | +| ---------- | --------------- | -------- | ------------ | ---------------------------------------------------------------------------------------- | +| `errors` | array of string | false | | Errors are problems that prevent the workspace proxy from being healthy | +| `warnings` | array of string | false | | Warnings do not prevent the workspace proxy from being healthy, but should be addressed. | + +## codersdk.ProxyHealthStatus + +```json +"reachable" +``` + +### Properties + +#### Enumerated Values + +| Value | +| -------------- | +| `reachable` | +| `unreachable` | +| `unhealthy` | +| `unregistered` | + ## codersdk.PutExtendWorkspaceRequest ```json @@ -5192,6 +5225,14 @@ Parameter represents a set value for the scope. "icon": "string", "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08", "name": "string", + "status": { + "checked_at": "2019-08-24T14:15:22Z", + "report": { + "errors": ["string"], + "warnings": ["string"] + }, + "status": "reachable" + }, "updated_at": "2019-08-24T14:15:22Z", "url": "string", "wildcard_hostname": "string" @@ -5200,16 +5241,38 @@ Parameter represents a set value for the scope. 
### Properties -| Name | Type | Required | Restrictions | Description | -| ------------------- | ------- | -------- | ------------ | -------------------------------------------------------------------------------------- | -| `created_at` | string | false | | | -| `deleted` | boolean | false | | | -| `icon` | string | false | | | -| `id` | string | false | | | -| `name` | string | false | | | -| `updated_at` | string | false | | | -| `url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com | -| `wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com | +| Name | Type | Required | Restrictions | Description | +| ------------------- | -------------------------------------------------------------- | -------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `created_at` | string | false | | | +| `deleted` | boolean | false | | | +| `icon` | string | false | | | +| `id` | string | false | | | +| `name` | string | false | | | +| `status` | [codersdk.WorkspaceProxyStatus](#codersdkworkspaceproxystatus) | false | | Status is the latest status check of the proxy. This will be empty for deleted proxies. This value can be used to determine if a workspace proxy is healthy and ready to use. 
| +| `updated_at` | string | false | | | +| `url` | string | false | | Full URL including scheme of the proxy api url: https://us.example.com | +| `wildcard_hostname` | string | false | | Wildcard hostname with the wildcard for subdomain based app hosting: \*.us.example.com | + +## codersdk.WorkspaceProxyStatus + +```json +{ + "checked_at": "2019-08-24T14:15:22Z", + "report": { + "errors": ["string"], + "warnings": ["string"] + }, + "status": "reachable" +} +``` + +### Properties + +| Name | Type | Required | Restrictions | Description | +| ------------ | -------------------------------------------------------- | -------- | ------------ | ------------------------------------------------------------------------- | +| `checked_at` | string | false | | | +| `report` | [codersdk.ProxyHealthReport](#codersdkproxyhealthreport) | false | | Report provides more information about the health of the workspace proxy. | +| `status` | [codersdk.ProxyHealthStatus](#codersdkproxyhealthstatus) | false | | | ## codersdk.WorkspaceQuota diff --git a/enterprise/cli/proxyserver.go b/enterprise/cli/proxyserver.go index 855eb98f26570..af5716424bc0e 100644 --- a/enterprise/cli/proxyserver.go +++ b/enterprise/cli/proxyserver.go @@ -227,6 +227,7 @@ func (*RootCmd) proxyServer() *clibase.Cmd { proxy, err := wsproxy.New(ctx, &wsproxy.Options{ Logger: logger, + HTTPClient: httpClient, DashboardURL: primaryAccessURL.Value(), AccessURL: cfg.AccessURL.Value(), AppHostname: appHostname, diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 287e9b38db8f7..ed0aea963b0e8 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -24,6 +24,7 @@ import ( "github.com/coder/coder/coderd/schedule" "github.com/coder/coder/codersdk" "github.com/coder/coder/enterprise/coderd/license" + "github.com/coder/coder/enterprise/coderd/proxyhealth" "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/enterprise/replicasync" 
"github.com/coder/coder/enterprise/tailnet" @@ -52,9 +53,11 @@ func New(ctx context.Context, options *Options) (*API, error) { } ctx, cancelFunc := context.WithCancel(ctx) api := &API{ - AGPL: coderd.New(options.Options), - Options: options, - cancelEntitlementsLoop: cancelFunc, + ctx: ctx, + cancel: cancelFunc, + + AGPL: coderd.New(options.Options), + Options: options, } api.AGPL.Options.SetUserGroups = api.setUserGroups @@ -226,6 +229,24 @@ func New(ctx context.Context, options *Options) (*API, error) { } api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig) + if api.AGPL.Experiments.Enabled(codersdk.ExperimentMoons) { + // Proxy health is a moon feature. + api.proxyHealth, err = proxyhealth.New(&proxyhealth.Options{ + Interval: time.Second * 5, + DB: api.Database, + Logger: options.Logger.Named("proxyhealth"), + Client: api.HTTPClient, + Prometheus: api.PrometheusRegistry, + }) + if err != nil { + return nil, xerrors.Errorf("initialize proxy health: %w", err) + } + go api.proxyHealth.Run(ctx) + // Force the initial loading of the cache. Do this in a go routine in case + // the calls to the workspace proxies hang and this takes some time. + go api.forceWorkspaceProxyHealthUpdate(ctx) + } + err = api.updateEntitlements(ctx) if err != nil { return nil, xerrors.Errorf("update entitlements: %w", err) @@ -249,6 +270,7 @@ type Options struct { DERPServerRegionID int EntitlementsUpdateInterval time.Duration + ProxyHealthInterval time.Duration Keys map[string]ed25519.PublicKey } @@ -256,18 +278,24 @@ type API struct { AGPL *coderd.API *Options + // ctx is canceled immediately on shutdown, it can be used to abort + // interruptible tasks. + ctx context.Context + cancel context.CancelFunc + // Detects multiple Coder replicas running at the same time. replicaManager *replicasync.Manager // Meshes DERP connections from multiple replicas. derpMesh *derpmesh.Mesh + // proxyHealth checks the reachability of all workspace proxies. 
+ proxyHealth *proxyhealth.ProxyHealth - cancelEntitlementsLoop func() - entitlementsMu sync.RWMutex - entitlements codersdk.Entitlements + entitlementsMu sync.RWMutex + entitlements codersdk.Entitlements } func (api *API) Close() error { - api.cancelEntitlementsLoop() + api.cancel() _ = api.replicaManager.Close() _ = api.derpMesh.Close() return api.AGPL.Close() diff --git a/enterprise/coderd/proxyhealth/proxyhealth.go b/enterprise/coderd/proxyhealth/proxyhealth.go new file mode 100644 index 0000000000000..ab532f5892618 --- /dev/null +++ b/enterprise/coderd/proxyhealth/proxyhealth.go @@ -0,0 +1,292 @@ +package proxyhealth + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/sync/errgroup" + "golang.org/x/xerrors" + + "cdr.dev/slog" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" + "github.com/coder/coder/coderd/prometheusmetrics" + "github.com/coder/coder/codersdk" +) + +type Status string + +const ( + // Unknown should never be returned by the proxy health check. + Unknown Status = "unknown" + // Healthy means the proxy access url is reachable and returns a healthy + // status code. + Healthy Status = "ok" + // Unreachable means the proxy access url is not responding. + Unreachable Status = "unreachable" + // Unhealthy means the proxy access url is responding, but there is some + // problem with the proxy. This problem may or may not be preventing functionality. + Unhealthy Status = "unhealthy" + // Unregistered means the proxy has not registered a url yet. This means + // the proxy was created with the cli, but has not yet been started. + Unregistered Status = "unregistered" +) + +type Options struct { + // Interval is the interval at which the proxy health is checked. 
+ Interval time.Duration + DB database.Store + Logger slog.Logger + Client *http.Client + Prometheus *prometheus.Registry +} + +// ProxyHealth runs a go routine that periodically checks the health of all +// workspace proxies. This information is stored in memory, so each coderd +// replica has its own view of the health of the proxies. These views should be +// consistent, and if they are not, it indicates a problem. +type ProxyHealth struct { + db database.Store + interval time.Duration + logger slog.Logger + client *http.Client + + cache *atomic.Pointer[map[uuid.UUID]ProxyStatus] + + // PromMetrics + healthCheckDuration prometheus.Histogram + healthCheckResults *prometheusmetrics.CachedGaugeVec +} + +func New(opts *Options) (*ProxyHealth, error) { + if opts.Interval <= 0 { + opts.Interval = time.Minute + } + if opts.DB == nil { + return nil, xerrors.Errorf("db is required") + } + if opts.Prometheus == nil { + opts.Prometheus = prometheus.NewRegistry() + } + + client := opts.Client + if client == nil { + client = http.DefaultClient + } + // Set a timeout on the client, so we don't wait forever for a healthz response. + tmp := *client + tmp.Timeout = time.Second * 5 + client = &tmp + + // Prometheus metrics + healthCheckDuration := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "proxyhealth", + Name: "health_check_duration_seconds", + Help: "Histogram for duration of proxy health collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + opts.Prometheus.MustRegister(healthCheckDuration) + + healthCheckResults := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "proxyhealth", + Name: "health_check_results", + Help: "This endpoint returns a number to indicate the health status. 
" + + "-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)", + }, []string{"proxy_id"})) + opts.Prometheus.MustRegister(healthCheckResults) + + return &ProxyHealth{ + db: opts.DB, + interval: opts.Interval, + logger: opts.Logger, + client: client, + cache: &atomic.Pointer[map[uuid.UUID]ProxyStatus]{}, + healthCheckDuration: healthCheckDuration, + healthCheckResults: healthCheckResults, + }, nil +} + +// Run will block until the context is canceled. It will periodically check the +// health of all proxies and store the results in the cache. +func (p *ProxyHealth) Run(ctx context.Context) { + ticker := time.NewTicker(p.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case now := <-ticker.C: + statuses, err := p.runOnce(ctx, now) + if err != nil { + p.logger.Error(ctx, "proxy health check failed", slog.Error(err)) + continue + } + // Store the statuses in the cache. + p.cache.Store(&statuses) + } + } +} + +// ForceUpdate runs a single health check and updates the cache. If the health +// check fails, the cache is not updated and an error is returned. This is useful +// to trigger an update when a proxy is created or deleted. +func (p *ProxyHealth) ForceUpdate(ctx context.Context) error { + statuses, err := p.runOnce(ctx, time.Now()) + if err != nil { + return err + } + + // Store the statuses in the cache. + p.cache.Store(&statuses) + return nil +} + +// HealthStatus returns the current health status of all proxies stored in the +// cache. +func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus { + ptr := p.cache.Load() + if ptr == nil { + return map[uuid.UUID]ProxyStatus{} + } + return *ptr +} + +type ProxyStatus struct { + // ProxyStatus includes the value of the proxy at the time of checking. This is + // useful to know as it helps determine if the proxy checked has different values + // then the proxy in hand. AKA if the proxy was updated, and the status was for + // an older proxy. 
+ Proxy database.WorkspaceProxy + Status Status + Report codersdk.ProxyHealthReport + CheckedAt time.Time +} + +// runOnce runs the health check for all workspace proxies. If there is an +// unexpected error, an error is returned. Expected errors will mark a proxy as +// unreachable. +func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) { + // Record from the given time. + defer p.healthCheckDuration.Observe(time.Since(now).Seconds()) + + //nolint:gocritic // Proxy health is a system service. + proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx)) + if err != nil { + return nil, xerrors.Errorf("get workspace proxies: %w", err) + } + + // Just use a mutex to protect map writes. + var statusMu sync.Mutex + proxyStatus := map[uuid.UUID]ProxyStatus{} + + grp, gctx := errgroup.WithContext(ctx) + // Arbitrary parallelism limit. + grp.SetLimit(5) + + for _, proxy := range proxies { + if proxy.Deleted { + // Ignore deleted proxies. + continue + } + // Each proxy needs to have a status set. Make a local copy for the + // call to be run async. + proxy := proxy + status := ProxyStatus{ + Proxy: proxy, + CheckedAt: now, + Status: Unknown, + } + + grp.Go(func() error { + if proxy.Url == "" { + // Empty URL means the proxy has not registered yet. + // When the proxy is started, it will update the url. + statusMu.Lock() + defer statusMu.Unlock() + p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String()) + status.Status = Unregistered + proxyStatus[proxy.ID] = status + return nil + } + + // Try to hit the healthz-report endpoint for a comprehensive health check. 
+ reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/")) + req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil) + if err != nil { + return xerrors.Errorf("new request: %w", err) + } + req = req.WithContext(gctx) + + resp, err := p.client.Do(req) + if err == nil { + defer resp.Body.Close() + } + // A switch statement felt easier to categorize the different cases than + // if else statements or nested if statements. + switch { + case err == nil && resp.StatusCode == http.StatusOK: + err := json.NewDecoder(resp.Body).Decode(&status.Report) + if err != nil { + // If we cannot read the report, mark the proxy as unhealthy. + status.Report.Errors = []string{fmt.Sprintf("failed to decode health report: %s", err.Error())} + status.Status = Unhealthy + break + } + if len(status.Report.Errors) > 0 { + status.Status = Unhealthy + break + } + status.Status = Healthy + case err == nil && resp.StatusCode != http.StatusOK: + // Unhealthy as we did reach the proxy but it got an unexpected response. + status.Status = Unhealthy + status.Report.Errors = []string{fmt.Sprintf("unexpected status code %d", resp.StatusCode)} + case err != nil: + // Request failed, mark the proxy as unreachable. + status.Status = Unreachable + status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())} + default: + // This should never happen + status.Status = Unknown + } + + // Set the prometheus metric correctly. 
+ switch status.Status { + case Healthy: + p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String()) + case Unhealthy: + p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String()) + case Unreachable: + p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String()) + default: + // Unknown + p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String()) + } + + statusMu.Lock() + defer statusMu.Unlock() + proxyStatus[proxy.ID] = status + return nil + }) + } + + err = grp.Wait() + if err != nil { + return nil, xerrors.Errorf("group run: %w", err) + } + p.healthCheckResults.Commit() + + return proxyStatus, nil +} diff --git a/enterprise/coderd/proxyhealth/proxyhealth_test.go b/enterprise/coderd/proxyhealth/proxyhealth_test.go new file mode 100644 index 0000000000000..5fb9614385c5f --- /dev/null +++ b/enterprise/coderd/proxyhealth/proxyhealth_test.go @@ -0,0 +1,174 @@ +package proxyhealth_test + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/xerrors" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbfake" + "github.com/coder/coder/coderd/database/dbgen" + "github.com/coder/coder/coderd/httpapi" + "github.com/coder/coder/codersdk" + "github.com/coder/coder/enterprise/coderd/proxyhealth" + "github.com/coder/coder/testutil" +) + +func insertProxy(t *testing.T, db database.Store, url string) database.WorkspaceProxy { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel() + + proxy, _ := dbgen.WorkspaceProxy(t, db, database.WorkspaceProxy{}) + _, err := db.RegisterWorkspaceProxy(ctx, database.RegisterWorkspaceProxyParams{ + Url: url, + WildcardHostname: "", + ID: proxy.ID, + }) + require.NoError(t, err, "failed to 
update proxy") + return proxy +} + +func TestProxyHealth_Unregistered(t *testing.T) { + t.Parallel() + db := dbfake.New() + + proxies := []database.WorkspaceProxy{ + insertProxy(t, db, ""), + insertProxy(t, db, ""), + } + + ph, err := proxyhealth.New(&proxyhealth.Options{ + Interval: 0, + DB: db, + Logger: slogtest.Make(t, nil), + }) + require.NoError(t, err, "failed to create proxy health") + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel() + + err = ph.ForceUpdate(ctx) + require.NoError(t, err, "failed to force update") + for _, p := range proxies { + require.Equal(t, ph.HealthStatus()[p.ID].Status, proxyhealth.Unregistered, "expect unregistered proxy") + } +} + +func TestProxyHealth_Unhealthy(t *testing.T) { + t.Parallel() + db := dbfake.New() + + srvBadReport := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + httpapi.Write(context.Background(), w, http.StatusOK, codersdk.ProxyHealthReport{ + Errors: []string{"We have a problem!"}, + }) + })) + defer srvBadReport.Close() + + srvBadCode := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadRequest) + })) + defer srvBadCode.Close() + + proxies := []database.WorkspaceProxy{ + // Same url for both, just checking multiple proxies are checked. 
+ insertProxy(t, db, srvBadReport.URL), + insertProxy(t, db, srvBadCode.URL), + } + + ph, err := proxyhealth.New(&proxyhealth.Options{ + Interval: 0, + DB: db, + Logger: slogtest.Make(t, nil), + Client: srvBadReport.Client(), + }) + require.NoError(t, err, "failed to create proxy health") + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel() + + err = ph.ForceUpdate(ctx) + require.NoError(t, err, "failed to force update") + for _, p := range proxies { + require.Equal(t, ph.HealthStatus()[p.ID].Status, proxyhealth.Unhealthy, "expect reachable proxy") + } +} + +func TestProxyHealth_Reachable(t *testing.T) { + t.Parallel() + db := dbfake.New() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + httpapi.Write(context.Background(), w, http.StatusOK, codersdk.ProxyHealthReport{ + Warnings: []string{"No problems, just a warning"}, + }) + })) + defer srv.Close() + + proxies := []database.WorkspaceProxy{ + // Same url for both, just checking multiple proxies are checked. 
+ insertProxy(t, db, srv.URL), + insertProxy(t, db, srv.URL), + } + + ph, err := proxyhealth.New(&proxyhealth.Options{ + Interval: 0, + DB: db, + Logger: slogtest.Make(t, nil), + Client: srv.Client(), + }) + require.NoError(t, err, "failed to create proxy health") + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel() + + err = ph.ForceUpdate(ctx) + require.NoError(t, err, "failed to force update") + for _, p := range proxies { + require.Equal(t, ph.HealthStatus()[p.ID].Status, proxyhealth.Healthy, "expect reachable proxy") + } +} + +func TestProxyHealth_Unreachable(t *testing.T) { + t.Parallel() + db := dbfake.New() + + cli := &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + return nil, xerrors.New("Always fail") + }, + }, + } + + proxies := []database.WorkspaceProxy{ + // example.com is a real domain, but the client should always fail. + insertProxy(t, db, "https://example.com"), + insertProxy(t, db, "https://random.example.com"), + } + + ph, err := proxyhealth.New(&proxyhealth.Options{ + Interval: 0, + DB: db, + Logger: slogtest.Make(t, nil), + Client: cli, + }) + require.NoError(t, err, "failed to create proxy health") + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) + defer cancel() + + err = ph.ForceUpdate(ctx) + require.NoError(t, err, "failed to force update") + for _, p := range proxies { + require.Equal(t, ph.HealthStatus()[p.ID].Status, proxyhealth.Unreachable, "expect unreachable proxy") + } +} diff --git a/enterprise/coderd/workspaceproxy.go b/enterprise/coderd/workspaceproxy.go index a1bc400377c5c..136a000e57289 100644 --- a/enterprise/coderd/workspaceproxy.go +++ b/enterprise/coderd/workspaceproxy.go @@ -1,15 +1,18 @@ package coderd import ( + "context" "crypto/sha256" "database/sql" "fmt" "net/http" "net/url" + "time" "github.com/google/uuid" "golang.org/x/xerrors" + "cdr.dev/slog" agpl 
"github.com/coder/coder/coderd" "github.com/coder/coder/coderd/audit" "github.com/coder/coder/coderd/database" @@ -19,9 +22,18 @@ import ( "github.com/coder/coder/coderd/workspaceapps" "github.com/coder/coder/codersdk" "github.com/coder/coder/cryptorand" + "github.com/coder/coder/enterprise/coderd/proxyhealth" "github.com/coder/coder/enterprise/wsproxy/wsproxysdk" ) +// forceWorkspaceProxyHealthUpdate forces an update of the proxy health. +// This is useful when a proxy is created or deleted. Errors will be logged. +func (api *API) forceWorkspaceProxyHealthUpdate(ctx context.Context) { + if err := api.proxyHealth.ForceUpdate(ctx); err != nil { + api.Logger.Error(ctx, "force proxy health update", slog.Error(err)) + } +} + // @Summary Delete workspace proxy // @ID delete-workspace-proxy // @Security CoderSessionToken @@ -62,6 +74,9 @@ func (api *API) deleteWorkspaceProxy(rw http.ResponseWriter, r *http.Request) { httpapi.Write(ctx, rw, http.StatusOK, codersdk.Response{ Message: "Proxy has been deleted!", }) + + // Update the proxy health cache to remove this proxy. + go api.forceWorkspaceProxyHealthUpdate(api.ctx) } // @Summary Create workspace proxy @@ -122,9 +137,16 @@ func (api *API) postWorkspaceProxy(rw http.ResponseWriter, r *http.Request) { aReq.New = proxy httpapi.Write(ctx, rw, http.StatusCreated, codersdk.CreateWorkspaceProxyResponse{ - Proxy: convertProxy(proxy), + Proxy: convertProxy(proxy, proxyhealth.ProxyStatus{ + Proxy: proxy, + CheckedAt: time.Now(), + Status: proxyhealth.Unregistered, + }), ProxyToken: fullToken, }) + + // Update the proxy health cache to include this new proxy. 
+ go api.forceWorkspaceProxyHealthUpdate(api.ctx) } // nolint:revive @@ -158,28 +180,8 @@ func (api *API) workspaceProxies(rw http.ResponseWriter, r *http.Request) { return } - httpapi.Write(ctx, rw, http.StatusOK, convertProxies(proxies)) -} - -func convertProxies(p []database.WorkspaceProxy) []codersdk.WorkspaceProxy { - resp := make([]codersdk.WorkspaceProxy, 0, len(p)) - for _, proxy := range p { - resp = append(resp, convertProxy(proxy)) - } - return resp -} - -func convertProxy(p database.WorkspaceProxy) codersdk.WorkspaceProxy { - return codersdk.WorkspaceProxy{ - ID: p.ID, - Name: p.Name, - Icon: p.Icon, - URL: p.Url, - WildcardHostname: p.WildcardHostname, - CreatedAt: p.CreatedAt, - UpdatedAt: p.UpdatedAt, - Deleted: p.Deleted, - } + statues := api.proxyHealth.HealthStatus() + httpapi.Write(ctx, rw, http.StatusOK, convertProxies(proxies, statues)) } // @Summary Issue signed workspace app token @@ -295,6 +297,8 @@ func (api *API) workspaceProxyRegister(rw http.ResponseWriter, r *http.Request) httpapi.Write(ctx, rw, http.StatusCreated, wsproxysdk.RegisterWorkspaceProxyResponse{ AppSecurityKey: api.AppSecurityKey.String(), }) + + go api.forceWorkspaceProxyHealthUpdate(api.ctx) } // reconnectingPTYSignedToken issues a signed app token for use when connecting @@ -392,3 +396,29 @@ func (api *API) reconnectingPTYSignedToken(rw http.ResponseWriter, r *http.Reque SignedToken: tokenStr, }) } + +func convertProxies(p []database.WorkspaceProxy, statuses map[uuid.UUID]proxyhealth.ProxyStatus) []codersdk.WorkspaceProxy { + resp := make([]codersdk.WorkspaceProxy, 0, len(p)) + for _, proxy := range p { + resp = append(resp, convertProxy(proxy, statuses[proxy.ID])) + } + return resp +} + +func convertProxy(p database.WorkspaceProxy, status proxyhealth.ProxyStatus) codersdk.WorkspaceProxy { + return codersdk.WorkspaceProxy{ + ID: p.ID, + Name: p.Name, + Icon: p.Icon, + URL: p.Url, + WildcardHostname: p.WildcardHostname, + CreatedAt: p.CreatedAt, + UpdatedAt: p.UpdatedAt, + 
Deleted: p.Deleted, + Status: codersdk.WorkspaceProxyStatus{ + Status: codersdk.ProxyHealthStatus(status.Status), + Report: status.Report, + CheckedAt: status.CheckedAt, + }, + } +} diff --git a/enterprise/coderd/workspaceproxy_test.go b/enterprise/coderd/workspaceproxy_test.go index ea391dfee63af..ec467986efd5c 100644 --- a/enterprise/coderd/workspaceproxy_test.go +++ b/enterprise/coderd/workspaceproxy_test.go @@ -60,7 +60,7 @@ func TestWorkspaceProxyCRUD(t *testing.T) { proxies, err := client.WorkspaceProxies(ctx) require.NoError(t, err) require.Len(t, proxies, 1) - require.Equal(t, proxyRes.Proxy, proxies[0]) + require.Equal(t, proxyRes.Proxy.ID, proxies[0].ID) require.NotEmpty(t, proxyRes.ProxyToken) }) diff --git a/enterprise/wsproxy/wsproxy.go b/enterprise/wsproxy/wsproxy.go index b30fea54ed4cd..3f03d486fe87c 100644 --- a/enterprise/wsproxy/wsproxy.go +++ b/enterprise/wsproxy/wsproxy.go @@ -2,6 +2,7 @@ package wsproxy import ( "context" + "fmt" "net/http" "net/url" "reflect" @@ -30,6 +31,7 @@ import ( type Options struct { Logger slog.Logger + HTTPClient *http.Client // DashboardURL is the URL of the primary coderd instance. DashboardURL *url.URL // AccessURL is the URL of the WorkspaceProxy. @@ -120,6 +122,11 @@ func New(ctx context.Context, opts *Options) (*Server, error) { return nil, xerrors.Errorf("set client token: %w", err) } + // Use the configured client if provided. + if opts.HTTPClient != nil { + client.SDKClient.HTTPClient = opts.HTTPClient + } + // TODO: Probably do some version checking here info, err := client.SDKClient.BuildInfo(ctx) if err != nil { @@ -224,6 +231,8 @@ func New(ctx context.Context, opts *Options) (*Server, error) { r.Get("/buildinfo", s.buildInfo) r.Get("/healthz", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("OK")) }) + // TODO: @emyrk should this be authenticated or debounced? 
+ r.Get("/healthz-report", s.healthReport) return s, nil } @@ -246,6 +255,46 @@ func (s *Server) buildInfo(rw http.ResponseWriter, r *http.Request) { }) } +// healthReport is a more thorough health check than the '/healthz' endpoint. +// This endpoint not only responds if the server is running, but can do some +// internal diagnostics to ensure that the server is running correctly. The +// primary coderd will use this to determine if this workspace proxy can be used +// by the users. This endpoint will take longer to respond than the '/healthz'. +// Checks: +// - Can communicate with primary coderd +// +// TODO: Config checks to ensure consistent with primary +func (s *Server) healthReport(rw http.ResponseWriter, r *http.Request) { + ctx := r.Context() + var report codersdk.ProxyHealthReport + + // Hit the build info to do basic version checking. + primaryBuild, err := s.SDKClient.SDKClient.BuildInfo(ctx) + if err != nil { + report.Errors = append(report.Errors, fmt.Sprintf("failed to get build info: %s", err.Error())) + httpapi.Write(r.Context(), rw, http.StatusOK, report) + return + } + + if primaryBuild.WorkspaceProxy { + // This could be a simple mistake of using a proxy url as the dashboard url. + report.Errors = append(report.Errors, + fmt.Sprintf("dashboard url (%s) is a workspace proxy, must be a primary coderd", s.DashboardURL.String())) + } + + // If we are in dev mode, never check versions. + if !buildinfo.IsDev() && !buildinfo.VersionsMatch(primaryBuild.Version, buildinfo.Version()) { + // Version mismatches are not fatal, but should be reported. + report.Warnings = append(report.Warnings, + fmt.Sprintf("version mismatch: primary coderd (%s) != workspace proxy (%s)", primaryBuild.Version, buildinfo.Version())) + } + + // TODO: We should hit the deployment config endpoint and do some config + // checks. 
We can check the version from the X-CODER-BUILD-VERSION header + + httpapi.Write(r.Context(), rw, http.StatusOK, report) +} + type optErrors []error func (e optErrors) Error() string { diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 71cb9f259f19f..e2bb4a33f589d 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -690,6 +690,12 @@ export interface ProvisionerJobLog { readonly output: string } +// From codersdk/workspaceproxy.go +export interface ProxyHealthReport { + readonly Errors: string[] + readonly Warnings: string[] +} + // From codersdk/workspaces.go export interface PutExtendWorkspaceRequest { readonly deadline: string @@ -1242,6 +1248,7 @@ export interface WorkspaceProxy { readonly created_at: string readonly updated_at: string readonly deleted: boolean + readonly status?: WorkspaceProxyStatus } // From codersdk/deployment.go @@ -1250,6 +1257,13 @@ export interface WorkspaceProxyBuildInfo { readonly dashboard_url: string } +// From codersdk/workspaceproxy.go +export interface WorkspaceProxyStatus { + readonly status: ProxyHealthStatus + readonly report?: ProxyHealthReport + readonly checked_at: string +} + // From codersdk/workspaces.go export interface WorkspaceQuota { readonly credits_consumed: number @@ -1447,6 +1461,19 @@ export const ProvisionerStorageMethods: ProvisionerStorageMethod[] = ["file"] export type ProvisionerType = "echo" | "terraform" export const ProvisionerTypes: ProvisionerType[] = ["echo", "terraform"] +// From codersdk/workspaceproxy.go +export type ProxyHealthStatus = + | "reachable" + | "unhealthy" + | "unreachable" + | "unregistered" +export const ProxyHealthStatuses: ProxyHealthStatus[] = [ + "reachable", + "unhealthy", + "unreachable", + "unregistered", +] + // From codersdk/rbacresources.go export type RBACResource = | "api_key"