feat(coderd): add prometheus metrics to servertailnet by coadler · Pull Request #11988 · coder/coder · GitHub
[go: up one dir, main page]

Skip to content

feat(coderd): add prometheus metrics to servertailnet #11988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ func New(options *Options) *API {

api.Auditor.Store(&options.Auditor)
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
api.agentProvider, err = NewServerTailnet(api.ctx,
stn, err := NewServerTailnet(api.ctx,
options.Logger,
options.DERPServer,
api.DERPMap,
Expand All @@ -485,6 +485,10 @@ func New(options *Options) *API {
if err != nil {
panic("failed to setup server tailnet: " + err.Error())
}
api.agentProvider = stn
if options.DeploymentValues.Prometheus.Enable {
options.PrometheusRegistry.MustRegister(stn)
}
api.TailnetClientService, err = tailnet.NewClientService(
api.Logger.Named("tailnetclient"),
&api.TailnetCoordinator,
Expand Down
79 changes: 19 additions & 60 deletions coderd/database/pubsub/pubsub_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

Expand Down Expand Up @@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) {

metrics, err := registry.Gather()
require.NoError(t, err)
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))

event := "test"
data := "testing"
Expand All @@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)

colossalData := make([]byte, 7600)
Expand All @@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)
}

// gaugeHasValue reports whether the gathered metric families contain a gauge
// called name whose label values equal label (compared positionally) and
// whose current value equals value.
//
// The first sample whose labels all match decides the result. If no family is
// called name, or no sample in that family matches, it returns false. Every
// sample inspected must carry exactly len(label) labels; otherwise the test is
// failed through require.Equal.
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
		ms := family.GetMetric()
	metricsLoop:
		for _, m := range ms {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance the label loop and
					// the mismatching sample would still be used below, so
					// jump to the next sample instead.
					continue metricsLoop
				}
			}
			return value == m.GetGauge().GetValue()
		}
	}
	return false
}

// counterHasValue reports whether the gathered metric families contain a
// counter called name whose label values equal label (compared positionally)
// and whose current value equals value.
//
// The first sample whose labels all match decides the result. If no family is
// called name, or no sample in that family matches, it returns false. Every
// sample inspected must carry exactly len(label) labels; otherwise the test is
// failed through require.Equal.
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
		ms := family.GetMetric()
	metricsLoop:
		for _, m := range ms {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance the label loop and
					// the mismatching sample would still be used below, so
					// jump to the next sample instead.
					continue metricsLoop
				}
			}
			return value == m.GetCounter().GetValue()
		}
	}
	return false
}
54 changes: 53 additions & 1 deletion coderd/tailnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"time"

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"go.opentelemetry.io/otel/trace"
"golang.org/x/xerrors"
"tailscale.com/derp"
Expand Down Expand Up @@ -97,6 +98,18 @@ func NewServerTailnet(
agentConnectionTimes: map[uuid.UUID]time.Time{},
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
transport: tailnetTransport.Clone(),
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "open_connections",
Help: "Total number of TCP connections currently open to workspace agents.",
}, []string{"network"}),
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "connections_total",
Help: "Total number of TCP connections made to workspace agents.",
}, []string{"network"}),
}
tn.transport.DialContext = tn.dialContext
// These options are mostly just picked at random, and they can likely be
Expand Down Expand Up @@ -170,6 +183,16 @@ func NewServerTailnet(
return tn, nil
}

// Describe forwards descs to each metric vector owned by the ServerTailnet,
// the open-connections gauge first and the total-connections counter second,
// so that each can send its descriptors on the channel.
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
	for _, c := range []prometheus.Collector{s.connsPerAgent, s.totalConns} {
		c.Describe(descs)
	}
}

// Collect forwards metrics to each metric vector owned by the ServerTailnet,
// the open-connections gauge first and the total-connections counter second,
// so that each can send its current samples on the channel.
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
	for _, c := range []prometheus.Collector{s.connsPerAgent, s.totalConns} {
		c.Collect(metrics)
	}
}

func (s *ServerTailnet) expireOldAgents() {
const (
tick = 5 * time.Minute
Expand Down Expand Up @@ -304,6 +327,9 @@ type ServerTailnet struct {
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}

transport *http.Transport

connsPerAgent *prometheus.GaugeVec
totalConns *prometheus.CounterVec
}

func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
Expand Down Expand Up @@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
return nil, xerrors.Errorf("no agent id attached")
}

return s.DialAgentNetConn(ctx, agentID, network, addr)
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr)
if err != nil {
return nil, err
}

s.connsPerAgent.WithLabelValues("tcp").Inc()
s.totalConns.WithLabelValues("tcp").Inc()
return &instrumentedConn{
Conn: nc,
agentID: agentID,
connsPerAgent: s.connsPerAgent,
}, nil
}

func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
Expand Down Expand Up @@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error {
<-s.derpMapUpdaterClosed
return nil
}

// instrumentedConn wraps a net.Conn together with the gauge that tracks open
// connections, so that the gauge can be adjusted when the connection is
// closed.
type instrumentedConn struct {
// Conn is the wrapped connection; its methods are promoted by embedding.
net.Conn

// agentID is the ID of the agent the connection was dialed for.
agentID uuid.UUID
// closeOnce guards the gauge decrement so it runs at most once per
// connection, no matter how many times Close is called.
closeOnce sync.Once
// connsPerAgent is the open-connections gauge shared with the ServerTailnet
// that created this connection.
connsPerAgent *prometheus.GaugeVec
}

func (c *instrumentedConn) Close() error {
c.closeOnce.Do(func() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are network connections always explicitly closed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure about this one. Admittedly, I found this idea in a Stack Overflow post, and it seemed to work for a couple of other people. I was planning to get this into dev and monitor it to make sure it works as intended under a lot more usage than I can reproduce myself.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess maybe they should be and this gauge will tell us if there's a leak...

c.connsPerAgent.WithLabelValues("tcp").Dec()
})
return c.Conn.Close()
}
38 changes: 38 additions & 0 deletions coderd/tailnet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"testing"

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -79,6 +80,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
assert.Equal(t, http.StatusOK, res.StatusCode)
})

t.Run("Metrics", func(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

agents, serverTailnet := setupServerTailnetAgent(t, 1)
a := agents[0]

registry := prometheus.NewRegistry()
require.NoError(t, registry.Register(serverTailnet))

u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort))
require.NoError(t, err)

rp := serverTailnet.ReverseProxy(u, u, a.id)

rw := httptest.NewRecorder()
req := httptest.NewRequest(
http.MethodGet,
u.String(),
nil,
).WithContext(ctx)

rp.ServeHTTP(rw, req)
res := rw.Result()
defer res.Body.Close()

assert.Equal(t, http.StatusOK, res.StatusCode)
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_connections_total", "tcp") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_connections", "tcp")
}, testutil.WaitShort, testutil.IntervalFast)
})

t.Run("HostRewrite", func(t *testing.T) {
t.Parallel()

Expand Down
50 changes: 50 additions & 0 deletions testutil/prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package testutil

import (
"testing"

dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
)

// PromGaugeHasValue reports whether metrics contains a gauge family called
// name with a sample whose label values equal label (compared positionally)
// and whose value equals value.
//
// The first sample whose labels all match decides the result; false is
// returned when no family or sample matches. Each inspected sample must carry
// exactly len(label) labels, otherwise the test is failed via require.Equal.
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() != name {
			continue
		}
		for _, sample := range fam.GetMetric() {
			pairs := sample.GetLabel()
			require.Equal(t, len(label), len(pairs))

			// Compare label values position by position; stop at the first
			// difference and move on to the next sample.
			matched := true
			for idx := range label {
				if pairs[idx].GetValue() != label[idx] {
					matched = false
					break
				}
			}
			if matched {
				return sample.GetGauge().GetValue() == value
			}
		}
	}
	return false
}

// PromCounterHasValue reports whether metrics contains a counter family
// called name with a sample whose label values equal label (compared
// positionally) and whose value equals value.
//
// The first sample whose labels all match decides the result; false is
// returned when no family or sample matches. Each inspected sample must carry
// exactly len(label) labels, otherwise the test is failed via require.Equal.
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() != name {
			continue
		}
		for _, sample := range fam.GetMetric() {
			pairs := sample.GetLabel()
			require.Equal(t, len(label), len(pairs))

			// Compare label values position by position; stop at the first
			// difference and move on to the next sample.
			matched := true
			for idx := range label {
				if pairs[idx].GetValue() != label[idx] {
					matched = false
					break
				}
			}
			if matched {
				return sample.GetCounter().GetValue() == value
			}
		}
	}
	return false
}
384E
0