feat(coderd): add prometheus metrics to servertailnet by coadler · Pull Request #11988 · coder/coder · GitHub
[go: up one dir, main page]

Skip to content

feat(coderd): add prometheus metrics to servertailnet #11988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
spike comments
  • Loading branch information
coadler committed Feb 6, 2024
commit 66cb907e8b91064abb6b2cfa1977e391b166731c
79 changes: 19 additions & 60 deletions coderd/database/pubsub/pubsub_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

Expand Down Expand Up @@ -43,8 +42,8 @@ func TestPGPubsub_Metrics(t *testing.T) {

metrics, err := registry.Gather()
require.NoError(t, err)
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, gaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_events"))
require.True(t, testutil.PromGaugeHasValue(t, metrics, 0, "coder_pubsub_current_subscribers"))

event := "test"
data := "testing"
Expand All @@ -63,14 +62,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)

colossalData := make([]byte, 7600)
Expand All @@ -93,54 +92,14 @@ func TestPGPubsub_Metrics(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err = registry.Gather()
assert.NoError(t, err)
return gaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
gaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
gaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
counterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
counterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
counterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
counterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
return testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_current_events") &&
testutil.PromGaugeHasValue(t, metrics, 2, "coder_pubsub_current_subscribers") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_pubsub_connected") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_publishes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 2, "coder_pubsub_subscribes_total", "true") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "normal") &&
testutil.PromCounterHasValue(t, metrics, 1, "coder_pubsub_messages_total", "colossal") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_received_bytes_total") &&
testutil.PromCounterHasValue(t, metrics, 7607, "coder_pubsub_published_bytes_total")
}, testutil.WaitShort, testutil.IntervalFast)
}

// gaugeHasValue reports whether the gathered metric family called name
// contains a gauge whose label values equal label and whose value equals
// value.
//
// Label values are compared positionally against the order returned by
// m.GetLabel(). Every metric in the matching family must carry exactly
// len(label) labels; otherwise the test is failed through require.Equal.
// The first metric whose labels all match decides the result. If no family
// or metric matches, false is returned.
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
	metricsLoop:
		for _, m := range family.GetMetric() {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance this label loop and
					// the mismatch would be ignored; skip the whole metric.
					continue metricsLoop
				}
			}
			return value == m.GetGauge().GetValue()
		}
	}
	return false
}

// counterHasValue reports whether the gathered metric family called name
// contains a counter whose label values equal label and whose value equals
// value.
//
// Label values are compared positionally against the order returned by
// m.GetLabel(). Every metric in the matching family must carry exactly
// len(label) labels; otherwise the test is failed through require.Equal.
// The first metric whose labels all match decides the result. If no family
// or metric matches, false is returned.
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, family := range metrics {
		if family.GetName() != name {
			continue
		}
	metricsLoop:
		for _, m := range family.GetMetric() {
			require.Equal(t, len(label), len(m.GetLabel()))
			for i, lv := range label {
				if lv != m.GetLabel()[i].GetValue() {
					// A bare continue would only advance this label loop and
					// the mismatch would be ignored; skip the whole metric.
					continue metricsLoop
				}
			}
			return value == m.GetCounter().GetValue()
		}
	}
	return false
}
24 changes: 12 additions & 12 deletions coderd/tailnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,18 @@ func NewServerTailnet(
agentConnectionTimes: map[uuid.UUID]time.Time{},
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
transport: tailnetTransport.Clone(),
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
connsPerAgent: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "open_conns",
Name: "open_tcp_connections",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize we only use servertailnet for HTTP proxying at present, but what do you think about changing this to open_connections and making the network a label (e.g. network=tcp)? Like, I dunno, what if some customer wants us to proxy QUIC or some shit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's a good point. If we were to keep it like this we would definitely preclude ourselves from expanding this without a breaking change. Adding a network=tcp with it really only ever being tcp for the foreseeable future basically has the same implications and usability as not having it, probably a good change.

Help: "Total number of TCP connections currently open to workspace agents.",
}, []string{"agent_id"}),
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
}),
totalConns: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "coder",
Subsystem: "servertailnet",
Name: "total_conns",
Name: "tcp_connections_total",
Help: "Total number of TCP connections made to workspace agents.",
}, []string{"agent_id"}),
}),
}
tn.transport.DialContext = tn.dialContext
// These options are mostly just picked at random, and they can likely be
Expand Down Expand Up @@ -328,8 +328,8 @@ type ServerTailnet struct {

transport *http.Transport

connsPerAgent *prometheus.GaugeVec
totalConns *prometheus.CounterVec
connsPerAgent prometheus.Gauge
totalConns prometheus.Counter
}

func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
Expand Down Expand Up @@ -380,8 +380,8 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
return nil, err
}

s.connsPerAgent.With(prometheus.Labels{"agent_id": agentID.String()}).Inc()
s.totalConns.With(prometheus.Labels{"agent_id": agentID.String()}).Inc()
s.connsPerAgent.Inc()
s.totalConns.Inc()
return &instrumentedConn{
Conn: nc,
agentID: agentID,
Expand Down Expand Up @@ -498,12 +498,12 @@ type instrumentedConn struct {

agentID uuid.UUID
closeOnce sync.Once
connsPerAgent *prometheus.GaugeVec
connsPerAgent prometheus.Gauge
}

func (c *instrumentedConn) Close() error {
c.closeOnce.Do(func() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are network connections always explicitly closed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one I'm not 100% sure on. Admittedly, I found this idea from a Stack Overflow post which seemed to work for a couple other people. Was planning to get this into dev and monitor to make sure it works as intended with a lot more usage than I can reproduce myself.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess maybe they should be and this gauge will tell us if there's a leak...

c.connsPerAgent.With(prometheus.Labels{"agent_id": c.agentID.String()}).Dec()
c.connsPerAgent.Dec()
})
return c.Conn.Close()
}
47 changes: 2 additions & 45 deletions coderd/tailnet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -113,8 +112,8 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
return counterHasValue(t, metrics, 1, "coder_servertailnet_total_conns", a.id.String()) &&
gaugeHasValue(t, metrics, 1, "coder_servertailnet_open_conns", a.id.String())
return testutil.PromCounterHasValue(t, metrics, 1, "coder_servertailnet_tcp_connections_total") &&
testutil.PromGaugeHasValue(t, metrics, 1, "coder_servertailnet_open_tcp_connections")
}, testutil.WaitShort, testutil.IntervalFast)
})

Expand Down Expand Up @@ -367,45 +366,3 @@ func setupServerTailnetAgent(t *testing.T, agentNum int) ([]agentWithID, *coderd

return agents, serverTailnet
}

// gaugeHasValue looks through the gathered families for the one called name
// and reports whether the first metric whose label values match label
// (compared by position) has a gauge value equal to value.
//
// Each metric inspected must have exactly len(label) labels; require.Equal
// fails the test otherwise. False is returned when nothing matches.
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, metric := range fam.GetMetric() {
				require.Equal(t, len(label), len(metric.GetLabel()))
				matched := true
				for pos := range label {
					if metric.GetLabel()[pos].GetValue() != label[pos] {
						matched = false
						break
					}
				}
				if matched {
					return metric.GetGauge().GetValue() == value
				}
			}
		}
	}
	return false
}

// counterHasValue looks through the gathered families for the one called name
// and reports whether the first metric whose label values match label
// (compared by position) has a counter value equal to value.
//
// Each metric inspected must have exactly len(label) labels; require.Equal
// fails the test otherwise. False is returned when nothing matches.
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, metric := range fam.GetMetric() {
				require.Equal(t, len(label), len(metric.GetLabel()))
				matched := true
				for pos := range label {
					if metric.GetLabel()[pos].GetValue() != label[pos] {
						matched = false
						break
					}
				}
				if matched {
					return metric.GetCounter().GetValue() == value
				}
			}
		}
	}
	return false
}
50 changes: 50 additions & 0 deletions testutil/prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package testutil

import (
"testing"

dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
)

// PromGaugeHasValue reports whether the gathered metrics contain a gauge in
// the family called name whose label values equal label, position by
// position, and whose value equals value.
//
// Every metric examined in the matching family must have exactly len(label)
// labels; a mismatch fails the test through require.Equal. The first metric
// whose labels all match determines the result, and false is returned when
// no family or metric matches.
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, sample := range fam.GetMetric() {
				require.Equal(t, len(label), len(sample.GetLabel()))
				// Advance while the label values agree; reaching the end
				// means every requested label matched this metric.
				idx := 0
				for idx < len(label) && label[idx] == sample.GetLabel()[idx].GetValue() {
					idx++
				}
				if idx == len(label) {
					return sample.GetGauge().GetValue() == value
				}
			}
		}
	}
	return false
}

// PromCounterHasValue reports whether the gathered metrics contain a counter
// in the family called name whose label values equal label, position by
// position, and whose value equals value.
//
// Every metric examined in the matching family must have exactly len(label)
// labels; a mismatch fails the test through require.Equal. The first metric
// whose labels all match determines the result, and false is returned when
// no family or metric matches.
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
	t.Helper()
	for _, fam := range metrics {
		if fam.GetName() == name {
			for _, sample := range fam.GetMetric() {
				require.Equal(t, len(label), len(sample.GetLabel()))
				// Advance while the label values agree; reaching the end
				// means every requested label matched this metric.
				idx := 0
				for idx < len(label) && label[idx] == sample.GetLabel()[idx].GetValue() {
					idx++
				}
				if idx == len(label) {
					return sample.GetCounter().GetValue() == value
				}
			}
		}
	}
	return false
}
0