commit
fba1a94836
6 changed files with 513 additions and 0 deletions
@ -0,0 +1,7 @@
|
||||
FROM busybox:latest |
||||
|
||||
COPY exporter /bin/exporter |
||||
|
||||
EXPOSE 8081 |
||||
|
||||
ENTRYPOINT ["/bin/exporter"] |
||||
@ -0,0 +1,91 @@
|
||||
# SysEng coding challenge: Prometheus exporter |
||||
|
||||
Thanks for trying our coding challenge for systems engineers. |
||||
|
||||
## Prerequisites |
||||
|
||||
You need to be able to run Docker on your development computer to work on this |
||||
challenge. Please run |
||||
``` |
||||
docker run -it -p 8080:8080 beorn7/syseng-challenge |
||||
``` |
||||
|
||||
Then run `curl http://localhost:8080/stats`. The output should look similar to |
||||
the following: |
||||
|
||||
```json |
||||
{ |
||||
"requestCounters": { |
||||
"200": 65221, |
||||
"404": 14066, |
||||
"500": 12618 |
||||
}, |
||||
"requestRates": { |
||||
"200": 100, |
||||
"404": 1 |
||||
}, |
||||
"duration": { |
||||
"count": 91905, |
||||
"sum": 4484.3037570333245, |
||||
"average": 0.024613801985478054 |
||||
} |
||||
} |
||||
``` |
||||
|
||||
If you aren't already, you should make yourself familiar with the |
||||
[Prometheus monitoring and alerting system](https://prometheus.io). A good |
||||
starting point is Brian Brazil's very concise |
||||
[talk](https://www.youtube.com/watch?v=cwRmXqXKGtk) at FOSDEM 2016. |
||||
|
||||
## The Challenge |
||||
|
||||
Imagine the little binary you have started above is an instance of a |
||||
microservice that is running replicated with hundreds of instances on a |
||||
computing cluster. For the sake of the challenge, we are not interested in what |
||||
the service is actually doing. We are only interested in its metrics, and we |
||||
want to simulate monitoring it with Prometheus. Thankfully, the service is |
||||
providing metrics in JSON format via its `/stats` endpoint, as you have seen |
||||
above. The `requestCounters` tell you how often each HTTP status code has been |
||||
served during the lifetime of the binary. The `requestRates` tell you the same |
||||
but for the last second, i.e. they give you the current QPS. The `duration` |
||||
tells you how many requests have been served in total during the lifetime of |
||||
the binary (`count`) and how much total time those requests have taken in |
||||
seconds (`sum`). The `average` is the time in seconds a request has taken, |
||||
averaged over the last second. |
||||
|
||||
Unfortunately, Prometheus cannot ingest JSON directly but requires a custom |
||||
format. Usually, you would write your microservice in a way that it would |
||||
expose metrics in a format suitable for Prometheus directly. Let's imagine that |
||||
this direct instrumentation is, for some reason, not feasible in this case. (In |
||||
reality, this situation often arises when monitoring 3rd party software that |
||||
does not happen to be instrumented for Prometheus specifically.) The usual |
||||
solution is to write a so-called |
||||
[exporter](https://prometheus.io/docs/instrumenting/exporters/), a little glue |
||||
program that retrieves metrics from a 3rd party system and exposes them in the |
||||
Prometheus way. |
||||
|
||||
Your task is to write, in a language of your choice, such an exporter for the |
||||
simulated microservice running in your Docker right now. |
||||
|
||||
It might be helpful to also start a Prometheus server and scrape your exporter |
||||
and explore the possibilities enabled by your metrics. The simulated |
||||
microservice instance is looping through a number of scenarios over the course |
||||
of about 15 minutes. |
||||
|
||||
## Bonus questions |
||||
|
||||
Optionally, you may answer the following questions. Thinking about them might |
||||
also help you solve the coding challenge in a meaningful way. Keep answers |
||||
short. It's really just about sketching out a few ideas. If we invite you for |
||||
on-site interviews, we will have plenty of time to discuss them in detail. |
||||
|
||||
1. What are good ways of deploying hundreds of instances of our simulated |
||||
service? How would you deploy your exporter? And how would you configure |
||||
Prometheus to monitor them all? |
||||
2. What graphs about the service would you plot in a dashboard builder like |
||||
Grafana? Ideally, you can come up with PromQL expressions for them. |
||||
3. What would you alert on? What would be the urgency of the various alerts? |
||||
Again, it would be great if you could formulate alerting conditions with |
||||
PromQL. |
||||
4. If you were in control of the microservice, which exported metrics would you |
||||
add or modify next? |
||||
@ -0,0 +1,34 @@
|
||||
GO := GO15VENDOREXPERIMENT=1 go
|
||||
pkgs = $(shell $(GO) list ./... | grep -v /vendor/)
|
||||
|
||||
DOCKER_IMAGE_NAME ?= exporter
|
||||
DOCKER_IMAGE_TAG ?= latest
|
||||
|
||||
|
||||
all: format build test |
||||
|
||||
style: |
||||
@echo ">> checking code style"
|
||||
@! gofmt -d $(shell find . -path ./vendor -prune -o -name '*.go' -print) | grep '^'
|
||||
|
||||
test: |
||||
@echo ">> running tests"
|
||||
@$(GO) test -short $(pkgs)
|
||||
|
||||
format: |
||||
@echo ">> formatting code"
|
||||
@$(GO) fmt $(pkgs)
|
||||
|
||||
vet: |
||||
@echo ">> vetting code"
|
||||
@$(GO) vet $(pkgs)
|
||||
|
||||
build: |
||||
@echo ">> building binaries"
|
||||
@$(GO) build -o exporter
|
||||
|
||||
docker: |
||||
@echo ">> building docker image"
|
||||
@docker build -t "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .
|
||||
|
||||
.PHONY: all style format build test vet docker |
||||
@ -0,0 +1,263 @@
|
||||
package main |
||||
|
||||
import (
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"
	"syscall"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/prometheus/common/log"
	"github.com/prometheus/common/version"
)
||||
|
||||
const ( |
||||
metricsPath = "/metrics" |
||||
namespace = "app" |
||||
) |
||||
|
||||
var ( |
||||
listenAddress = flag.String("listen", ":8080", "The address to listen on for HTTP requests.") |
||||
endpointApp = flag.String("endpoint", "http://localhost:8050/stats", "HTTP API address of the application") |
||||
hostname = flag.String("hostname", "", "Optional hostname which will be added to the exported metrics (defaults to $HOSTNAME)") |
||||
prometheusConstLabel = parseConstLabel() |
||||
) |
||||
|
||||
// run before creating the descriptor
|
||||
func parseConstLabel() prometheus.Labels { |
||||
// parse flags in an early state, so we can retrieve the instance id
|
||||
flag.Parse() |
||||
|
||||
if *hostname != "" { |
||||
// Try to set hostname from env var, so we can see if an pod does not work as expected
|
||||
if value, found := syscall.Getenv("HOSTNAME"); found { |
||||
hostname = &value |
||||
} |
||||
} |
||||
|
||||
// generate constant label if hostname is present
|
||||
if *hostname != "" { |
||||
return prometheus.Labels{"hostname": *hostname} |
||||
} |
||||
return prometheus.Labels{} |
||||
} |
||||
|
||||
var ( |
||||
// Create the prometheus descriptors
|
||||
descUp = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "up"), |
||||
"Was the last query successful.", |
||||
nil, prometheusConstLabel, |
||||
) |
||||
descRequestCount = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "request_count"), |
||||
"How many requests processed, partitioned by status code.", |
||||
[]string{"code"}, prometheusConstLabel, |
||||
) |
||||
descRequestCountTotal = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "request_count_total"), |
||||
"How many requests processed of all status codes.", |
||||
nil, prometheusConstLabel, |
||||
) |
||||
descRequestRates = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "request_rates"), |
||||
"How many requests processed in the last second, partitioned by status code.", |
||||
[]string{"code"}, prometheusConstLabel, |
||||
) |
||||
descRequestRatesTotal = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "request_rates_total"), |
||||
"How many requests processed in the last second.", |
||||
nil, prometheusConstLabel, |
||||
) |
||||
descDurationSum = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "duration_sum"), |
||||
"How much time consumed the requests in summary.", |
||||
nil, prometheusConstLabel, |
||||
) |
||||
descDurationAvg = prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, "", "duration_avg"), |
||||
"How much time consumed the requests in average.", |
||||
nil, prometheusConstLabel, |
||||
) |
||||
) |
||||
|
||||
// AppStats represent the schema of the returned json
|
||||
type AppStats struct { |
||||
// RequestCounters are the served status codes during app lifetime
|
||||
RequestCounters map[string]int `json:"requestCounters"` |
||||
// RequestRates are the served status codes for the last second (QPS)
|
||||
RequestRates map[string]int `json:"requestRates"` |
||||
// Duration represent some request stats during the lifetime
|
||||
Duration *AppDuration `json:"duration"` |
||||
} |
||||
|
||||
// AppDuration the schema of the returned duration part of the json
|
||||
type AppDuration struct { |
||||
// Count is the total served request in the lifetime
|
||||
Count int `json:"count"` |
||||
// Sum is the total time of taken time the requests have taken in seconds
|
||||
Sum float64 `json:"sum"` |
||||
// Average time of usage a request has taken.
|
||||
Average float64 `json:"average"` |
||||
} |
||||
|
||||
// appScraper is a helper to retrieve stats in a generic way
|
||||
type appScraper struct { |
||||
endpoint string |
||||
client *http.Client |
||||
} |
||||
|
||||
// stats returns the fetched and parsed json
|
||||
func (s *appScraper) stats() (*AppStats, error) { |
||||
var stats AppStats |
||||
response, err := s.client.Get(s.endpoint) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
buf, err := ioutil.ReadAll(response.Body) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
err = json.Unmarshal(buf, &stats) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
// validate returned json is complete
|
||||
// requestCounter and requestRates should be empty maps, so we are fine here
|
||||
if stats.Duration == nil { |
||||
return nil, fmt.Errorf("Invalid JSON returned, could not retreive duration.*") |
||||
} |
||||
|
||||
|
||||
|
||||
return &stats, err |
||||
} |
||||
|
||||
// Exporter implements prometheus.Collector
|
||||
type Exporter struct { |
||||
scraper *appScraper |
||||
} |
||||
|
||||
// Describe implements prometheus.Describe
|
||||
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) { |
||||
ch <- descUp |
||||
ch <- descRequestCount |
||||
ch <- descRequestCountTotal |
||||
ch <- descRequestRates |
||||
ch <- descDurationSum |
||||
ch <- descDurationAvg |
||||
|
||||
} |
||||
|
||||
func (e *Exporter) Collect(ch chan<- prometheus.Metric) { |
||||
|
||||
stats, err := e.scraper.stats() |
||||
if err != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
descUp, prometheus.GaugeValue, 0, |
||||
) |
||||
log.Error("Failed to scrape app stats: ", err) |
||||
return |
||||
} |
||||
ch <- prometheus.MustNewConstMetric( |
||||
descUp, prometheus.GaugeValue, 1, |
||||
) |
||||
|
||||
// Add counter per code on the fly (no need to update code if there are additional codes)
|
||||
for code, count := range stats.RequestCounters { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
descRequestCount, prometheus.CounterValue, float64(count), code, |
||||
) |
||||
} |
||||
|
||||
// Add total of all requests
|
||||
ch <- prometheus.MustNewConstMetric( |
||||
descRequestCountTotal, prometheus.CounterValue, float64(stats.Duration.Count), |
||||
) |
||||
|
||||
// Add rates per code on the fly (no need to update code if there are additional codes)
|
||||
ratesSum := 0 |
||||
for code, count := range stats.RequestRates { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
descRequestRates, prometheus.GaugeValue, float64(count), code, |
||||
) |
||||
ratesSum += count |
||||
} |
||||
// Additional sum of the rates, such like the requestCounter
|
||||
ch <- prometheus.MustNewConstMetric( |
||||
descRequestRatesTotal, prometheus.CounterValue, float64(ratesSum), |
||||
) |
||||
|
||||
ch <- prometheus.MustNewConstMetric( |
||||
descDurationSum, prometheus.CounterValue, stats.Duration.Sum, |
||||
) |
||||
ch <- prometheus.MustNewConstMetric( |
||||
descDurationAvg, prometheus.GaugeValue, stats.Duration.Average, |
||||
) |
||||
|
||||
} |
||||
|
||||
func NewExporter(endpoint string) (*Exporter, error) { |
||||
|
||||
if !strings.Contains(endpoint, "://") { |
||||
endpoint = "http://" + endpoint |
||||
} |
||||
u, err := url.Parse(endpoint) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("invalid endpoint URL: %s", err) |
||||
} |
||||
if u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") { |
||||
return nil, fmt.Errorf("invalid endpoint URL: %s", endpoint) |
||||
} |
||||
|
||||
// use custom http client with specific timeout
|
||||
client := &http.Client{ |
||||
Timeout: time.Duration(100*time.Millisecond), |
||||
} |
||||
|
||||
// create api client
|
||||
appScraper := &appScraper{ |
||||
client: client, |
||||
endpoint: endpoint, |
||||
} |
||||
|
||||
return &Exporter{ |
||||
scraper: appScraper, |
||||
}, nil |
||||
} |
||||
|
||||
func init() { |
||||
prometheus.MustRegister(version.NewCollector(fmt.Sprintf("%s_exporter", namespace))) |
||||
} |
||||
|
||||
func main() { |
||||
|
||||
exporter, err := NewExporter(*endpointApp) |
||||
if err != nil { |
||||
log.Fatal(err) |
||||
} |
||||
prometheus.MustRegister(exporter) |
||||
|
||||
http.Handle(metricsPath, promhttp.Handler()) |
||||
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { |
||||
w.Write([]byte(`<html> |
||||
<head><title>App Exporter</title></head> |
||||
<body> |
||||
<h1>App Exporter</h1> |
||||
<p><a href='` + metricsPath + `'>Metrics</a></p> |
||||
</body> |
||||
</html>`)) |
||||
}) |
||||
|
||||
log.Infoln("Listening on", *listenAddress) |
||||
log.Fatal(http.ListenAndServe(*listenAddress, nil)) |
||||
} |
||||
@ -0,0 +1,116 @@
|
||||
package main |
||||
|
||||
import ( |
||||
"testing" |
||||
"net/http/httptest" |
||||
"fmt" |
||||
"net/http" |
||||
"reflect" |
||||
) |
||||
|
||||
func TestScraper(t *testing.T) { |
||||
|
||||
tests := []struct { |
||||
json string |
||||
expected *AppStats |
||||
ok bool |
||||
}{ |
||||
{ |
||||
json: ` |
||||
{ |
||||
"requestCounters": { |
||||
"200": 65221, |
||||
"404": 14066, |
||||
"500": 12618 |
||||
}, |
||||
"requestRates": { |
||||
"200": 100, |
||||
"404": 1 |
||||
}, |
||||
"duration": { |
||||
"count": 91905, |
||||
"sum": 4484.3037570333245, |
||||
"average": 0.024613801985478054 |
||||
} |
||||
} |
||||
`, |
||||
expected: &AppStats{ |
||||
RequestCounters: map[string]int{ |
||||
"200": 65221, |
||||
"404": 14066, |
||||
"500": 12618, |
||||
}, |
||||
RequestRates: map[string]int{ |
||||
"200": 100, |
||||
"404": 1, |
||||
}, |
||||
Duration: &AppDuration{ |
||||
Count: 91905, |
||||
Sum: 4484.3037570333245, |
||||
Average: 0.024613801985478054, |
||||
|
||||
}, |
||||
}, |
||||
ok: true, |
||||
}, |
||||
{ |
||||
json: "invalid", |
||||
ok: false, |
||||
}, |
||||
} |
||||
|
||||
for i, test := range tests { |
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { |
||||
w.Header().Set("Content-Type", "application/json") |
||||
fmt.Fprintln(w, test.json) |
||||
})) |
||||
defer server.Close() |
||||
|
||||
scraper := appScraper{ |
||||
endpoint: server.URL, |
||||
client: http.DefaultClient, |
||||
} |
||||
stats, err := scraper.stats() |
||||
|
||||
if err != nil{ |
||||
if !test.ok{ |
||||
continue |
||||
} |
||||
t.Fatalf("Test %v: http.Get(%q) unexpected error: %v", i, server.URL, err) |
||||
} |
||||
|
||||
if !reflect.DeepEqual(*test.expected.Duration, *stats.Duration) { |
||||
t.Fatalf("Test %v: Duration expected %v, got %v", i, *test.expected.Duration, *stats.Duration) |
||||
} |
||||
if !reflect.DeepEqual(test.expected.RequestCounters, stats.RequestCounters) { |
||||
t.Fatalf("Test %v: RequestCounters expected %v, got %v", i, test.expected.RequestCounters, stats.RequestCounters) |
||||
} |
||||
if !reflect.DeepEqual(test.expected.RequestRates, stats.RequestRates) { |
||||
t.Fatalf("Test %v: RequestRates expected %v, got %v", i, test.expected.RequestRates, stats.RequestRates) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Yes, this is stolen from the consul_exporter, but why reinvent the wheel? ;-)
|
||||
func TestNewExporter(t *testing.T) { |
||||
cases := []struct { |
||||
uri string |
||||
ok bool |
||||
}{ |
||||
{uri: "", ok: false}, |
||||
{uri: "localhost:8500", ok: true}, |
||||
{uri: "https://localhost:8500", ok: true}, |
||||
{uri: "http://some.where:8500", ok: true}, |
||||
{uri: "fuuuu://localhost:8500", ok: false}, |
||||
} |
||||
|
||||
for _, test := range cases { |
||||
_, err := NewExporter(test.uri) |
||||
if test.ok && err != nil { |
||||
t.Errorf("expected no error w/ %q, but got %q", test.uri, err) |
||||
} |
||||
if !test.ok && err == nil { |
||||
t.Errorf("expected error w/ %q, but got %q", test.uri, err) |
||||
} |
||||
} |
||||
} |
||||
Loading…
Reference in new issue