commit
fba1a94836
6 changed files with 513 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||||||
|
# Minimal runtime image: just the statically built exporter on busybox.
FROM busybox:latest

# The exporter binary is built outside the image (see the Makefile's
# `build` target) and copied in.
COPY exporter /bin/exporter

# NOTE(review): the exporter listens on :8080 by default (see the
# -listen flag in main.go); EXPOSE 8081 only matches if the container is
# started with `-listen :8081` — confirm which port is intended.
EXPOSE 8081

ENTRYPOINT ["/bin/exporter"]
||||||
@ -0,0 +1,91 @@ |
|||||||
|
# SysEng coding challenge: Prometheus exporter |
||||||
|
|
||||||
|
Thanks for trying our coding challenge for systems engineers. |
||||||
|
|
||||||
|
## Prerequisites |
||||||
|
|
||||||
|
You need to be able to run Docker on your development computer to work on this |
||||||
|
challenge. Please run |
||||||
|
``` |
||||||
|
docker run -it -p 8080:8080 beorn7/syseng-challenge |
||||||
|
``` |
||||||
|
|
||||||
|
Then run `curl http://localhost:8080/stats`. The output should look similar to |
||||||
|
the following: |
||||||
|
|
||||||
|
```json |
||||||
|
{ |
||||||
|
"requestCounters": { |
||||||
|
"200": 65221, |
||||||
|
"404": 14066, |
||||||
|
"500": 12618 |
||||||
|
}, |
||||||
|
"requestRates": { |
||||||
|
"200": 100, |
||||||
|
"404": 1 |
||||||
|
}, |
||||||
|
"duration": { |
||||||
|
"count": 91905, |
||||||
|
"sum": 4484.3037570333245, |
||||||
|
"average": 0.024613801985478054 |
||||||
|
} |
||||||
|
} |
||||||
|
``` |
||||||
|
|
||||||
|
If you aren't already, you should make yourself familiar with the |
||||||
|
[Prometheus monitoring and alerting system](https://prometheus.io). A good |
||||||
|
starting point is Brian Brazil's very concise |
||||||
|
[talk](https://www.youtube.com/watch?v=cwRmXqXKGtk) at FOSDEM 2016. |
||||||
|
|
||||||
|
## The Challenge |
||||||
|
|
||||||
|
Imagine the little binary you have started above is an instance of a |
||||||
|
microservice that is running replicated with hundreds of instances on a |
||||||
|
computing cluster. For the sake of the challenge, we are not interested in what |
||||||
|
the service is actually doing. We are only interested in its metrics, and we |
||||||
|
want to simulate monitoring it with Prometheus. Thankfully, the service is |
||||||
|
providing metrics in JSON format via its `/stats` endpoint, as you have seen |
||||||
|
above. The `requestCounters` tell you how often each HTTP status code has been |
||||||
|
served during the lifetime of the binary. The `requestRates` tell you the same |
||||||
|
but for the last second, i.e. they give you the current QPS. The `duration` |
||||||
|
tells you how many requests have been served in total during the lifetime of |
||||||
|
the binary (`count`) and how much total time those requests have taken in |
||||||
|
seconds (`sum`). The `average` is the time in seconds a request has taken, |
||||||
|
averaged over the last second. |
||||||
|
|
||||||
|
Unfortunately, Prometheus cannot ingest JSON directly but requires a custom |
||||||
|
format. Usually, you would write your microservice in a way that it would |
||||||
|
expose metrics in a format suitable for Prometheus directly. Let's imagine that |
||||||
|
this direct instrumentation is, for some reason, not feasible in this case. (In |
||||||
|
reality, this situation often arises when monitoring 3rd party software that |
||||||
|
does not happen to be instrumented for Prometheus specifically.) The usual |
||||||
|
solution is to write a so-called |
||||||
|
[exporter](https://prometheus.io/docs/instrumenting/exporters/), a little glue |
||||||
|
program that retrieves metrics from a 3rd party system and exposes them in the |
||||||
|
Prometheus way. |
||||||
|
|
||||||
|
Your task is to write, in a language of your choice, such an exporter for the |
||||||
|
simulated microservice running in your Docker right now. |
||||||
|
|
||||||
|
It might be helpful to also start a Prometheus server and scrape your exporter |
||||||
|
and explore the possibilities enabled by your metrics. The simulated |
||||||
|
microservice instance is looping through a number of scenarios over the course |
||||||
|
of about 15 minutes. |
||||||
|
|
||||||
|
## Bonus questions |
||||||
|
|
||||||
|
Optionally, you may answer the following questions. Thinking about them might |
||||||
|
also help you solve the coding challenge in a meaningful way. Keep answers |
||||||
|
short. It's really just about sketching out a few ideas. If we invite you for |
||||||
|
on-site interviews, we will have plenty of time to discuss them in detail. |
||||||
|
|
||||||
|
1. What are good ways of deploying hundreds of instances of our simulated |
||||||
|
service? How would you deploy your exporter? And how would you configure |
||||||
|
Prometheus to monitor them all? |
||||||
|
2. What graphs about the service would you plot in a dashboard builder like |
||||||
|
Grafana? Ideally, you can come up with PromQL expressions for them. |
||||||
|
3. What would you alert on? What would be the urgency of the various alerts? |
||||||
|
Again, it would be great if you could formulate alerting conditions with |
||||||
|
PromQL. |
||||||
|
4. If you were in control of the microservice, which exported metrics would you |
||||||
|
add or modify next? |
||||||
@ -0,0 +1,34 @@ |
|||||||
|
# Enable the vendor/ directory on Go 1.5 (a no-op on later versions).
GO := GO15VENDOREXPERIMENT=1 go
# All packages in this module, excluding vendored dependencies.
pkgs = $(shell $(GO) list ./... | grep -v /vendor/)

DOCKER_IMAGE_NAME ?= exporter
DOCKER_IMAGE_TAG ?= latest


# NOTE(review): `style` and `vet` are not part of `all`; add them here if
# they should gate the default build.
all: format build test

# Fail if any file is not gofmt-clean (prints the diff).
style:
	@echo ">> checking code style"
	@! gofmt -d $(shell find . -path ./vendor -prune -o -name '*.go' -print) | grep '^'

test:
	@echo ">> running tests"
	@$(GO) test -short $(pkgs)

format:
	@echo ">> formatting code"
	@$(GO) fmt $(pkgs)

vet:
	@echo ">> vetting code"
	@$(GO) vet $(pkgs)

build:
	@echo ">> building binaries"
	@$(GO) build -o exporter

docker:
	@echo ">> building docker image"
	@docker build -t "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .

.PHONY: all style format build test vet docker
||||||
@ -0,0 +1,263 @@ |
|||||||
|
package main |
||||||
|
|
||||||
|
import ( |
||||||
|
"encoding/json" |
||||||
|
"flag" |
||||||
|
"fmt" |
||||||
|
"io/ioutil" |
||||||
|
"net/http" |
||||||
|
"syscall" |
||||||
|
"time" |
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus" |
||||||
|
"github.com/prometheus/client_golang/prometheus/promhttp" |
||||||
|
"github.com/prometheus/common/log" |
||||||
|
"github.com/prometheus/common/version" |
||||||
|
"net/url" |
||||||
|
"strings" |
||||||
|
) |
||||||
|
|
||||||
|
const (
	// metricsPath is the HTTP path where Prometheus metrics are served.
	metricsPath = "/metrics"
	// namespace prefixes every exported metric name (e.g. app_up).
	namespace = "app"
)
||||||
|
|
||||||
|
var (
	// listenAddress is the address the exporter's own HTTP server binds to.
	listenAddress = flag.String("listen", ":8080", "The address to listen on for HTTP requests.")
	// endpointApp is the stats URL of the monitored application.
	// NOTE(review): the default port 8050 differs from the README's
	// example (8080) — confirm which port the app actually serves.
	endpointApp = flag.String("endpoint", "http://localhost:8050/stats", "HTTP API address of the application")
	// hostname is attached as a constant label to every metric.
	hostname = flag.String("hostname", "", "Optional hostname which will be added to the exported metrics (defaults to $HOSTNAME)")
	// prometheusConstLabel is computed once at package init time;
	// parseConstLabel also calls flag.Parse as a side effect so the
	// hostname flag is available before the descriptors below are built.
	prometheusConstLabel = parseConstLabel()
)
||||||
|
|
||||||
|
// run before creating the descriptor
|
||||||
|
func parseConstLabel() prometheus.Labels { |
||||||
|
// parse flags in an early state, so we can retrieve the instance id
|
||||||
|
flag.Parse() |
||||||
|
|
||||||
|
if *hostname != "" { |
||||||
|
// Try to set hostname from env var, so we can see if an pod does not work as expected
|
||||||
|
if value, found := syscall.Getenv("HOSTNAME"); found { |
||||||
|
hostname = &value |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// generate constant label if hostname is present
|
||||||
|
if *hostname != "" { |
||||||
|
return prometheus.Labels{"hostname": *hostname} |
||||||
|
} |
||||||
|
return prometheus.Labels{} |
||||||
|
} |
||||||
|
|
||||||
|
var (
	// Descriptors for all metrics this exporter can emit. They all carry
	// the constant label set computed by parseConstLabel.

	// descUp reports whether the last scrape of the app succeeded (1/0).
	descUp = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "up"),
		"Was the last query successful.",
		nil, prometheusConstLabel,
	)
	// descRequestCount: lifetime request counter per HTTP status code.
	descRequestCount = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "request_count"),
		"How many requests processed, partitioned by status code.",
		[]string{"code"}, prometheusConstLabel,
	)
	// descRequestCountTotal: lifetime request counter over all codes
	// (taken from the JSON's duration.count).
	descRequestCountTotal = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "request_count_total"),
		"How many requests processed of all status codes.",
		nil, prometheusConstLabel,
	)
	// descRequestRates: last-second QPS per HTTP status code (gauge).
	descRequestRates = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "request_rates"),
		"How many requests processed in the last second, partitioned by status code.",
		[]string{"code"}, prometheusConstLabel,
	)
	// descRequestRatesTotal: last-second QPS summed over all codes.
	descRequestRatesTotal = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "request_rates_total"),
		"How many requests processed in the last second.",
		nil, prometheusConstLabel,
	)
	// descDurationSum: total time all requests have taken, in seconds.
	descDurationSum = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "duration_sum"),
		"How much time consumed the requests in summary.",
		nil, prometheusConstLabel,
	)
	// descDurationAvg: per-request time averaged over the last second.
	descDurationAvg = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "", "duration_avg"),
		"How much time consumed the requests in average.",
		nil, prometheusConstLabel,
	)
)
||||||
|
|
||||||
|
// AppStats represent the schema of the returned json
|
||||||
|
type AppStats struct {
	// RequestCounters maps HTTP status code -> requests served during
	// the app's lifetime.
	RequestCounters map[string]int `json:"requestCounters"`
	// RequestRates maps HTTP status code -> requests served during the
	// last second (current QPS).
	RequestRates map[string]int `json:"requestRates"`
	// Duration holds aggregate request timing stats. It is a pointer so
	// a missing "duration" object in the JSON can be detected as nil.
	Duration *AppDuration `json:"duration"`
}
||||||
|
|
||||||
|
// AppDuration the schema of the returned duration part of the json
|
||||||
|
type AppDuration struct {
	// Count is the total number of requests served during the lifetime.
	Count int `json:"count"`
	// Sum is the total time all requests have taken, in seconds.
	Sum float64 `json:"sum"`
	// Average is the per-request time in seconds, averaged over the
	// last second.
	Average float64 `json:"average"`
}
||||||
|
|
||||||
|
// appScraper is a helper to retrieve stats in a generic way
|
||||||
|
// appScraper is a helper to retrieve stats in a generic way.
type appScraper struct {
	// endpoint is the full URL of the application's /stats endpoint.
	endpoint string
	// client is the HTTP client used for scraping.
	client *http.Client
}
||||||
|
|
||||||
|
// stats returns the fetched and parsed json
|
||||||
|
func (s *appScraper) stats() (*AppStats, error) { |
||||||
|
var stats AppStats |
||||||
|
response, err := s.client.Get(s.endpoint) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
|
||||||
|
buf, err := ioutil.ReadAll(response.Body) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
|
||||||
|
err = json.Unmarshal(buf, &stats) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
|
||||||
|
// validate returned json is complete
|
||||||
|
// requestCounter and requestRates should be empty maps, so we are fine here
|
||||||
|
if stats.Duration == nil { |
||||||
|
return nil, fmt.Errorf("Invalid JSON returned, could not retreive duration.*") |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return &stats, err |
||||||
|
} |
||||||
|
|
||||||
|
// Exporter implements prometheus.Collector
|
||||||
|
// Exporter implements prometheus.Collector by scraping the
// application's stats endpoint on demand.
type Exporter struct {
	// scraper retrieves and parses the application's JSON stats.
	scraper *appScraper
}
||||||
|
|
||||||
|
// Describe implements prometheus.Describe
|
||||||
|
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) { |
||||||
|
ch <- descUp |
||||||
|
ch <- descRequestCount |
||||||
|
ch <- descRequestCountTotal |
||||||
|
ch <- descRequestRates |
||||||
|
ch <- descDurationSum |
||||||
|
ch <- descDurationAvg |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
func (e *Exporter) Collect(ch chan<- prometheus.Metric) { |
||||||
|
|
||||||
|
stats, err := e.scraper.stats() |
||||||
|
if err != nil { |
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descUp, prometheus.GaugeValue, 0, |
||||||
|
) |
||||||
|
log.Error("Failed to scrape app stats: ", err) |
||||||
|
return |
||||||
|
} |
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descUp, prometheus.GaugeValue, 1, |
||||||
|
) |
||||||
|
|
||||||
|
// Add counter per code on the fly (no need to update code if there are additional codes)
|
||||||
|
for code, count := range stats.RequestCounters { |
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descRequestCount, prometheus.CounterValue, float64(count), code, |
||||||
|
) |
||||||
|
} |
||||||
|
|
||||||
|
// Add total of all requests
|
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descRequestCountTotal, prometheus.CounterValue, float64(stats.Duration.Count), |
||||||
|
) |
||||||
|
|
||||||
|
// Add rates per code on the fly (no need to update code if there are additional codes)
|
||||||
|
ratesSum := 0 |
||||||
|
for code, count := range stats.RequestRates { |
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descRequestRates, prometheus.GaugeValue, float64(count), code, |
||||||
|
) |
||||||
|
ratesSum += count |
||||||
|
} |
||||||
|
// Additional sum of the rates, such like the requestCounter
|
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descRequestRatesTotal, prometheus.CounterValue, float64(ratesSum), |
||||||
|
) |
||||||
|
|
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descDurationSum, prometheus.CounterValue, stats.Duration.Sum, |
||||||
|
) |
||||||
|
ch <- prometheus.MustNewConstMetric( |
||||||
|
descDurationAvg, prometheus.GaugeValue, stats.Duration.Average, |
||||||
|
) |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
func NewExporter(endpoint string) (*Exporter, error) { |
||||||
|
|
||||||
|
if !strings.Contains(endpoint, "://") { |
||||||
|
endpoint = "http://" + endpoint |
||||||
|
} |
||||||
|
u, err := url.Parse(endpoint) |
||||||
|
if err != nil { |
||||||
|
return nil, fmt.Errorf("invalid endpoint URL: %s", err) |
||||||
|
} |
||||||
|
if u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") { |
||||||
|
return nil, fmt.Errorf("invalid endpoint URL: %s", endpoint) |
||||||
|
} |
||||||
|
|
||||||
|
// use custom http client with specific timeout
|
||||||
|
client := &http.Client{ |
||||||
|
Timeout: time.Duration(100*time.Millisecond), |
||||||
|
} |
||||||
|
|
||||||
|
// create api client
|
||||||
|
appScraper := &appScraper{ |
||||||
|
client: client, |
||||||
|
endpoint: endpoint, |
||||||
|
} |
||||||
|
|
||||||
|
return &Exporter{ |
||||||
|
scraper: appScraper, |
||||||
|
}, nil |
||||||
|
} |
||||||
|
|
||||||
|
// init registers the standard build-info collector so the exporter
// exposes its own version metadata under the "app_exporter" name.
func init() {
	prometheus.MustRegister(version.NewCollector(fmt.Sprintf("%s_exporter", namespace)))
}
||||||
|
|
||||||
|
// main builds the exporter from the -endpoint flag (flags are parsed
// earlier, see parseConstLabel), registers it, and serves the metrics
// endpoint plus a minimal landing page until terminated.
func main() {

	exporter, err := NewExporter(*endpointApp)
	if err != nil {
		log.Fatal(err)
	}
	prometheus.MustRegister(exporter)

	http.Handle(metricsPath, promhttp.Handler())
	// Minimal landing page linking to the metrics endpoint.
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(`<html>
<head><title>App Exporter</title></head>
<body>
<h1>App Exporter</h1>
<p><a href='` + metricsPath + `'>Metrics</a></p>
</body>
</html>`))
	})

	log.Infoln("Listening on", *listenAddress)
	log.Fatal(http.ListenAndServe(*listenAddress, nil))
}
||||||
@ -0,0 +1,116 @@ |
|||||||
|
package main |
||||||
|
|
||||||
|
import ( |
||||||
|
"testing" |
||||||
|
"net/http/httptest" |
||||||
|
"fmt" |
||||||
|
"net/http" |
||||||
|
"reflect" |
||||||
|
) |
||||||
|
|
||||||
|
func TestScraper(t *testing.T) { |
||||||
|
|
||||||
|
tests := []struct { |
||||||
|
json string |
||||||
|
expected *AppStats |
||||||
|
ok bool |
||||||
|
}{ |
||||||
|
{ |
||||||
|
json: ` |
||||||
|
{ |
||||||
|
"requestCounters": { |
||||||
|
"200": 65221, |
||||||
|
"404": 14066, |
||||||
|
"500": 12618 |
||||||
|
}, |
||||||
|
"requestRates": { |
||||||
|
"200": 100, |
||||||
|
"404": 1 |
||||||
|
}, |
||||||
|
"duration": { |
||||||
|
"count": 91905, |
||||||
|
"sum": 4484.3037570333245, |
||||||
|
"average": 0.024613801985478054 |
||||||
|
} |
||||||
|
} |
||||||
|
`, |
||||||
|
expected: &AppStats{ |
||||||
|
RequestCounters: map[string]int{ |
||||||
|
"200": 65221, |
||||||
|
"404": 14066, |
||||||
|
"500": 12618, |
||||||
|
}, |
||||||
|
RequestRates: map[string]int{ |
||||||
|
"200": 100, |
||||||
|
"404": 1, |
||||||
|
}, |
||||||
|
Duration: &AppDuration{ |
||||||
|
Count: 91905, |
||||||
|
Sum: 4484.3037570333245, |
||||||
|
Average: 0.024613801985478054, |
||||||
|
|
||||||
|
}, |
||||||
|
}, |
||||||
|
ok: true, |
||||||
|
}, |
||||||
|
{ |
||||||
|
json: "invalid", |
||||||
|
ok: false, |
||||||
|
}, |
||||||
|
} |
||||||
|
|
||||||
|
for i, test := range tests { |
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { |
||||||
|
w.Header().Set("Content-Type", "application/json") |
||||||
|
fmt.Fprintln(w, test.json) |
||||||
|
})) |
||||||
|
defer server.Close() |
||||||
|
|
||||||
|
scraper := appScraper{ |
||||||
|
endpoint: server.URL, |
||||||
|
client: http.DefaultClient, |
||||||
|
} |
||||||
|
stats, err := scraper.stats() |
||||||
|
|
||||||
|
if err != nil{ |
||||||
|
if !test.ok{ |
||||||
|
continue |
||||||
|
} |
||||||
|
t.Fatalf("Test %v: http.Get(%q) unexpected error: %v", i, server.URL, err) |
||||||
|
} |
||||||
|
|
||||||
|
if !reflect.DeepEqual(*test.expected.Duration, *stats.Duration) { |
||||||
|
t.Fatalf("Test %v: Duration expected %v, got %v", i, *test.expected.Duration, *stats.Duration) |
||||||
|
} |
||||||
|
if !reflect.DeepEqual(test.expected.RequestCounters, stats.RequestCounters) { |
||||||
|
t.Fatalf("Test %v: RequestCounters expected %v, got %v", i, test.expected.RequestCounters, stats.RequestCounters) |
||||||
|
} |
||||||
|
if !reflect.DeepEqual(test.expected.RequestRates, stats.RequestRates) { |
||||||
|
t.Fatalf("Test %v: RequestRates expected %v, got %v", i, test.expected.RequestRates, stats.RequestRates) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Yes, this is stolen from the consul_exporter, but why reinvent the wheel? ;-)
|
||||||
|
func TestNewExporter(t *testing.T) { |
||||||
|
cases := []struct { |
||||||
|
uri string |
||||||
|
ok bool |
||||||
|
}{ |
||||||
|
{uri: "", ok: false}, |
||||||
|
{uri: "localhost:8500", ok: true}, |
||||||
|
{uri: "https://localhost:8500", ok: true}, |
||||||
|
{uri: "http://some.where:8500", ok: true}, |
||||||
|
{uri: "fuuuu://localhost:8500", ok: false}, |
||||||
|
} |
||||||
|
|
||||||
|
for _, test := range cases { |
||||||
|
_, err := NewExporter(test.uri) |
||||||
|
if test.ok && err != nil { |
||||||
|
t.Errorf("expected no error w/ %q, but got %q", test.uri, err) |
||||||
|
} |
||||||
|
if !test.ok && err == nil { |
||||||
|
t.Errorf("expected error w/ %q, but got %q", test.uri, err) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
Loading…
Reference in new issue