commit fba1a94836f131945d5da6b92ef1e775c85cf700
Author: Richard Hillmann
Date:   Tue Mar 14 00:23:31 2017 +0100

    init project

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9493596
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea/
+exporter
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d548fc6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+FROM busybox:latest
+
+COPY exporter /bin/exporter
+
+EXPOSE 8081
+
+ENTRYPOINT ["/bin/exporter"]
\ No newline at end of file
diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
new file mode 100644
index 0000000..1de3192
--- /dev/null
+++ b/INSTRUCTIONS.md
@@ -0,0 +1,91 @@
+# SysEng coding challenge: Prometheus exporter
+
+Thanks for trying our coding challenge for systems engineers.
+
+## Prerequisites
+
+You need to be able to run Docker on your development computer to work on this
+challenge. Please run
+```
+docker run -it -p 8080:8080 beorn7/syseng-challenge
+```
+
+Then run `curl http://localhost:8080/stats`. The output should look similar to
+the following:
+
+```json
+{
+  "requestCounters": {
+    "200": 65221,
+    "404": 14066,
+    "500": 12618
+  },
+  "requestRates": {
+    "200": 100,
+    "404": 1
+  },
+  "duration": {
+    "count": 91905,
+    "sum": 4484.3037570333245,
+    "average": 0.024613801985478054
+  }
+}
+```
+
+If you aren't already, you should make yourself familiar with the
+[Prometheus monitoring and alerting system](https://prometheus.io). A good
+starting point is Brian Brazil's very concise
+[talk](https://www.youtube.com/watch?v=cwRmXqXKGtk) at FOSDEM 2016.
+
+## The Challenge
+
+Imagine the little binary you have started above is an instance of a
+microservice that is running replicated with hundreds of instances on a
+computing cluster. For the sake of the challenge, we are not interested in what
+the service is actually doing. We are only interested in its metrics, and we
+want to simulate monitoring it with Prometheus. Thankfully, the service
+provides metrics in JSON format via its `/stats` endpoint, as you have seen
+above. The `requestCounters` tell you how often each HTTP status code has been
+served during the lifetime of the binary. The `requestRates` tell you the same
+but for the last second, i.e. they give you the current QPS. The `duration`
+tells you how many requests have been served in total during the lifetime of
+the binary (`count`) and how much total time those requests have taken in
+seconds (`sum`). The `average` is the time in seconds a request has taken,
+averaged over the last second.
+
+Unfortunately, Prometheus cannot ingest JSON directly but requires a custom
+format. Usually, you would write your microservice in a way that it would
+expose metrics in a format suitable for Prometheus directly. Let's imagine that
+this direct instrumentation is, for some reason, not feasible in this case. (In
+reality, this situation often arises when monitoring 3rd party software that
+does not happen to be instrumented for Prometheus specifically.) The usual
+solution is to write a so-called
+[exporter](https://prometheus.io/docs/instrumenting/exporters/), a little glue
+program that retrieves metrics from a 3rd party system and exposes them in the
+Prometheus way.
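+
+To make the target concrete (this sample is purely illustrative, not part of
+the challenge contract; metric and label names are yours to choose), the JSON
+above could translate into the Prometheus text exposition format along these
+lines:
+
+```
+app_request_count{code="200"} 65221
+app_request_rates{code="200"} 100
+app_duration_count 91905
+app_duration_sum 4484.3037570333245
+```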
+
+Your task is to write, in a language of your choice, such an exporter for the
+simulated microservice running in your Docker container right now.
+
+It might be helpful to also start a Prometheus server, scrape your exporter,
+and explore the possibilities enabled by your metrics. The simulated
+microservice instance is looping through a number of scenarios over the course
+of about 15 minutes.
+
+## Bonus questions
+
+Optionally, you may answer the following questions. Thinking about them might
+also help you solve the coding challenge in a meaningful way. Keep answers
+short. It's really just about sketching out a few ideas. If we invite you for
+on-site interviews, we will have plenty of time to discuss them in detail.
+
+1. What are good ways of deploying hundreds of instances of our simulated
+   service? How would you deploy your exporter? And how would you configure
+   Prometheus to monitor them all?
+2. What graphs about the service would you plot in a dashboard builder like
+   Grafana? Ideally, you can come up with PromQL expressions for them.
+3. What would you alert on? What would be the urgency of the various alerts?
+   Again, it would be great if you could formulate alerting conditions with
+   PromQL.
+4. If you were in control of the microservice, which exported metrics would you
+   add or modify next?
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4b6e64b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,34 @@
+GO   := GO15VENDOREXPERIMENT=1 go
+pkgs  = $(shell $(GO) list ./... | grep -v /vendor/)
+
+DOCKER_IMAGE_NAME ?= exporter
+DOCKER_IMAGE_TAG  ?= latest
+
+all: format build test
+
+style:
+	@echo ">> checking code style"
+	@! gofmt -d $(shell find . -path ./vendor -prune -o -name '*.go' -print) | grep '^'
+
+test:
+	@echo ">> running tests"
+	@$(GO) test -short $(pkgs)
+
+format:
+	@echo ">> formatting code"
+	@$(GO) fmt $(pkgs)
+
+vet:
+	@echo ">> vetting code"
+	@$(GO) vet $(pkgs)
+
+build:
+	@echo ">> building binaries"
+	@$(GO) build -o exporter
+
+docker:
+	@echo ">> building docker image"
+	@docker build -t "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .
+
+.PHONY: all style format build test vet docker
\ No newline at end of file
diff --git a/exporter.go b/exporter.go
new file mode 100644
index 0000000..f1d6189
--- /dev/null
+++ b/exporter.go
@@ -0,0 +1,263 @@
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"github.com/prometheus/common/log"
+	"github.com/prometheus/common/version"
+)
+
+const (
+	metricsPath = "/metrics"
+	namespace   = "app"
+)
+
+var (
+	// The exporter listens on :8081 (the port exposed in the Dockerfile),
+	// while the application serves its stats on :8080 (see INSTRUCTIONS.md).
+	listenAddress = flag.String("listen", ":8081", "The address to listen on for HTTP requests.")
+	endpointApp   = flag.String("endpoint", "http://localhost:8080/stats", "HTTP API address of the application")
+	hostname      = flag.String("hostname", "", "Optional hostname which will be added to the exported metrics (defaults to $HOSTNAME)")
+
+	prometheusConstLabel = parseConstLabel()
+)
+
+// parseConstLabel must run before the descriptors below are created.
+func parseConstLabel() prometheus.Labels {
+	// Parse flags at this early stage, so we can retrieve the instance id.
+	flag.Parse()
+
+	if *hostname == "" {
+		// Fall back to the HOSTNAME env var, so we can see if a pod does
+		// not work as expected.
+		if value, found := syscall.Getenv("HOSTNAME"); found {
+			hostname = &value
+		}
+	}
+
+	// Only generate the constant label if a hostname is present.
+	if *hostname != "" {
+		return prometheus.Labels{"hostname": *hostname}
+	}
+	return prometheus.Labels{}
+}
+
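+// For illustration (an editorial note, not part of the original commit):
+// prometheus.BuildFQName joins its non-empty parts with underscores, so
+// prometheus.BuildFQName("app", "", "request_count") yields the metric name
+// "app_request_count". Each descriptor below additionally carries the
+// optional hostname const label produced by parseConstLabel.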
+var (
+	// Create the Prometheus descriptors.
+	descUp = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "up"),
+		"Was the last query successful.",
+		nil, prometheusConstLabel,
+	)
+	descRequestCount = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "request_count"),
+		"How many requests have been processed, partitioned by status code.",
+		[]string{"code"}, prometheusConstLabel,
+	)
+	descRequestCountTotal = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "request_count_total"),
+		"How many requests have been processed across all status codes.",
+		nil, prometheusConstLabel,
+	)
+	descRequestRates = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "request_rates"),
+		"How many requests were processed in the last second, partitioned by status code.",
+		[]string{"code"}, prometheusConstLabel,
+	)
+	descRequestRatesTotal = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "request_rates_total"),
+		"How many requests were processed in the last second.",
+		nil, prometheusConstLabel,
+	)
+	descDurationSum = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "duration_sum"),
+		"Total time the requests have taken, in seconds.",
+		nil, prometheusConstLabel,
+	)
+	descDurationAvg = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "duration_avg"),
+		"Average time a request has taken in the last second, in seconds.",
+		nil, prometheusConstLabel,
+	)
+)
+
+// AppStats represents the schema of the returned JSON.
+type AppStats struct {
+	// RequestCounters are the served status codes during the app lifetime.
+	RequestCounters map[string]int `json:"requestCounters"`
+	// RequestRates are the served status codes for the last second (QPS).
+	RequestRates map[string]int `json:"requestRates"`
+	// Duration represents request stats over the app lifetime.
+	Duration *AppDuration `json:"duration"`
+}
+
+// AppDuration is the schema of the duration part of the returned JSON.
+type AppDuration struct {
+	// Count is the total number of requests served during the lifetime.
+	Count int `json:"count"`
+	// Sum is the total time the requests have taken, in seconds.
+	Sum float64 `json:"sum"`
+	// Average is the time in seconds a request has taken, averaged over the
+	// last second.
+	Average float64 `json:"average"`
+}
+
+// appScraper is a helper to retrieve stats in a generic way.
+type appScraper struct {
+	endpoint string
+	client   *http.Client
+}
+
+// stats returns the fetched and parsed JSON.
+func (s *appScraper) stats() (*AppStats, error) {
+	var stats AppStats
+	response, err := s.client.Get(s.endpoint)
+	if err != nil {
+		return nil, err
+	}
+	defer response.Body.Close()
+
+	buf, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := json.Unmarshal(buf, &stats); err != nil {
+		return nil, err
+	}
+
+	// Validate that the returned JSON is complete. A missing requestCounters
+	// or requestRates decodes to a nil map, which is safe to range over, so
+	// only duration needs an explicit check.
+	if stats.Duration == nil {
+		return nil, fmt.Errorf("invalid JSON returned, could not retrieve duration")
+	}
+
+	return &stats, nil
+}
+
+// Exporter implements prometheus.Collector.
+type Exporter struct {
+	scraper *appScraper
+}
+
+// Describe implements prometheus.Collector.
+func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
+	ch <- descUp
+	ch <- descRequestCount
+	ch <- descRequestCountTotal
+	ch <- descRequestRates
+	ch <- descRequestRatesTotal
+	ch <- descDurationSum
+	ch <- descDurationAvg
+}
+
+// Collect implements prometheus.Collector.
+func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
+	stats, err := e.scraper.stats()
+	if err != nil {
+		ch <- prometheus.MustNewConstMetric(
+			descUp, prometheus.GaugeValue, 0,
+		)
+		log.Error("Failed to scrape app stats: ", err)
+		return
+	}
+	ch <- prometheus.MustNewConstMetric(
+		descUp, prometheus.GaugeValue, 1,
+	)
+
+	// Add a counter per code on the fly (no code change needed if additional
+	// status codes show up).
+	for code, count := range stats.RequestCounters {
+		ch <- prometheus.MustNewConstMetric(
+			descRequestCount, prometheus.CounterValue, float64(count), code,
+		)
+	}
+
+	// Add the total of all requests.
+	ch <- prometheus.MustNewConstMetric(
+		descRequestCountTotal, prometheus.CounterValue, float64(stats.Duration.Count),
+	)
+
+	// Add a rate per code on the fly (no code change needed if additional
+	// status codes show up).
+	ratesSum := 0
+	for code, count := range stats.RequestRates {
+		ch <- prometheus.MustNewConstMetric(
+			descRequestRates, prometheus.GaugeValue, float64(count), code,
+		)
+		ratesSum += count
+	}
+	// Additionally export the sum of the rates, analogous to the request
+	// counter total. The rates are per-second snapshots, so this is a gauge.
+	ch <- prometheus.MustNewConstMetric(
+		descRequestRatesTotal, prometheus.GaugeValue, float64(ratesSum),
+	)
+
+	ch <- prometheus.MustNewConstMetric(
+		descDurationSum, prometheus.CounterValue, stats.Duration.Sum,
+	)
+	ch <- prometheus.MustNewConstMetric(
+		descDurationAvg, prometheus.GaugeValue, stats.Duration.Average,
+	)
+}
+
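+// An editorial usage sketch (not part of the original commit): NewExporter
+// accepts bare host:port endpoints and normalizes them by prepending
+// "http://", so the following two calls are equivalent:
+//
+//	NewExporter("localhost:8080/stats")
+//	NewExporter("http://localhost:8080/stats")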
+func NewExporter(endpoint string) (*Exporter, error) {
+	if !strings.Contains(endpoint, "://") {
+		endpoint = "http://" + endpoint
+	}
+	u, err := url.Parse(endpoint)
+	if err != nil {
+		return nil, fmt.Errorf("invalid endpoint URL: %s", err)
+	}
+	if u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") {
+		return nil, fmt.Errorf("invalid endpoint URL: %s", endpoint)
+	}
+
+	// Use a custom HTTP client with a tight timeout, so a hanging
+	// application cannot stall the scrape.
+	client := &http.Client{
+		Timeout: 100 * time.Millisecond,
+	}
+
+	// Create the API client.
+	appScraper := &appScraper{
+		client:   client,
+		endpoint: endpoint,
+	}
+
+	return &Exporter{
+		scraper: appScraper,
+	}, nil
+}
+
+func init() {
+	// Export build information about the exporter binary itself.
+	prometheus.MustRegister(version.NewCollector(fmt.Sprintf("%s_exporter", namespace)))
+}
+
+func main() {
+	exporter, err := NewExporter(*endpointApp)
+	if err != nil {
+		log.Fatal(err)
+	}
+	prometheus.MustRegister(exporter)
+
+	http.Handle(metricsPath, promhttp.Handler())
+	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte(`<html>
+			<head><title>App Exporter</title></head>
+			<body>
+			<h1>App Exporter</h1>
+			<p><a href="` + metricsPath + `">Metrics</a></p>
+			</body>
+			</html>`))
+	})
+
+	log.Infoln("Listening on", *listenAddress)
+	log.Fatal(http.ListenAndServe(*listenAddress, nil))
+}
diff --git a/exporter_test.go b/exporter_test.go
new file mode 100644
index 0000000..f07553c
--- /dev/null
+++ b/exporter_test.go
@@ -0,0 +1,116 @@
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"reflect"
+	"testing"
+)
+
+func TestScraper(t *testing.T) {
+	tests := []struct {
+		json     string
+		expected *AppStats
+		ok       bool
+	}{
+		{
+			json: `
+{
+  "requestCounters": {
+    "200": 65221,
+    "404": 14066,
+    "500": 12618
+  },
+  "requestRates": {
+    "200": 100,
+    "404": 1
+  },
+  "duration": {
+    "count": 91905,
+    "sum": 4484.3037570333245,
+    "average": 0.024613801985478054
+  }
+}
+`,
+			expected: &AppStats{
+				RequestCounters: map[string]int{
+					"200": 65221,
+					"404": 14066,
+					"500": 12618,
+				},
+				RequestRates: map[string]int{
+					"200": 100,
+					"404": 1,
+				},
+				Duration: &AppDuration{
+					Count:   91905,
+					Sum:     4484.3037570333245,
+					Average: 0.024613801985478054,
+				},
+			},
+			ok: true,
+		},
+		{
+			json: "invalid",
+			ok:   false,
+		},
+	}
+
+	for i, test := range tests {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "application/json")
+			fmt.Fprintln(w, test.json)
+		}))
+		defer server.Close()
+
+		scraper := appScraper{
+			endpoint: server.URL,
+			client:   http.DefaultClient,
+		}
+		stats, err := scraper.stats()
+
+		if err != nil {
+			if !test.ok {
+				continue
+			}
+			t.Fatalf("Test %v: http.Get(%q) unexpected error: %v", i, server.URL, err)
+		}
+
+		if !reflect.DeepEqual(*test.expected.Duration, *stats.Duration) {
+			t.Fatalf("Test %v: Duration expected %v, got %v", i, *test.expected.Duration, *stats.Duration)
+		}
+		if !reflect.DeepEqual(test.expected.RequestCounters, stats.RequestCounters) {
+			t.Fatalf("Test %v: RequestCounters expected %v, got %v", i, test.expected.RequestCounters, stats.RequestCounters)
+		}
+		if !reflect.DeepEqual(test.expected.RequestRates, stats.RequestRates) {
+			t.Fatalf("Test %v: RequestRates expected %v, got %v", i, test.expected.RequestRates, stats.RequestRates)
+		}
+	}
+}
+
+// Yes, this is stolen from the consul_exporter, but why reinvent the wheel? ;-)
+func TestNewExporter(t *testing.T) {
+	cases := []struct {
+		uri string
+		ok  bool
+	}{
+		{uri: "", ok: false},
+		{uri: "localhost:8500", ok: true},
+		{uri: "https://localhost:8500", ok: true},
+		{uri: "http://some.where:8500", ok: true},
+		{uri: "fuuuu://localhost:8500", ok: false},
+	}
+
+	for _, test := range cases {
+		_, err := NewExporter(test.uri)
+		if test.ok && err != nil {
+			t.Errorf("expected no error w/ %q, but got %q", test.uri, err)
+		}
+		if !test.ok && err == nil {
+			t.Errorf("expected error w/ %q, but got %q", test.uri, err)
+		}
+	}
+}
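+
+// An editorial sketch, not part of the original commit: an end-to-end check
+// that registers the Exporter against a throwaway registry and verifies that
+// gathering succeeds. It assumes that
+// "github.com/prometheus/client_golang/prometheus" is added to the imports
+// above; the JSON payload is a hypothetical minimal sample.
+func TestCollectGathers(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprintln(w, `{"requestCounters":{"200":1},"requestRates":{"200":1},"duration":{"count":1,"sum":0.5,"average":0.5}}`)
+	}))
+	defer server.Close()
+
+	exporter, err := NewExporter(server.URL)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// A private registry keeps the test independent of the default one.
+	registry := prometheus.NewRegistry()
+	registry.MustRegister(exporter)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("gathering metrics failed: %v", err)
+	}
+	if len(families) == 0 {
+		t.Fatal("expected at least one metric family")
+	}
+}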