Otel grafana #544

Open · wants to merge 4 commits into base: master
1,135 changes: 1,135 additions & 0 deletions context/otel-stack/dashboards/dashboard-apm.json

Large diffs are not rendered by default.

1,139 changes: 1,139 additions & 0 deletions context/otel-stack/dashboards/dashboard-top.json

Large diffs are not rendered by default.

Empty file.
30 changes: 30 additions & 0 deletions context/otel-stack/grafana-datasources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
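# Provisions two Grafana datasources: Tempo (the default, for traces) and Prometheus.
# The serviceMap entry points the Tempo datasource at Prometheus, so Grafana can
# build the service graph from the span metrics Tempo's metrics-generator writes there.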
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    uid: prometheus
    access: proxy
    orgId: 1
    url: http://prometheus:9090
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    jsonData:
      httpMethod: GET
  - name: Tempo
    type: tempo
    access: proxy
    orgId: 1
    url: http://tempo:3200
    basicAuth: false
    isDefault: true
    version: 1
    editable: false
    apiVersion: 1
    uid: tempo
    jsonData:
      httpMethod: GET
      serviceMap:
        datasourceUid: prometheus
16 changes: 16 additions & 0 deletions context/otel-stack/otel-collector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
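# Receives OTLP traces from the applications over gRPC (4317) and HTTP (4318)
# and forwards them unchanged to Tempo's OTLP gRPC endpoint.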
receivers:
  otlp:
    protocols:
      grpc:
      http:

extensions:
  health_check: # serves the health endpoint on :13133 that the compose healthcheck probes

exporters:
  otlp:
    endpoint: tempo:4317
    tls:
      insecure: true

service:
  extensions: [health_check]
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [otlp]
11 changes: 11 additions & 0 deletions context/otel-stack/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
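# Prometheus scrapes itself and Tempo's own metrics. Tempo's metrics-generator
# additionally pushes span metrics in through the remote-write receiver, which
# is enabled via a command-line flag in the compose definition.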
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'tempo'
    static_configs:
      - targets: ['tempo:3200']
65 changes: 65 additions & 0 deletions context/otel-stack/tempo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 5s

distributor:
  receivers: # this configuration will listen on all ports and protocols that tempo is capable of.
    jaeger: # the receivers all come from the OpenTelemetry collector. more configuration information can
      protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http: #
        grpc: # for a production deployment you should only enable the receivers you need!
        thrift_binary:
        thrift_compact:
    zipkin:
    otlp:
      protocols:
        http:
        grpc:
    opencensus:

ingester:
  max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally

compactor:
  compaction:
    block_retention: 1h # overall Tempo trace retention. set for demo purposes

metrics_generator:
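  # Derives service-graph and span metrics from incoming traces and
  # remote-writes them to the Prometheus container configured below.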
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces

storage:
  trace:
    backend: local # backend configuration to use
    wal:
      path: /var/tempo/wal # where to store the wal locally
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    global:
      max_bytes_per_trace: 100000000
    ingestion:
      max_traces_per_user: 10000
    read:
      max_bytes_per_tag_values_query: 100000000
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator
31 changes: 30 additions & 1 deletion docs/06-configuring-services.md
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@ This document describes configuration options of the services shipped with Spryk
* [WebDriver](#webdriver)
* [Dashboard](#dashboard)
* [Tideways](#tideways)
* [Local OpenTelemetry Stack](#local-opentelemetry-stack)


## Prerequisites
@@ -528,7 +529,7 @@ docker/sdk up

By default, in the New Relic dashboard, the APM is displayed as `company-staging-newrelic-app`. To improve visibility, you may want to configure each application as a separate APM. For example, `YVES-DE (docker.dev)`.

To do it, adjust the Monitoring service in `src/Pyz/Service/Monitoring/MonitoringDependencyProvider.php`:

```php
<?php declare(strict_types = 1);
@@ -655,3 +656,31 @@ tideways:
docker/sdk boot deploy.*.yml &&\
docker/sdk up
```

## Local OpenTelemetry Stack

The Local OpenTelemetry Stack provides real-time application performance monitoring (APM) in your local environment. It lets you collect and inspect APM traces from all containers running PHP applications, or only from specific ones.

This stack integrates the following containers into your local environment:
* tempo-init: prepares the Tempo data volume so Tempo can start with the correct permissions
* tempo: stores traces and serves them to Grafana
* collector: receives OTLP traces from the applications and forwards them to Tempo
* prometheus: stores metrics, including the span metrics generated by Tempo
* grafana: the web UI for exploring traces, metrics, and dashboards
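
Applications ship their traces to the `collector` container over OTLP. As a minimal sketch, an instrumented PHP application container could be pointed at the collector with the standard OpenTelemetry SDK environment variables; the service name below is illustrative and not part of this setup:

```yaml
# Hypothetical sketch: OTLP export settings for an instrumented PHP container.
# Variable names follow the OpenTelemetry SDK specification; the value of
# OTEL_SERVICE_NAME is illustrative.
environment:
  - OTEL_SERVICE_NAME=yves-de
  - OTEL_TRACES_EXPORTER=otlp
  - OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
  - OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4318 # the collector's OTLP/HTTP port
```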

### Configuration

1. Adjust your `deploy.*.yml` as follows:

```yaml
grafana:
  engine: otel-stack
  endpoints:
    grafana:
```
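
The `engine: otel-stack` value selects the matching service template, which adds the five containers listed above to the generated `docker-compose` file.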

2. Bootstrap the docker setup and rebuild the application:
```bash
docker/sdk boot deploy.*.yml &&\
docker/sdk up
```
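
Once the containers are up, Grafana is exposed on port 3000 with anonymous admin access (the login form is disabled), Prometheus on port 9090, and the collector accepts OTLP on ports 4317 (gRPC) and 4318 (HTTP). Tempo is provisioned as the default Grafana datasource, so traces can be queried directly with TraceQL.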
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{% extends "nginx/http/gateway/server.conf.twig" %}
{% block upstream %}{{ upstream }}:3000{% endblock upstream %}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{% include "service/#{engine}/latest/#{engine}.yml.twig" with _context only %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@

# Tempo runs as user 10001, and docker compose creates the volume as root.
# As such, we need to chown the volume in order for Tempo to start correctly.
tempo-init:
  image: &tempoImage grafana/tempo:main-1a21818
  user: root
  entrypoint:
    - "chown"
    - "10001:10001"
    - "/var/tempo"
  volumes:
    - {{ serviceName }}-{{ serviceData['engine'] }}-data:/var/tempo:rw

tempo:
  image: *tempoImage
  command: [ "-config.file=/etc/tempo.yaml" ]
  volumes:
    - ./${DEPLOYMENT_PATH}/context/otel-stack/tempo.yaml:/etc/tempo.yaml
    - {{ serviceName }}-{{ serviceData['engine'] }}-data:/var/tempo:rw
  networks:
    - private
  ports:
    - "14268" # jaeger ingest
    - "3200"  # tempo
    - "4317"  # otlp grpc
    - "4318"  # otlp http
    - "9411"  # zipkin
  depends_on:
    - tempo-init
  healthcheck:
    test: [ "CMD", "curl", "-f", "http://localhost:3200/metrics" ]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s

collector:
  image: otel/opentelemetry-collector:0.86.0
  command: [ "--config=/etc/otel-collector.yaml" ]
  volumes:
    - ./${DEPLOYMENT_PATH}/context/otel-stack/otel-collector.yaml:/etc/otel-collector.yaml
  networks:
    - private
  healthcheck:
    test: [ "CMD", "curl", "-f", "http://localhost:13133/health" ]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s
  ports:
    - "4318:4318"
    - "4317:4317"

prometheus:
  image: prom/prometheus:latest
  command:
    - --config.file=/etc/prometheus.yaml
    - --web.enable-remote-write-receiver
    - --enable-feature=exemplar-storage
  networks:
    - private
  volumes:
    - ./${DEPLOYMENT_PATH}/context/otel-stack/prometheus.yaml:/etc/prometheus.yaml
  ports:
    - "9090:9090"
  healthcheck:
    test: [ "CMD", "curl", "-f", "http://localhost:9090/-/healthy" ]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s

grafana:
  image: grafana/grafana:11.0.0
  volumes:
    - ./${DEPLOYMENT_PATH}/context/otel-stack/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
  networks:
    - public
    - private
  environment:
    - GF_AUTH_ANONYMOUS_ENABLED=true
    - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    - GF_AUTH_DISABLE_LOGIN_FORM=true
    - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
  labels:
    'spryker.app.name': grafana
    'spryker.app.type': hidden
    'spryker.project': ${SPRYKER_DOCKER_PREFIX}:${SPRYKER_DOCKER_TAG}
  ports:
    - "3000:3000"
  healthcheck:
    test: [ "CMD", "curl", "-f", "http://localhost:3000/" ]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 10s