Skip to content

Commit

Permalink
Merge branch 'main' into link-rule
Browse files Browse the repository at this point in the history
  • Loading branch information
SOF3 committed Sep 6, 2023
2 parents 88c0a46 + 11d3a9f commit 4fca547
Show file tree
Hide file tree
Showing 73 changed files with 2,712 additions and 746 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ jobs:
TRACE_ID=$(grep "Location: /trace/" curl-output.http | cut -d/ -f3 | tr -d '\r')
mkdir -p output/api/traces
for mode in ff{0,1,2,3}{0,1}; do
mode_trace=${mode}${TRACE_ID:4}
for mode in ff{0,1,2,3}{0,1}00000{0,1}; do
mode_trace=${mode}${TRACE_ID:10}
curl -o output/api/traces/$mode_trace http://localhost:16686/api/traces/$mode_trace
done
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/chart-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ jobs:
id: tag-name
run: echo "IMAGE_TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT

- run: helm package charts/kelemetry --app-version="${IMAGE_TAG}" --version="${{steps.tag-name.outputs.IMAGE_TAG}}" -d output
- run: helm package charts/kelemetry --app-version="${{steps.tag-name.outputs.IMAGE_TAG}}" --version="${{steps.tag-name.outputs.IMAGE_TAG}}" -d output
- run: helm push output/kelemetry-chart-${{steps.tag-name.outputs.IMAGE_TAG}}.tgz oci://ghcr.io/kubewharf
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ COPY --from=build /src/kelemetry /usr/local/bin/kelemetry
RUN mkdir -p /app/hack
WORKDIR /app
ADD hack/tfconfig.yaml hack/tfconfig.yaml
RUN sed -i 's/127\.0\.0\.1:17272/remote-badger:17271/g' hack/tfconfig.yaml

ENTRYPOINT ["kelemetry"]
15 changes: 9 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ CLUSTER_NAME ?= tracetest
KUBECONFIGS ?= $(CLUSTER_NAME)=$(KUBECONFIG)

PORT ?= 8080
LOG_LEVEL ?= debug
KLOG_VERBOSITY ?= 5
LOG_LEVEL ?= info
KLOG_VERBOSITY ?= 3

RACE_ARG := -race
ifdef SKIP_DETECT_RACE
Expand Down Expand Up @@ -133,13 +133,16 @@ kind:
sed "s/host.docker.internal/$$( \
docker network inspect kind -f '{{(index .IPAM.Config 0).Gateway}}' \
)/g" hack/audit-kubeconfig.yaml >hack/audit-kubeconfig.local.yaml
sed "s/host.docker.internal/$$( \
docker network inspect kind -f '{{(index .IPAM.Config 0).Gateway}}' \
)/g" hack/tracing-config.yaml >hack/tracing-config.local.yaml
cd hack && kind create cluster --config kind-cluster.yaml

COMPOSE_COMMAND ?= up --build -d --remove-orphans

stack:
docker-compose -f dev.docker-compose.yaml up --no-recreate --no-start # create network only
docker-compose \
docker compose -f dev.docker-compose.yaml up --no-recreate --no-start # create network only
docker compose \
-f dev.docker-compose.yaml \
-f <(jq -n \
--arg GATEWAY_ADDR $$(docker network inspect kelemetry_default -f '{{(index .IPAM.Config 0).Gateway}}') \
Expand All @@ -155,13 +158,13 @@ endef

export QUICKSTART_JQ_PATCH
quickstart:
docker-compose -f quickstart.docker-compose.yaml \
docker compose -f quickstart.docker-compose.yaml \
-f <(jq -n --arg KELEMETRY_IMAGE "$(KELEMETRY_IMAGE)" "$$QUICKSTART_JQ_PATCH") \
up --no-recreate --no-start
kubectl config view --raw --minify --flatten --merge >hack/client-kubeconfig.local.yaml
sed -i "s/0\.0\.0\.0/$$(docker network inspect kelemetry_default -f '{{(index .IPAM.Config 0).Gateway}}')/g" hack/client-kubeconfig.local.yaml
sed -i 's/certificate-authority-data: .*$$/insecure-skip-tls-verify: true/' hack/client-kubeconfig.local.yaml
docker-compose -f quickstart.docker-compose.yaml \
docker compose -f quickstart.docker-compose.yaml \
-f <(jq -n --arg KELEMETRY_IMAGE "$(KELEMETRY_IMAGE)" "$$QUICKSTART_JQ_PATCH") \
$(COMPOSE_COMMAND)

Expand Down
2 changes: 1 addition & 1 deletion charts/kelemetry/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ version: 0.1.0
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.1.0"
appVersion: "0.2.1"
33 changes: 24 additions & 9 deletions charts/kelemetry/templates/_helpers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ owner-linker-enable: {{ .Values.linkers.ownerReference }}

{{/* TRACER */}}
tracer-otel-endpoint: {{.Release.Name}}-collector.{{.Release.Namespace}}.svc:4317
tracer-otel-insecure: {{ .Values.collector.insecure }}
{{- end }}

{{- define "kelemetry.object-cache-options" }}
Expand Down Expand Up @@ -177,13 +178,17 @@ kube-other-rest-burst: {{.otherClusterBurst}}
{{- define "kelemetry.diff-cache-options-raw" }}
diff-cache-wrapper-enable: {{.Values.diffCache.memoryWrapper}}

{{- if .Values.diffCache.resourceVersionIndex | eq "Before" }}
diff-cache-use-old-rv: true
{{- else if .Values.diffCache.resourceVersionIndex | eq "After" }}
diff-cache-use-old-rv: false
{{- else }}
{{ printf "Unsupported resource version index type %q" .Values.diffCache.resourceVersionIndex }}
{{- end }}
kube-use-old-resource-version-clusters: [
{{- range .Values.multiCluster.clusters }}
{{- if .resourceVersionIndex | eq "Before" }}
{{toJson .name}},
{{- else if .resourceVersionIndex | eq "After" }}
# not {{toJson .name}},
{{- else }}
{{ printf "Unsupported resource version index type %q" .resourceVersionIndex | fail }}
{{- end }}
{{- end }}
]

{{- if .Values.diffCache.type | eq "etcd" }}
diff-cache: etcd
Expand Down Expand Up @@ -218,6 +223,16 @@ diff-cache-patch-ttl: {{toJson .Values.informers.diff.persistDuration.patch}}
diff-cache-snapshot-ttl: {{toJson .Values.informers.diff.persistDuration.snapshot}}
{{- end }}

{{- define "kelemetry.diff-decorator-options" }}
{{- include "kelemetry.diff-decorator-options-raw" . | include "kelemetry.yaml-to-args" }}
{{- end }}
{{- define "kelemetry.diff-decorator-options-raw" }}
diff-decorator-enable: {{ .Values.consumer.diff.enable | toJson }}
diff-decorator-fetch-backoff: {{ .Values.consumer.diff.backoff | toJson }}
diff-decorator-fetch-event-timeout: {{ .Values.consumer.diff.fetchEventTimeout | toJson }}
diff-decorator-fetch-total-timeout: {{ .Values.consumer.diff.fetchTotalTimeout | toJson }}
{{- end }}

{{- define "kelemetry.event-informer-options" }}
{{- include "kelemetry.event-informer-options-raw" . | include "kelemetry.yaml-to-args" }}
{{- end }}
Expand Down Expand Up @@ -291,14 +306,14 @@ jaeger-trace-cache-etcd-prefix: {{ .Values.frontend.traceCache.etcd.prefix | toJ
{{- if list "badger" "memory" | has .Values.storageBackend.type }}
{{- include "kelemetry.storage-options-stateful-grpc" . }}
{{- else }}
{{- include "kelemetry.storage-options-raw-stateless" . }}
{{- include "kelemetry.storage-options-stateless-raw" . }}
{{- end }}
{{- end }}
{{- define "kelemetry.storage-options-stateful-grpc" }}
span-storage.type: grpc-plugin
grpc-storage.server: {{.Release.Name}}-storage.{{.Release.Namespace}}.svc:17271
{{- end }}
{{- define "kelemetry.storage-options-stateless" }}
{{- define "kelemetry.storage-options-stateless-raw" }}
span-storage.type: {{toJson .Values.storageBackend.type}}
{{- range $key, $value := .Values.storageBackend.options }}
{{ toJson $key }}: {{ toJson $value }}
Expand Down
3 changes: 3 additions & 0 deletions charts/kelemetry/templates/collector.deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ spec:
args: [
{{- include "kelemetry.storage-options-raw" . | include "kelemetry.yaml-to-args" }}
]
env:
- name: COLLECTOR_OTLP_ENABLED
value: "true"
image: {{ printf "%s:%s" .Values.jaegerImages.collector.repository .Values.jaegerImages.collector.tag | toJson }}
imagePullPolicy: {{ toJson .Values.jaegerImages.pullPolicy }}
livenessProbe:
Expand Down
4 changes: 4 additions & 0 deletions charts/kelemetry/templates/consumer.deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ spec:
{{ include "kelemetry.logging-options" .Values.consumer }}
{{ include "kelemetry.kube-options" .Values.consumer }}
{{ include "kelemetry.audit-options" . }}
{{- if .Values.informers.diff.enable }}
{{ include "kelemetry.diff-cache-options" . }}
{{ include "kelemetry.diff-decorator-options" . }}
{{- end }}
]
ports: [
{{- if .Values.consumer.pprof }}
Expand Down
3 changes: 3 additions & 0 deletions charts/kelemetry/templates/informers.rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ rules:
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["*"]
- apiGroups: [""]
resources: ["events"]
verbs: ["*"]
2 changes: 1 addition & 1 deletion charts/kelemetry/templates/storage.deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
imagePullPolicy: {{ toJson .Values.jaegerImages.pullPolicy }}
args: [
        # Don't include storage-options-raw directly here; otherwise the traffic would recurse infinitely.
{{- include "kelemetry.storage-options-stateless" . | include "kelemetry.yaml-to-args" }}
{{- include "kelemetry.storage-options-stateless-raw" . | include "kelemetry.yaml-to-args" }}
]
livenessProbe:
httpGet:
Expand Down
41 changes: 30 additions & 11 deletions charts/kelemetry/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,24 @@ consumer:
otherConfig: {}
# clusterIP: xxx, externalIP: xxx, etc.

# Decorate audit logs with diff info from the diff cache.
diff:
enable: true
# The backoff duration between attempts to fetch diff from the diff cache.
backoff: 1s
# If this duration has elapsed since kube-apiserver sends the ResponseComplete audit stage
# but the diff cache still returns NotFound for the requested diff,
    # the consumer gives up retrying to fetch the diff for this audit event.
# Consumer always tries to fetch at least once even if this duration has elapsed.
fetchEventTimeout: 15s
# The total duration that consumer takes to retry and fetch diff for each audit event.
# Comparing fetchEventTimeout vs fetchTotalTimeout:
    # fetchEventTimeout is non-accumulative, e.g. if the consumer is lagging behind for more than fetchEventTimeout,
    # the consumer gives up after the first attempt;
# meanwhile, fetchTotalTimeout is accumulative, e.g. in the worst case,
# the consumer takes fetchTotalTimeout to process each audit event before processing the next one.
fetchTotalTimeout: 10s

# The audit consumer is primarily CPU-bound.
resources: {}
# limits:
Expand Down Expand Up @@ -168,6 +186,8 @@ consumer:

# Jaeger collector collects otel tracing data and dispatches them to Jaeger storage.
collector:
insecure: true

replicaCount: 3
resources: {}
# limits:
Expand Down Expand Up @@ -270,8 +290,7 @@ storageBackend:
aggregator:
globalTags:
# Tags applied to all object spans (i.e. not actual events, just the parent placeholder)
pseudoSpan:
cluster: foo # one recommended use is to write the cluster name here.
pseudoSpan: {}
# Tags applied to all actual spans (e.g. events, audit logs)
eventSpan: {}

Expand Down Expand Up @@ -322,14 +341,6 @@ objectCache:

# Diff cache stores the object diff from informers so that audit consumer can use it.
diffCache:
# Whether to index by the resourceVersion `Before` or `After` a diff.
# `After` is more accurate, but it requires audit logs to be sent at RequestResponse level,
# which may result in more expensive audit logging costs.
# Use `Before` if you cannot use RequestResponse-level audit logging,
# but it may suffer from accuracy bugs such as duplicate events.
# See <https://github.com/kubernetes/kubernetes/issues/115791>.
resourceVersionIndex: Before

# Whether to persist a layer of read cache in memory to reduce etcd load.
memoryWrapper: true

Expand Down Expand Up @@ -370,9 +381,17 @@ multiCluster:
# This is the list of addresses that audit webhook requests from the apiserver may be sent from.
peerAddresses: [127.0.0.1]

# Whether to index object diff in this cluster by the resourceVersion `Before` or `After` a diff.
# `After` is more accurate, but it requires audit logs to be sent at RequestResponse level,
# which may result in more expensive audit logging costs.
# Use `Before` if you cannot use RequestResponse-level audit logging,
# but it may suffer from accuracy bugs such as duplicate events.
# See <https://github.com/kubernetes/kubernetes/issues/115791>.
resourceVersionIndex: After

kelemetryImage:
repository: ghcr.io/kubewharf/kelemetry
pullPolicy: IfNotPresent
pullPolicy: Always
# Overrides the image tag whose default is the chart appVersion.
tag: ""
pullSecrets: []
Expand Down
14 changes: 12 additions & 2 deletions dev.docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ services:
jaeger-query:
image: jaegertracing/jaeger-query:1.42
environment:
GRPC_STORAGE_SERVER: host.docker.internal:17271 # run on host directly
SPAN_STORAGE_TYPE: grpc-plugin
GRPC_STORAGE_SERVER: host.docker.internal:17271 # run on host directly
ports:
- 0.0.0.0:16686:16686
restart: always
Expand All @@ -39,7 +39,7 @@ services:
SPAN_STORAGE_TYPE: grpc-plugin
GRPC_STORAGE_SERVER: remote-badger:17271
ports:
- 127.0.0.1:4317:4317
- 0.0.0.0:4317:4317
restart: always
# Backend badger storage
# Feel free to override environment.SPAN_STORAGE_TYPE to other storages given the proper configuration.
Expand All @@ -55,6 +55,16 @@ services:
volumes:
- badger:/mnt/badger

# Web frontend for raw trace database view.
jaeger-query-raw:
image: jaegertracing/jaeger-query:1.42
environment:
SPAN_STORAGE_TYPE: grpc-plugin
GRPC_STORAGE_SERVER: remote-badger:17271
ports:
- 0.0.0.0:26686:16686
restart: always

volumes:
etcd: {}
badger: {}
13 changes: 11 additions & 2 deletions docs/DEPLOY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Deploying Kelemetry for Production Clusters

> Note: Due to the variety of cloud providers and cluster management solutions,
> deploying Kelemetry for production might be tricky.
> If you just want to try out the features of Kelemetry,
> follow the [quick start guide](QUICK_START.md) instead,
> which sets up a basic stack locally using Docker.

To minimize data loss and latency and ensure high availability,
we recommend deploying Kelemetry in 3 separate components:
consumers, informers and storage plugin.
Expand Down Expand Up @@ -61,8 +67,11 @@ This setup is bundled into a Helm chart.

## Steps

1. Download [`values.yaml`](charts/kelemetry/values.yaml) and configure the settings.
2. Install the chart: `helm install kelemetry kelemetry oci://ghcr.io/kubewharf/kelemetry-chart --values values.yaml`
1. Download [`values.yaml`](/charts/kelemetry/values.yaml) and configure the settings.
2. Install the chart: `helm install kelemetry oci://ghcr.io/kubewharf/kelemetry-chart --values values.yaml`
3. If you use an audit webhook directly, remember to
[configure the apiserver](https://kubernetes.io/docs/tasks/debug/debug-cluster/audit/#webhook-backend)
   to send audit logs to the webhook.

The default configuration is designed for single-cluster deployment.
For multi-cluster deployment, configure the `sharedEtcd` and `storageBackend` to use a common database.
12 changes: 5 additions & 7 deletions docs/USER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,10 @@ Currently, the following tags are supported:

### Time

Currently, all traces are **rounded down** to the **newest half-hour before the event**.
That is, if you want to look for an event that happened at 12:34,
you should search for the trace at 12:30 instead.
Searching between 12:33 and 12:35 will yield **no search results**.

Each trace lasts for exactly 30 minutes, so the max/min duration fields are unsupported.
Kelemetry merges and truncates traces based on the time range given in the user input.
Only spans and events within this range are displayed.
Some display modes further truncate the time range to the duration from the earliest to the latest event,
so refer to the "Trace start" timestamp indicated in the trace view page.

## Trace view

Expand All @@ -54,7 +52,7 @@ If the current half-hour is still in progress, reload the page to load the new d

For the recommended `tracing` display mode,

- The timestamps are relative to the trace start, which is either `:00` or `:30` of an hour.
- The timestamps are relative to the trace start.
- Click on the arrow button on the left to collapse/expand a span.
- Click on the empty space on a span row to reveal details of the span.
- Hover cursor over a black vertical line on the span to reveal the events.
Expand Down
Loading

0 comments on commit 4fca547

Please sign in to comment.