diff --git a/go.mod b/go.mod index 1e534dda47..fc4a55342d 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.21 require ( github.com/bwplotka/mimic v0.2.1-0.20230303101552-f705cca2f4a4 + github.com/google/go-jsonnet v0.20.0 github.com/observatorium/api v0.1.3-0.20230711132510-96e8799ade44 github.com/observatorium/observatorium v0.0.0-00010101000000-000000000000 github.com/openshift/api v3.9.0+incompatible @@ -83,4 +84,4 @@ require ( ) // Delete when https://github.com/observatorium/observatorium/pull/543 is merged to main branch -replace github.com/observatorium/observatorium => github.com/thibaultmg/observatorium v0.0.0-20231204171717-54b9405e5fab +replace github.com/observatorium/observatorium => github.com/thibaultmg/observatorium v0.0.0-20231207162400-60f91aef7aea diff --git a/go.sum b/go.sum index e27d3e3d21..3691e337f2 100644 --- a/go.sum +++ b/go.sum @@ -682,6 +682,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-jsonnet v0.20.0 h1:WG4TTSARuV7bSm4PMB4ohjxe33IHT5WVTrJSU33uT4g= +github.com/google/go-jsonnet v0.20.0/go.mod h1:VbgWF9JX7ztlv770x/TolZNGGFfiHEVx9G6ca2eUmeA= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -1189,6 +1191,8 @@ github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvW github.com/segmentio/kafka-go v0.1.0/go.mod h1:X6itGqS9L4jDletMsxZ7Dz+JFWxM6JHfPOCvTvk+EJo= github.com/segmentio/kafka-go v0.2.0/go.mod h1:X6itGqS9L4jDletMsxZ7Dz+JFWxM6JHfPOCvTvk+EJo= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw= @@ -1253,8 +1257,8 @@ github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I= -github.com/thibaultmg/observatorium v0.0.0-20231204171717-54b9405e5fab h1:OLXEyHPt3e3Sz/w+MOU0uxwIPD7U1XRfHia1xs1TjuM= -github.com/thibaultmg/observatorium v0.0.0-20231204171717-54b9405e5fab/go.mod h1:P+7t9O8AitkuZjUhXC4LHw4iwAzTpIrs0tHz8X3xTvM= +github.com/thibaultmg/observatorium v0.0.0-20231207162400-60f91aef7aea h1:u8cjMxOcGgYIsyh/tGkGkofeqgeSi6/srancJOE0fU8= +github.com/thibaultmg/observatorium v0.0.0-20231207162400-60f91aef7aea/go.mod h1:P+7t9O8AitkuZjUhXC4LHw4iwAzTpIrs0tHz8X3xTvM= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tinylib/msgp v1.0.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5b17f03f4d..c4e2d07d64 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -230,7 +230,7 @@ "subdir": "configuration" } }, - "version": "54ed3ca4ce0d9b6d05abdc44c0a5abcb91045419", + "version": "73cf41804053c961ac776f5b2ee375ef67f23f64", "sum": "QMVHXmQ6fFl5Cg9XZ/5KgUwwEqyHwA9PGUfhxRLWp+E=" }, { diff --git a/resources/services/app-sre-stage-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml b/resources/services/app-sre-stage-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml new file mode 100755 index 0000000000..ad569a71c0 --- /dev/null +++ b/resources/services/app-sre-stage-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml @@ -0,0 +1,590 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-default +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-default + weight: null +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-default"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + prometheus: app-sre + name: observatorium-thanos-ruler-default + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default + serviceName: observatorium-thanos-ruler-default + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: default-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: default-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: default-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: default-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: default-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.5 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-default + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-default + terminationGracePeriodSeconds: 120 + volumes: + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: default + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-frontend-template.yaml b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-frontend-template.yaml index 4d566db786..afe23c52fb 100755 --- a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-frontend-template.yaml +++ b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-frontend-template.yaml @@ -9,18 +9,18 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: - replicas: ${{REPLICAS}} + replicas: 1 selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -29,7 +29,7 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -60,17 +60,17 @@ objects: - --verbose=true image: quay.io/app-sre/memcached:1.5 imagePullPolicy: IfNotPresent - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached ports: - containerPort: 11211 name: client protocol: TCP resources: limits: - memory: ${MEMORY_LIMIT} + memory: 3Gi requests: - cpu: ${CPU_REQUEST} - memory: ${MEMORY_REQUEST} + cpu: 500m + memory: 2Gi terminationMessagePolicy: FallbackToLogsOnError - args: - --memcached.address=localhost:0 @@ -92,19 +92,19 @@ objects: terminationMessagePolicy: FallbackToLogsOnError nodeSelector: kubernetes.io/os: linux - serviceAccountName: observatorium-thanos-query-frontend + serviceAccountName: observatorium-thanos-query-range-cache-memcached terminationGracePeriodSeconds: 120 - apiVersion: v1 kind: Service metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: clusterIP: None @@ -118,7 +118,7 @@ objects: protocol: TCP targetPort: 9150 selector: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -129,25 +129,25 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs - apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" prometheus: app-sre - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: openshift-customer-monitoring spec: endpoints: @@ -164,7 +164,7 @@ objects: - rhobs selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -474,17 +474,17 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: maxUnavailable: 1 selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium diff --git a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-rule-template.yaml b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-rule-template.yaml index 55a1a5df44..ffcc1ef611 100755 --- a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-rule-template.yaml +++ b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-rule-template.yaml @@ -168,6 +168,9 @@ objects: - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-default.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-rhel.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-telemeter.rhobs.svc.cluster.local + - --endpoint=http://observatorium-thanos-ruler-default.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-rhel.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-telemeter.rhobs.svc.cluster.local:10902 - --log.format=logfmt - --log.level=${LOG_LEVEL} - --query.auto-downsampling diff --git a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-template.yaml b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-template.yaml index 35be9bacaf..cd94bc3c9b 100755 --- a/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-template.yaml +++ b/resources/services/app-sre-stage-01/rhobs/observatorium-metrics-query-template.yaml @@ -168,6 +168,9 @@ objects: - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-default.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-rhel.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-telemeter.rhobs.svc.cluster.local + - --endpoint=http://observatorium-thanos-ruler-default.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-rhel.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-telemeter.rhobs.svc.cluster.local:10902 - --log.format=logfmt - --log.level=${LOG_LEVEL} - --query.auto-downsampling diff --git a/resources/services/app-sre-stage-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml b/resources/services/app-sre-stage-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml new file mode 100755 index 0000000000..95952edcd5 --- /dev/null +++ b/resources/services/app-sre-stage-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml @@ -0,0 +1,590 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-rhel +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-rhel + weight: null +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-rhel"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + prometheus: app-sre + name: observatorium-thanos-ruler-rhel + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel + serviceName: observatorium-thanos-ruler-rhel + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: rhelemeter-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: rhelemeter-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.5 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-rhel + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-rhel + terminationGracePeriodSeconds: 120 + volumes: + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: rhel + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml b/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml new file mode 100755 index 0000000000..f66f46428b --- /dev/null +++ b/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml @@ -0,0 +1,728 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-telemeter +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-telemeter + weight: null +- apiVersion: v1 + data: + observatorium.yaml: | + groups: + - interval: 4m + name: telemeter-telemeter.rules + rules: + - expr: | + count by (name,reason) (cluster_operator_conditions{condition="Degraded"} == 1) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: name_reason:cluster_operator_degraded:count + - expr: | + count by (name,reason) (cluster_operator_conditions{condition="Available"} == 0) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: name_reason:cluster_operator_unavailable:count + - expr: | + sort_desc(max by (_id,code) (code:apiserver_request_count:rate:sum{code=~"(4|5)\\d\\d"}) > 0.5) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_code:apiserver_request_error_rate_sum:max + - expr: | + bottomk by (_id) (1, max by (_id, version) (0 * cluster_version{type="failure"}) or max by (_id, version) (1 + 0 * cluster_version{type="current"})) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_version:cluster_available + - expr: | + topk by (_id) (1, max by (_id, managed, ebs_account, internal) (label_replace(label_replace((ocm_subscription{support=~"Standard|Premium|Layered"} * 0 + 1) or ocm_subscription * 0, "internal", "true", "email_domain", "redhat.com|(.*\\.|^)ibm.com"), "managed", "", "managed", "false")) + on(_id) group_left(version) (topk by (_id) (1, id_version*0)) + on(_id) group_left(install_type) (topk by (_id) (1, id_install_type*0)) + on(_id) group_left(host_type) (topk by (_id) (1, id_primary_host_type*0)) + on(_id) group_left(provider) (topk by (_id) (1, id_provider*0))) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_version_ebs_account_internal:cluster_subscribed + - expr: | + 0 * (max by (_id,host_type) (topk by (_id) (1, label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(cluster:virt_platform_nodes:sum, "host_type", "$1", "type", "(aws|ibm_.*|ovirt|none|rhev|gcp|openstack|hyperv|vmware|nutanix)"), "host_type", "virt-unknown", "host_type", ""), "host_type", "kvm-unknown", "type", "kvm"), "host_type", "xen-unknown", "type", "xen.*"), "host_type", "metal", "host_type", "none"), "host_type", "ibm-$1", "host_type", "ibm[_-](power|systemz).*"))) or on(_id) label_replace(max by (_id) (cluster_version{type="current"}), "host_type", "", "host_type", "")) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_primary_host_type + - expr: | + 0 * topk by (_id) (1, group by (_id, provider) (label_replace(cluster_infrastructure_provider, "provider", "$1", "type", "(.*)")) or on(_id) label_replace(group by (_id) (cluster_version{type="current"}), "provider", "unknown", "provider", "")) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_provider + - expr: | + 0 * (max by (_id,version) (topk by (_id) (1, cluster_version{type="current"})) or on(_id) label_replace(max by (_id) (cluster:node_instance_type_count:sum*0), "version", "", "unknown", "")) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_version + - expr: | + ( + count by (_id, install_type) ( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + topk by (_id) (1, cluster_installer), "install_type", "upi", "type", "other" + ), "install_type", "ipi", "type", "openshift-install" + ), "install_type", "hive", "invoker", "hive" + ), "install_type", "assisted-installer", "invoker", "assisted-installer" + ), "install_type", "infrastructure-operator", "invoker", "assisted-installer-operator" + ), "install_type", "agent-installer", "invoker", "agent-installer" + ), "install_type", "hypershift", "invoker", "hypershift" + ) + ) or on(_id) ( + label_replace( + count by (_id) ( + cluster:virt_platform_nodes:sum + ), "install_type", "unknown", "install_type", "" + ) + ) * 0 + ) * 0 + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_install_type + - expr: | + 0 * (max by (_id,cloudpak_type) (topk by (_id) (1, count by (_id,cloudpak_type) (label_replace(subscription_sync_total{installed=~"ibm-((licensing|common-service)-operator).*"}, "cloudpak_type", "unknown", "", ".*"))))) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_cloudpak_type + - expr: | + topk by(_id) (1, + (label_replace(7+0*count by (_id) (cluster:usage:resources:sum{resource="netnamespaces.network.openshift.io"}), "network_type", "OpenshiftSDN", "", "") > 0) or + (label_replace(6+0*count by (_id) (cluster:usage:resources:sum{resource="clusterinformations.crd.projectcalico.org"}), "network_type", "Calico", "", "") > 0) or + (label_replace(5+0*count by (_id) (cluster:usage:resources:sum{resource="acicontainersoperators.aci.ctrl"}), "network_type", "ACI", "", "") > 0) or + (label_replace(4+0*count by (_id) (cluster:usage:resources:sum{resource="kuryrnetworks.openstack.org"}), "network_type", "Kuryr", "", "") > 0) or + (label_replace(3+0*count by (_id) (cluster:usage:resources:sum{resource="ciliumendpoints.cilium.io"}), "network_type", "Cilium", "", "") > 0) or + (label_replace(2+0*count by (_id) (cluster:usage:resources:sum{resource="ncpconfigs.nsx.vmware.com"}), "network_type", "VMWareNSX", "", "") > 0) or + (label_replace(1+0*count by (_id) (cluster:usage:resources:sum{resource="egressips.k8s.ovn.org"}), "network_type", "OVNKube", "", "")) or + (label_replace(0+0*max by (_id) (cluster:node_instance_type_count:sum*0), "network_type", "unknown", "", "")) + ) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: id_network_type + - expr: | + 0 * topk by (ebs_account) (1, max by (ebs_account,account_type,internal,email_domain) (label_replace(label_replace(label_replace(ocm_subscription{email_domain="redhat.com"}*0+5, "class", "Internal", "class", ".*") or label_replace(ocm_subscription{class!="Customer",email_domain=~"(.*\\.|^)ibm.com"}*0+4, "class", "Internal", "class", ".*") or (ocm_subscription{class="Customer"}*0+3) or (ocm_subscription{class="Partner"}*0+2) or (ocm_subscription{class="Evaluation"}*0+1) or label_replace(ocm_subscription{class!~"Evaluation|Customer|Partner"}*0+0, "class", "", "class", ".*"), "account_type", "$1", "class", "(.+)"), "internal", "true", "email_domain", "redhat.com|(.*\\.|^)ibm.com") )) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: ebs_account_account_type_email_domain_internal + - expr: | + topk(500, sum (acm_managed_cluster_info) by (managed_cluster_id, cloud, created_via, endpoint, instance, job, namespace, pod, service, vendor, version)) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: acm_top500_mcs:acm_managed_cluster_info + - expr: | + max by(_id) (sum_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m]) / scalar(count_over_time(vector(1)[1h:5m]))) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: cluster:usage:workload:capacity_physical_cpu_hours + - expr: | + max by(_id) (count_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(count_over_time(vector(1)[1h:5m])) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: cluster:usage:workload:capacity_physical_instance_hours + - expr: | + sum(sum_over_time(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io = ''}[1h:5m])) by (_id) / scalar(count_over_time(vector(1)[1h:5m])) + labels: + tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 + record: cluster:usage:workload:capacity_virtual_cpu_hours + kind: ConfigMap + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: observatorium-rules + namespace: rhobs +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-telemeter"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + prometheus: app-sre + name: observatorium-thanos-ruler-telemeter + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter + serviceName: observatorium-thanos-ruler-telemeter + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - --rule-file=/etc/thanos/rules/telemeter-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: telemeter-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: telemeter-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: telemeter-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: telemeter-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: telemeter-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.5 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - mountPath: /etc/thanos/rules/telemeter-rules + name: observatorium-rules + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-telemeter + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-telemeter + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: observatorium-rules + name: observatorium-rules + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.5 + observatorium/tenant: telemeter + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/resources/services/telemeter-prod-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml b/resources/services/telemeter-prod-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml new file mode 100755 index 0000000000..eca254cd2b --- /dev/null +++ b/resources/services/telemeter-prod-01/rhobs/default/observatorium-metrics-ruler-default-template.yaml @@ -0,0 +1,590 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-default +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-default + weight: null +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-default"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + prometheus: app-sre + name: observatorium-thanos-ruler-default + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + name: observatorium-thanos-ruler-default + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: default + serviceName: observatorium-thanos-ruler-default + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: default-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: default-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: default-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: default-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: default-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.4 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-default + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-default + terminationGracePeriodSeconds: 120 + volumes: + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: default + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-frontend-template.yaml b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-frontend-template.yaml index ed51cca806..5ae33b2ff7 100755 --- a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-frontend-template.yaml +++ b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-frontend-template.yaml @@ -9,18 +9,18 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: - replicas: ${{REPLICAS}} + replicas: 1 selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -29,7 +29,7 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -60,17 +60,17 @@ objects: - --verbose=true image: quay.io/app-sre/memcached:1.5 imagePullPolicy: IfNotPresent - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached ports: - containerPort: 11211 name: client protocol: TCP resources: limits: - memory: ${MEMORY_LIMIT} + memory: 3Gi requests: - cpu: ${CPU_REQUEST} - memory: ${MEMORY_REQUEST} + cpu: 500m + memory: 2Gi terminationMessagePolicy: FallbackToLogsOnError - args: - --memcached.address=localhost:0 @@ -92,19 +92,19 @@ objects: terminationMessagePolicy: FallbackToLogsOnError nodeSelector: kubernetes.io/os: linux - serviceAccountName: observatorium-thanos-query-frontend + serviceAccountName: observatorium-thanos-query-range-cache-memcached terminationGracePeriodSeconds: 120 - apiVersion: v1 kind: Service metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: clusterIP: None @@ -118,7 +118,7 @@ objects: protocol: TCP targetPort: 9150 selector: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -129,25 +129,25 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs - apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium app.kubernetes.io/version: "1.5" prometheus: app-sre - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: openshift-customer-monitoring spec: endpoints: @@ -164,7 +164,7 @@ objects: - rhobs selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium @@ -474,17 +474,17 @@ objects: metadata: creationTimestamp: null labels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium - name: observatorium-thanos-query-frontend + name: observatorium-thanos-query-range-cache-memcached namespace: rhobs spec: maxUnavailable: 1 selector: matchLabels: - app.kubernetes.io/component: memcached + app.kubernetes.io/component: query-range-cache app.kubernetes.io/instance: observatorium app.kubernetes.io/name: memcached app.kubernetes.io/part-of: observatorium diff --git a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-rule-template.yaml b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-rule-template.yaml index 2b5edd9820..76e50596ac 100755 --- a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-rule-template.yaml +++ b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-rule-template.yaml @@ -168,6 +168,9 @@ objects: - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-default.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-rhel.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-telemeter.rhobs.svc.cluster.local + - --endpoint=http://observatorium-thanos-ruler-default.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-rhel.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-telemeter.rhobs.svc.cluster.local:10902 - --log.format=logfmt - --log.level=${LOG_LEVEL} - --query.auto-downsampling diff --git a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-template.yaml b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-template.yaml index 528212d27a..e8d35b88d6 100755 --- a/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-template.yaml +++ b/resources/services/telemeter-prod-01/rhobs/observatorium-metrics-query-template.yaml @@ -168,6 +168,9 @@ objects: - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-default.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-rhel.rhobs.svc.cluster.local - --endpoint=dnssrv+_grpc._tcp.observatorium-thanos-store-telemeter.rhobs.svc.cluster.local + - --endpoint=http://observatorium-thanos-ruler-default.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-rhel.rhobs.svc.cluster.local:10902 + - --endpoint=http://observatorium-thanos-ruler-telemeter.rhobs.svc.cluster.local:10902 - --log.format=logfmt - --log.level=${LOG_LEVEL} - --query.auto-downsampling diff --git a/resources/services/telemeter-prod-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml b/resources/services/telemeter-prod-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml new file mode 100755 index 0000000000..edee20bd02 --- /dev/null +++ b/resources/services/telemeter-prod-01/rhobs/rhel/observatorium-metrics-ruler-rhel-template.yaml @@ -0,0 +1,590 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-rhel +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-rhel + weight: null +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-rhel"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + prometheus: app-sre + name: observatorium-thanos-ruler-rhel + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + name: observatorium-thanos-ruler-rhel + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: rhel + serviceName: observatorium-thanos-ruler-rhel + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: rhelemeter-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: rhelemeter-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: rhelemeter-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.4 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-rhel + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-rhel + terminationGracePeriodSeconds: 120 + volumes: + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: rhel + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/resources/services/telemeter-prod-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml b/resources/services/telemeter-prod-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml new file mode 100755 index 0000000000..fef0659a5e --- /dev/null +++ b/resources/services/telemeter-prod-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml @@ -0,0 +1,590 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + creationTimestamp: null + name: observatorium-thanos-ruler-telemeter +objects: +- apiVersion: route.openshift.io/v1 + kind: Route + metadata: + annotations: + cert-manager.io/issuer-kind: ClusterIssuer + cert-manager.io/issuer-name: letsencrypt-prod-http + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + host: "" + port: + targetPort: https + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: observatorium-thanos-ruler-telemeter + weight: null +- apiVersion: v1 + kind: Service + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: ruler-tls + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: 10902 + - name: grpc + port: 10901 + protocol: TCP + targetPort: 10901 + - name: internal + port: 8083 + protocol: TCP + targetPort: 8083 + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter +- apiVersion: v1 + imagePullSecrets: + - name: quay.io + kind: ServiceAccount + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.application: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"observatorium-thanos-ruler-telemeter"}}' + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + prometheus: app-sre + name: observatorium-thanos-ruler-telemeter + namespace: openshift-customer-monitoring + spec: + endpoints: + - port: http + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: + matchNames: + - rhobs + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter +- apiVersion: apps/v1 + kind: StatefulSet + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + name: observatorium-thanos-ruler-telemeter + namespace: rhobs + spec: + replicas: ${{REPLICAS}} + selector: + matchLabels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + observatorium/tenant: telemeter + serviceName: observatorium-thanos-ruler-telemeter + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + namespace: rhobs + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - thanos-rule + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - rule + - --alert.label-drop=rule_replica + - --data-dir=/var/thanos/ruler + - --label=rule_replica="$(NAME)" + - --log.format=logfmt + - --log.level=${LOG_LEVEL} + - --objstore.config=$(OBJSTORE_CONFIG) + - --query=http://observatorium-thanos-query-rule.rhobs.svc.cluster.local:10902 + - --rule-file=/etc/thanos/rules/synced-rules/observatorium.yaml + - | + --tracing.config=type: JAEGER + config: + service_name: thanos-rule + sampler_type: ratelimiting + sampler_param: 2 + - --tsdb.retention=2d + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: aws_access_key_id + name: telemeter-tenant-s3 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: aws_secret_access_key + name: telemeter-tenant-s3 + - name: OBJ_STORE_BUCKET + valueFrom: + secretKeyRef: + key: bucket + name: telemeter-tenant-s3 + - name: OBJ_STORE_REGION + valueFrom: + secretKeyRef: + key: aws_region + name: telemeter-tenant-s3 + - name: OBJ_STORE_ENDPOINT + valueFrom: + secretKeyRef: + key: endpoint + name: telemeter-tenant-s3 + - name: OBJSTORE_CONFIG + value: | + type: S3 + config: + bucket: $(OBJ_STORE_BUCKET) + endpoint: $(OBJ_STORE_ENDPOINT) + region: $(OBJ_STORE_REGION) + image: quay.io/thanos/thanos:v0.32.4 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 8 + httpGet: + path: /-/healthy + port: 10902 + periodSeconds: 30 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 10902 + name: http + protocol: TCP + - containerPort: 10901 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 20 + httpGet: + path: /-/ready + port: 10902 + periodSeconds: 5 + resources: + limits: + memory: ${MEMORY_LIMIT} + requests: + cpu: ${CPU_REQUEST} + memory: ${MEMORY_REQUEST} + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /var/thanos/ruler + name: data + - mountPath: /etc/thanos/rules/synced-rules + name: rule-syncer + readOnly: true + - args: + - -file=/etc/thanos-rule-syncer/observatorium.yaml + - -interval=60 + - -rules-backend-url=http://observatorium-rules-objstore.rhobs.svc.cluster.local:10902 + - -thanos-rule-url=127.0.0.1:10902 + image: quay.io/observatorium/thanos-rule-syncer:main-2022-09-14-338f9ec + name: observatorium-rules-syncer + ports: + - containerPort: 8083 + name: internal + protocol: TCP + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos-rule-syncer + name: rule-syncer + - args: + - -volume-dir=/etc/thanos-rule-syncer + - -webhook-url=http://localhost:10902/-/reload + image: 'quay.io/openshift/origin-configmap-reloader:4.5.0:' + name: configmap-reloader + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/thanos/rules/observatorium-rules + name: observatorium-rules + - args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:10902 + - -openshift-service-account=observatorium-thanos-ruler-telemeter + - '-openshift-sar={"resource": "namespaces", "verb": "get", "name": "rhobs", + "namespace": "rhobs"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", + "name": "rhobs", "namespace": "rhobs"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret=${OAUTH_PROXY_COOKIE_SECRET} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + image: quay.io/openshift/origin-oauth-proxy:4.15 + name: oauth-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/tls/private + name: tls + readOnly: true + - args: + - --reporter.grpc.host-port=dns:///otel-trace-writer-collector-headless.observatorium-tools.svc:14250 + - --reporter.type=grpc + - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD + valueFrom: + fieldRef: + fieldPath: metadata.name + image: quay.io/app-sre/jaegertracing-jaeger-agent:1.22.0 + livenessProbe: + failureThreshold: 5 + httpGet: + path: / + port: 14271 + name: jaeger-agent + ports: + - containerPort: 5778 + name: configs + protocol: TCP + - containerPort: 6831 + name: jaeger-thrift + protocol: TCP + - containerPort: 14271 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 14271 + initialDelaySeconds: 1 + resources: + limits: + cpu: 128m + memory: 128Mi + requests: + cpu: 32m + memory: 64Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-thanos-ruler-telemeter + terminationGracePeriodSeconds: 120 + volumes: + - emptyDir: {} + name: rule-syncer + - name: tls + secret: + secretName: ruler-tls + updateStrategy: {} + volumeClaimTemplates: + - metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rule-evaluation-engine + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: thanos-rule + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: v0.32.4 + observatorium/tenant: telemeter + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: "" +- apiVersion: apps/v1 + kind: Deployment + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + namespace: observatorium + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/instance + operator: In + values: + - observatorium + - key: app.kubernetes.io/name + operator: In + values: + - rules-objstore + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -log.format=logfmt + - -log.level=warn + image: 'quay.io/observatorium/rules-objstore:' + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 10 + httpGet: + path: /live + port: 8081 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 1 + name: thanos + ports: + - containerPort: 8081 + name: internal + protocol: TCP + - containerPort: 8080 + name: public + protocol: TCP + readinessProbe: + failureThreshold: 12 + httpGet: + path: /ready + port: 8081 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi + terminationMessagePolicy: FallbackToLogsOnError + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: observatorium-rules-objstore + terminationGracePeriodSeconds: 120 +- apiVersion: v1 + kind: Service + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + ports: + - name: internal + port: 8081 + protocol: TCP + targetPort: 8081 + - name: public + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +- apiVersion: v1 + kind: ServiceAccount + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium +- apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium + app.kubernetes.io/version: "" + name: observatorium-rules-objstore + namespace: observatorium + spec: + endpoints: + - port: internal + relabelings: + - action: replace + separator: / + sourceLabels: + - namespace + - pod + targetLabel: instance + namespaceSelector: {} + selector: + matchLabels: + app.kubernetes.io/component: rules-storage + app.kubernetes.io/instance: observatorium + app.kubernetes.io/name: rules-objstore + app.kubernetes.io/part-of: observatorium +parameters: +- name: LOG_LEVEL + value: warn +- name: REPLICAS + value: "1" +- name: CPU_REQUEST + value: 100m +- name: MEMORY_LIMIT + value: 1Gi +- name: MEMORY_REQUEST + value: 256Mi +- from: '[a-zA-Z0-9]{40}' + generate: expression + name: OAUTH_PROXY_COOKIE_SECRET diff --git a/services_go/instances/rhobs/rhobs.go b/services_go/instances/rhobs/rhobs.go index 26b0a5b1cb..2b47edb8cb 100644 --- a/services_go/instances/rhobs/rhobs.go +++ b/services_go/instances/rhobs/rhobs.go @@ -1,11 +1,16 @@ package rhobs import ( + "encoding/json" + "fmt" "sort" + "github.com/google/go-jsonnet" "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/receive" + "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/ruler" "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/store" "github.com/rhobs/configuration/services_go/observatorium" + "gopkg.in/yaml.v3" ) const ( @@ -123,6 +128,16 @@ func stageConfig() observatorium.Observatorium { StorePreManifestsHook: func(store *store.StoreStatefulSet) { store.VolumeSize = "5Gi" }, + RulerPreManifestsHook: func(rulerSs *ruler.RulerStatefulSet) { + rulerSs.ConfigMaps["observatorium-rules"] = map[string]string{ + "observatorium.yaml": getTelemeterRules(), + } + rulerSs.Options.RuleFile = append(rulerSs.Options.RuleFile, ruler.RuleFileOption{ + FileName: "observatorium.yaml", + ConfigMapName: "observatorium-rules", + ParentDir: "telemeter-rules", + }) + }, }, }, }, @@ -225,3 +240,48 @@ func buildTenants(tenants map[string]observatorium.Tenants, instance InstanceNam return ret } + +func getTelemeterRules() string { + vm := jsonnet.MakeVM() + vm.Importer(&jsonnet.FileImporter{ + JPaths: []string{"./vendor_jsonnet"}, + }) + + snippet := fmt.Sprintf(` + local telemeterRules = (import 'github.com/openshift/telemeter/jsonnet/telemeter/rules.libsonnet'); + { + groups: std.map(function(group) { + name: 'telemeter-' + group.name, + interval: group.interval, + rules: std.map(function(rule) rule { + labels+: { + tenant_id: '%s', + }, + }, group.rules), + }, telemeterRules.prometheus.recordingrules.groups), + }`, tenantsMapping[TelemeterInstanceName]["telemeter"]) + + // Evaluate the Jsonnet content + jsonStr, err := vm.EvaluateAnonymousSnippet("telemeter-rules", snippet) + if err != nil { + panic(fmt.Sprintf("Failed to evaluate Jsonnet content: %v\n", err)) + } + + return jsonToYaml(jsonStr) +} + +func jsonToYaml(jsonStr string) string { + // Unmarshal the jsonStr into a map + var data map[string]interface{} + if err := json.Unmarshal([]byte(jsonStr), &data); err != nil { + panic(fmt.Sprintf("Failed to unmarshal Jsonnet content: %v\n", err)) + } + + // Marshal the map into YAML + yamlBytes, err := yaml.Marshal(data) + if err != nil { + panic(fmt.Sprintf("Failed to marshal Jsonnet content: %v\n", err)) + } + + return string(yamlBytes) +} diff --git a/services_go/observatorium/cache.go b/services_go/observatorium/cache.go index 05c29f9179..d5a79d72c0 100644 --- a/services_go/observatorium/cache.go +++ b/services_go/observatorium/cache.go @@ -18,8 +18,6 @@ func makeMemcached(name, namespace string, preManifestHook func(*memcached.Memca // K8s config memcachedDeployment := memcached.NewMemcachedStatefulSet() memcachedDeployment.Name = name - // memcachedDeployment.CommonLabels[observatoriumInstanceLabel] = instanceName - // memcachedDeployment.CommonLabels[k8sutil.ComponentLabel] = component memcachedDeployment.Image = "quay.io/app-sre/memcached" memcachedDeployment.ImageTag = "1.5" memcachedDeployment.Namespace = namespace diff --git a/services_go/observatorium/metrics.go b/services_go/observatorium/metrics.go index 63a5f9c526..173931eb75 100644 --- a/services_go/observatorium/metrics.go +++ b/services_go/observatorium/metrics.go @@ -4,6 +4,7 @@ import ( _ "embed" "fmt" "maps" + "net" "sort" "time" @@ -14,6 +15,7 @@ import ( "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/query" "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/queryfrontend" "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/receive" + "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/ruler" "github.com/observatorium/observatorium/configuration_go/abstr/kubernetes/thanos/store" "github.com/observatorium/observatorium/configuration_go/k8sutil" "github.com/observatorium/observatorium/configuration_go/openshift" @@ -48,6 +50,7 @@ const ( ingestorControllerLabel = "controller.receive.thanos.io" ingestorControllerLabelValue = "thanos-receive-controller" ingestorControllerLabelHashring = ingestorControllerLabel + "/hashring" + queryRuleName = "observatorium-thanos-query-rule" ) //go:embed assets/store-auto-shard-relabel-configMap.sh @@ -68,6 +71,7 @@ type ObservatoriumMetrics struct { QueryFrontendPreManifestsHook func(*queryfrontend.QueryFrontendDeployment) QueryFrontendCachePreManifestsHook func(*memcached.MemcachedDeployment) storesRegister []string + queryRuleURL string queryAdhocURL string } @@ -83,6 +87,7 @@ type ObservatoriumMetricsInstance struct { BucketCachePreManifestsHook func(*memcached.MemcachedDeployment) CompactorPreManifestsHook func(*compactor.CompactorStatefulSet) ReceiveIngestorPreManifestsHook func(*receive.Ingestor) + RulerPreManifestsHook func(*ruler.RulerStatefulSet) } // Tenants contains the configuration for a tenant in a metrics instance. @@ -106,14 +111,173 @@ func (o *ObservatoriumMetrics) Manifests(generator *mimic.Generator) { gen.Add(makeFileName("receive-ingestor", instanceCfg.InstanceName), withStatusRemove(o.makeTenantReceiveIngestor(instanceCfg))) gen.Add(makeFileName("compact", instanceCfg.InstanceName), withStatusRemove(o.makeCompactor(instanceCfg))) gen.Add(makeFileName("store", instanceCfg.InstanceName), withStatusRemove(o.makeStore(instanceCfg))) + gen.Add(makeFileName("ruler", instanceCfg.InstanceName), withStatusRemove(o.makeRuler(instanceCfg))) } + // Order matters here, each component registers itself in the storesRegister slice or the queryRuleURL variable generator.Add("observatorium-metrics-receive-router-template.yaml", withStatusRemove(o.makeReceiveRouter())) generator.Add("observatorium-metrics-query-rule-template.yaml", withStatusRemove(o.makeQueryConfig(true, o.QueryRulePreManifestsHook))) generator.Add("observatorium-metrics-query-template.yaml", withStatusRemove(o.makeQueryConfig(false, o.QueryAdhocPreManifestsHook))) generator.Add("observatorium-metrics-query-frontend-template.yaml", withStatusRemove(o.makeQueryFrontend())) } +func (o *ObservatoriumMetrics) makeRuler(instanceCfg *ObservatoriumMetricsInstance) encoding.Encoder { + rulerStatefulset := ruler.NewRuler() + + // K8s config + rulerStatefulset.Name = fmt.Sprintf("%s-%s", rulerStatefulset.Name, instanceCfg.InstanceName) + rulerStatefulset.CommonLabels[observatoriumInstanceLabel] = instanceCfg.InstanceName + rulerStatefulset.Image = thanosImage + rulerStatefulset.ImageTag = o.ThanosImageTag + rulerStatefulset.Namespace = o.Namespace + rulerStatefulset.Replicas = 1 + delete(rulerStatefulset.PodResources.Limits, corev1.ResourceCPU) + rulerStatefulset.PodResources.Requests[corev1.ResourceCPU] = resource.MustParse("100m") + rulerStatefulset.PodResources.Requests[corev1.ResourceMemory] = resource.MustParse("256Mi") + rulerStatefulset.PodResources.Limits[corev1.ResourceMemory] = resource.MustParse("1Gi") + tlsSecret := "ruler-tls" + rulesObjstoreName := "observatorium-rules-objstore" + rulesSyncer := ruler.NewRulesSyncerContainer(&ruler.RulesSyncerOptions{ + File: "/etc/thanos-rule-syncer/observatorium.yaml", + Interval: 60, + RulesBackendUrl: fmt.Sprintf("http://%s.%s.svc.cluster.local:10902", rulesObjstoreName, o.Namespace), + ThanosRuleUrl: &net.TCPAddr{ + IP: net.ParseIP("127.0.0.1"), + Port: 10902, + }, + }) + rulesSyncer.Image = "quay.io/observatorium/thanos-rule-syncer" + rulesSyncer.ImageTag = "main-2022-09-14-338f9ec" + + rulerStatefulset.Options.RuleFile = append(rulerStatefulset.Options.RuleFile, ruler.RuleFileOption{ + FileName: "observatorium.yaml", + VolumeName: "rule-syncer", + ParentDir: "synced-rules", + }) + rulerStatefulset.Sidecars = []k8sutil.ContainerProvider{ + rulesSyncer, + &k8sutil.Container{ + Name: "configmap-reloader", + Image: "quay.io/openshift/origin-configmap-reloader:4.5.0", + Args: []string{ + "-volume-dir=/etc/thanos-rule-syncer", + "-webhook-url=http://localhost:10902/-/reload", + }, + Resources: k8sutil.NewResourcesRequirements("100m", "200m", "100Mi", "200Mi"), + VolumeMounts: []corev1.VolumeMount{ + { + Name: "observatorium-rules", + MountPath: "/etc/thanos/rules/observatorium-rules", + }, + }, + }, + makeOauthProxy(10902, o.Namespace, rulerStatefulset.Name, tlsSecret), + makeJaegerAgent("observatorium-tools"), + } + rulerStatefulset.Env = append(rulerStatefulset.Env, objStoreEnvVars(instanceCfg.ObjStoreSecret)...) + + // Ruler config + rulerStatefulset.Options.LogLevel = log.LogLevelWarn + rulerStatefulset.Options.LogFormat = log.LogFormatLogfmt + rulerStatefulset.Options.Label = []ruler.Label{ + {Key: "rule_replica", Value: "\"$(NAME)\""}, + } + rulerStatefulset.Options.TracingConfig = &trclient.TracingConfig{ + Type: trclient.Jaeger, + Config: jaeger.Config{ + SamplerParam: 2, + SamplerType: jaeger.SamplerTypeRateLimiting, + ServiceName: rulerStatefulset.CommonLabels[k8sutil.NameLabel], + }, + } + rulerStatefulset.Options.AlertLabelDrop = []string{"rule_replica"} + rulerStatefulset.Options.TsdbRetention = model.Duration(2 * 24 * time.Hour) + rulerStatefulset.Options.Query = []string{ + fmt.Sprintf("http://%s.%s.svc.cluster.local:10902", queryRuleName, o.Namespace), // hardcoded + } + + // --alertmanagers.url=dnssrv+http://observatorium-alertmanager-peers.observatorium-mst-stage.svc.cluster.local:9093 + + // Register the store api + o.storesRegister = append(o.storesRegister, fmt.Sprintf("http://%s.%s.svc.cluster.local:10902", rulerStatefulset.Name, rulerStatefulset.Namespace)) + + // Execute preManifestsHook + if instanceCfg.RulerPreManifestsHook != nil { + instanceCfg.RulerPreManifestsHook(rulerStatefulset) + } + + // Post process + manifests := rulerStatefulset.Manifests() + postProcessServiceMonitor(getObject[*monv1.ServiceMonitor](manifests), rulerStatefulset.Namespace) + addQuayPullSecret(getObject[*corev1.ServiceAccount](manifests)) + service := getObject[*corev1.Service](manifests) + service.ObjectMeta.Annotations[servingCertSecretNameAnnotation] = tlsSecret + // Add annotations for openshift oauth so that the route to access the query ui works + serviceAccount := getObject[*corev1.ServiceAccount](manifests) + if serviceAccount.Annotations == nil { + serviceAccount.Annotations = map[string]string{} + } + serviceAccount.Annotations["serviceaccounts.openshift.io/oauth-redirectreference.application"] = fmt.Sprintf(`{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"%s"}}`, rulerStatefulset.Name) + + // Add route for oauth-proxy + manifests["oauth-proxy-route"] = &routev1.Route{ + TypeMeta: metav1.TypeMeta{ + Kind: "Route", + APIVersion: routev1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: rulerStatefulset.Name, + Namespace: o.Namespace, + Labels: maps.Clone(getObject[*appsv1.StatefulSet](manifests).ObjectMeta.Labels), + Annotations: map[string]string{ + "cert-manager.io/issuer-kind": "ClusterIssuer", + "cert-manager.io/issuer-name": "letsencrypt-prod-http", + }, + }, + Spec: routev1.RouteSpec{ + Port: &routev1.RoutePort{ + TargetPort: intstr.FromString("https"), + }, + TLS: &routev1.TLSConfig{ + Termination: routev1.TLSTerminationReencrypt, + InsecureEdgeTerminationPolicy: routev1.InsecureEdgeTerminationPolicyRedirect, + }, + To: routev1.RouteTargetReference{ + Kind: "Service", + Name: rulerStatefulset.Name, + }, + }, + } + + // Add rules objstore + rulesObjstore := ruler.NewRulesObjstore() + rulesObjstore.Name = rulesObjstoreName + for key, val := range rulesObjstore.Manifests() { + manifests[key] = val + } + + // Wrap in template, add parameters + defaultParams := defaultTemplateParams(defaultTemplateParamsConfig{ + LogLevel: string(rulerStatefulset.Options.LogLevel), + Replicas: rulerStatefulset.Replicas, + CPURequest: rulerStatefulset.PodResources.Requests[corev1.ResourceCPU], + MemoryLimit: rulerStatefulset.PodResources.Limits[corev1.ResourceMemory], + MemoryRequest: rulerStatefulset.PodResources.Requests[corev1.ResourceMemory], + }) + template := openshift.WrapInTemplate("", manifests, metav1.ObjectMeta{ + Name: rulerStatefulset.Name, + }, append(defaultParams, []templatev1.Parameter{ + { + Name: "OAUTH_PROXY_COOKIE_SECRET", + Generate: "expression", + From: "[a-zA-Z0-9]{40}", + }, + }...)) + + // Adding a special encoder wrapper to replace the templated values in the template with their corresponding template parameter. + return NewDefaultTemplateYAML(encoding.GhodssYAML(template[""]), rulerStatefulset.Name) +} + func (o *ObservatoriumMetrics) makeQueryFrontend() encoding.Encoder { queryFrontend := queryfrontend.NewQueryFrontend() @@ -217,7 +381,14 @@ func (o *ObservatoriumMetrics) makeQueryFrontend() encoding.Encoder { } // Add cache - res := makeMemcached(queryFrontend.Name, o.Namespace, o.QueryFrontendCachePreManifestsHook) + rangeCache := "observatorium-thanos-query-range-cache-memcached" + cachePreManHook := func(memdep *memcached.MemcachedDeployment) { + memdep.CommonLabels[k8sutil.ComponentLabel] = "query-range-cache" + if o.QueryFrontendCachePreManifestsHook != nil { + o.QueryFrontendCachePreManifestsHook(memdep) + } + } + res := makeMemcached(rangeCache, o.Namespace, cachePreManHook) for k, v := range res { manifests[k] = v } @@ -249,7 +420,7 @@ func (o *ObservatoriumMetrics) makeQueryConfig(isRuleQuery bool, preManifestHook // K8s config if isRuleQuery { - queryDplt.Name = queryDplt.Name + "-rule" + queryDplt.Name = queryRuleName queryDplt.CommonLabels[k8sutil.NameLabel] = queryDplt.CommonLabels[k8sutil.NameLabel] + "-rule" // Regenerate the affinity to update the name selector queryDplt.Affinity = k8sutil.NewAntiAffinity(nil, map[string]string{ @@ -300,8 +471,11 @@ func (o *ObservatoriumMetrics) makeQueryConfig(isRuleQuery bool, preManifestHook queryDplt.Options.QueryTelemetryRequestDurationSecondsQuantiles = []float64{0.1, 0.25, 0.75, 1.25, 1.75, 2.5, 3, 5, 10, 15, 30, 60, 120} } - if !isRuleQuery { - o.queryAdhocURL = fmt.Sprintf("http://%s.%s.svc.cluster.local:10902", queryDplt.Name, queryDplt.Namespace) + ruleUrl := fmt.Sprintf("http://%s.%s.svc.cluster.local:10902", queryDplt.Name, queryDplt.Namespace) + if isRuleQuery { + o.queryRuleURL = ruleUrl + } else { + o.queryAdhocURL = ruleUrl } // Execute preManifestsHook