Skip to content

Commit

Permalink
Switch to opentelemetry sdk for metrics (#170)
Browse files Browse the repository at this point in the history
* Switch to opentelemetry sdk for metrics
* Bump version to 0.5.4
  • Loading branch information
hardbyte authored Jun 13, 2024
1 parent 09423f6 commit d990415
Show file tree
Hide file tree
Showing 6 changed files with 608 additions and 203 deletions.
4 changes: 2 additions & 2 deletions operator/charts/netchecks/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ icon: https://raw.githubusercontent.com/hardbyte/netchecks/main/.github/logo.png
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.16
version: 0.1.17

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.5.3"
appVersion: "0.5.4"
sources:
- https://github.com/hardbyte/netchecks
maintainers:
Expand Down
74 changes: 54 additions & 20 deletions operator/netchecks_operator/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import datetime
import json
import random
import time
from collections import defaultdict
from contextlib import contextmanager
from json import JSONDecodeError
from time import sleep
from typing import List

from opentelemetry import metrics
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.metrics import Counter, Histogram
from opentelemetry.sdk.metrics import MeterProvider
import prometheus_client as prometheus

from kubernetes.client import (
Expand All @@ -18,6 +24,7 @@
V1VolumeMount,
V1SecretVolumeSource,
)
from opentelemetry.sdk.resources import Attributes
from structlog import get_logger
from rich import print
import kopf
Expand All @@ -26,49 +33,74 @@
from netchecks_operator.config import Config
from importlib import metadata


try:
NETCHECK_OPERATOR_VERSION = metadata.version("netcheck-operator")
except metadata.PackageNotFoundError:
NETCHECK_OPERATOR_VERSION = "unknown"


logger = get_logger()
settings = Config()
logger = get_logger()


logger.debug("Starting operator", config=settings.json())
if settings.metrics.enabled:
prometheus.start_http_server(settings.metrics.port)
logger.debug("Starting metrics", metrics_port=settings.metrics.port)
prometheus.start_http_server(port=settings.metrics.port)


API_GROUP_NAME = "netchecks.io"

# define Prometheus metrics
ASSERTION_COUNT = prometheus.Counter("netchecks_assertions", "Number of network assertions", ["name"])
ASSERTION_REQUEST_TIME = prometheus.Summary(
"netchecks_operator_assertion_processing_seconds",
"Time spent processing network assertions by the netchecks operator",
["name", "method"],
# Initialize metrics

metrics.set_meter_provider(MeterProvider(metric_readers=[PrometheusMetricReader()]))
meter = metrics.get_meter("netchecks-operator", version=NETCHECK_OPERATOR_VERSION)


# define metrics

ASSERTION_COUNT = meter.create_counter("netchecks_assertions", description="Number of network assertions")

ASSERTION_REQUEST_TIME = meter.create_histogram(
"netchecks_assertion_processing_duration",
unit="s",
description="Time spent processing network assertions by the netchecks operator",
)
ASSERTION_RESULT_TIME = prometheus.Summary(
ASSERTION_RESULT_TIME = meter.create_histogram(
"netchecks_operator_assertion_results_processing_seconds",
"s",
"Time spent processing network assertion results by the netchecks operator",
)

ASSERTION_TEST_TIME = prometheus.Summary(
ASSERTION_TEST_TIME = meter.create_histogram(
"netchecks_probe_processing_seconds",
"Time spent testing network assertions by netchecks probe",
["name", "type"],
unit="s",
description="Time spent testing network assertions by netchecks probe",
)


@contextmanager
def metered_duration(instrument: Counter | Histogram, attributes: Attributes | None = None):
start_time = time.time()
try:
yield
finally:
duration = time.time() - start_time
if isinstance(instrument, Counter):
instrument.add(amount=duration, attributes=attributes)
elif isinstance(instrument, Histogram):
instrument.record(duration, attributes=attributes)


@kopf.on.resume("networkassertions.v1.netchecks.io")
@kopf.on.create("networkassertions.v1.netchecks.io")
def creation(body, spec, name, namespace, **kwargs):
with ASSERTION_REQUEST_TIME.labels(name, "create").time():
with metered_duration(ASSERTION_REQUEST_TIME, {"name": name, "method": "create"}):
logger = get_logger(name=name, namespace=namespace)
batch_v1 = client.BatchV1Api()
logger.info("NetworkAssertion on-create/on-resume handler called")
ASSERTION_COUNT.labels(name).inc()
ASSERTION_COUNT.add(1, {"name": name})

logger.debug("Requested NetworkAssertion body", body=body)
logger.info("Requested NetworkAssertion spec", spec=spec)
Expand Down Expand Up @@ -319,20 +351,17 @@ def delete(name, namespace, **kwargs):
logger.info("networkassertion delete handler called")



@kopf.daemon(
"pod",
labels={
"app.kubernetes.io/name": "netchecks",
"app.kubernetes.io/component": "probe",
},
)
@ASSERTION_RESULT_TIME.time()
def monitor_selected_netcheck_pods(name, namespace, spec, status, stopped, **kwargs):
logger = get_logger(name=name, namespace=namespace)
logger.info("Monitoring pod")
core_v1 = client.api.core_v1_api.CoreV1Api()

while not stopped:
logger.debug("Getting pod status")
pod: V1Pod = core_v1.read_namespaced_pod(name=name, namespace=namespace)
Expand Down Expand Up @@ -367,7 +396,8 @@ def monitor_selected_netcheck_pods(name, namespace, spec, status, stopped, **kwa
return

# Process the results, creating or updating the associated PolicyReport
process_probe_output(pod_log, assertion_name, namespace, name)
with metered_duration(ASSERTION_RESULT_TIME, {"assertion_name": assertion_name}):
process_probe_output(pod_log, assertion_name, namespace, name)

break
case _:
Expand Down Expand Up @@ -578,8 +608,12 @@ def process_probe_output(pod_log: str, network_assertion_name, namespace, pod_na
test_end_iso_timestamp = datetime.datetime.fromisoformat(test_result["data"]["endTimestamp"])
test_duration = (test_end_iso_timestamp - test_start_iso_timestamp).total_seconds()

ASSERTION_TEST_TIME.labels(name=network_assertion_name, type=test_result["spec"]["type"]).observe(
test_duration
ASSERTION_TEST_TIME.record(
test_duration,
{
"name": network_assertion_name,
"type": test_result["spec"]["type"],
},
)


Expand Down
Loading

0 comments on commit d990415

Please sign in to comment.