diff --git a/ansible/README.dev.md b/ansible/README.dev.md index 512d61570a..146a70e946 100644 --- a/ansible/README.dev.md +++ b/ansible/README.dev.md @@ -278,7 +278,7 @@ custom_image_config: - components: - mlserver image: - tag: 1.6.0 + tag: 1.6.1 custom_servers_values: mlserver: @@ -346,7 +346,7 @@ custom_image_config: - components: - mlserver image: - tag: 1.6.0 + tag: 1.6.1 custom_components_values: kafka: diff --git a/ansible/roles/grafana/defaults/main.yaml b/ansible/roles/grafana/defaults/main.yaml index 65ed25e8e0..453db1de56 100644 --- a/ansible/roles/grafana/defaults/main.yaml +++ b/ansible/roles/grafana/defaults/main.yaml @@ -1,6 +1,6 @@ --- -grafana_chart_version: 8.4.1 -grafana_app_version: 11.1.3 +grafana_chart_version: 8.6.4 +grafana_app_version: 11.3.1 grafana_preloaded_dashboards: - name: mms diff --git a/ansible/roles/jaeger/defaults/main.yaml b/ansible/roles/jaeger/defaults/main.yaml index 610ce3b4a4..a6692661b0 100644 --- a/ansible/roles/jaeger/defaults/main.yaml +++ b/ansible/roles/jaeger/defaults/main.yaml @@ -1,7 +1,7 @@ --- jaeger_namespace: observability -jaeger_version: v1.53.0 +jaeger_version: v1.62.0 jaeger_yaml: "https://github.com/jaegertracing/jaeger-operator/releases/download/{{ jaeger_version }}/jaeger-operator.yaml" jaeger_wait_for_deployments: true diff --git a/ansible/roles/opentelemetry/defaults/main.yaml b/ansible/roles/opentelemetry/defaults/main.yaml index aecb733803..70cf99712f 100644 --- a/ansible/roles/opentelemetry/defaults/main.yaml +++ b/ansible/roles/opentelemetry/defaults/main.yaml @@ -1,7 +1,7 @@ --- opentelemetry_namespace: opentelemetry-operator-system -opentelemetry_version: v0.92.0 +opentelemetry_version: v0.114.1 opentelemetry_yaml: "https://github.com/open-telemetry/opentelemetry-operator/releases/download/{{ opentelemetry_version }}/opentelemetry-operator.yaml" opentelemetry_wait_for_deployments: true diff --git a/ansible/roles/prometheus/defaults/main.yaml b/ansible/roles/prometheus/defaults/main.yaml index ce4445e1b2..37511149fe 100644 --- a/ansible/roles/prometheus/defaults/main.yaml +++ b/ansible/roles/prometheus/defaults/main.yaml @@ -1,4 +1,8 @@ --- seldon_monitoring_namespace: "seldon-monitoring" seldon_monitoring_prometheus_operator_values: "{{ lookup('file', 'prometheus-operator-values.yaml') | from_yaml }}" -seldon_monitoring_prometheus_operator_chart_version: "8.3.6" +seldon_monitoring_prometheus_adapter_values: "{{ lookup('file', 'prometheus-adapter-values.yaml') | from_yaml }}" +seldon_monitoring_prometheus_operator_chart_version: "10.0.4" +seldon_monitoring_prometheus_adapter_chart_version: "4.11.0" + +seldon_prometheus_adapter_config_rate: "5m" diff --git a/ansible/roles/prometheus/files/prometheus-adapter-values.yaml b/ansible/roles/prometheus/files/prometheus-adapter-values.yaml new file mode 100644 index 0000000000..82aacc597e --- /dev/null +++ b/ansible/roles/prometheus/files/prometheus-adapter-values.yaml @@ -0,0 +1,2 @@ +prometheus: + url: http://seldon-monitoring-prometheus \ No newline at end of file diff --git a/ansible/roles/prometheus/tasks/main.yaml b/ansible/roles/prometheus/tasks/main.yaml index 2371c7a29c..482311a402 100644 --- a/ansible/roles/prometheus/tasks/main.yaml +++ b/ansible/roles/prometheus/tasks/main.yaml @@ -14,3 +14,22 @@ chart_ref: "kube-prometheus" chart_version: "{{ seldon_monitoring_prometheus_operator_chart_version }}" values: "{{ seldon_monitoring_prometheus_operator_values }}" + + +- name: Install Prometheus Adapter + kubernetes.core.helm: + name: prometheus-adapter + release_namespace: "{{ seldon_monitoring_namespace }}" + chart_repo_url: "https://prometheus-community.github.io/helm-charts" + chart_ref: "prometheus-adapter" + chart_version: "{{ seldon_monitoring_prometheus_adapter_chart_version }}" + values: "{{ seldon_monitoring_prometheus_adapter_values }}" + +- name: Create Seldon HPA ConfigMap + kubernetes.core.k8s: + state: present + namespace: "{{ seldon_monitoring_namespace }}" + template: "templates/seldon-prometheus-adapter-configmap.j2" + +- name: "Rollout Prometheus Adapter" + shell: "kubectl rollout restart deployment prometheus-adapter -n {{ seldon_monitoring_namespace }}" diff --git a/ansible/roles/prometheus/templates/seldon-prometheus-adapter-configmap.j2 b/ansible/roles/prometheus/templates/seldon-prometheus-adapter-configmap.j2 new file mode 100644 index 0000000000..a89e9c1606 --- /dev/null +++ b/ansible/roles/prometheus/templates/seldon-prometheus-adapter-configmap.j2 @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-adapter +data: + config.yaml: |- + "rules": + - + "seriesQuery": | + {__name__="seldon_model_infer_total",namespace!=""} + "resources": + "overrides": + "model": {group: "mlops.seldon.io", resource: "model"} + "server": {group: "mlops.seldon.io", resource: "server"} + "pod": {resource: "pod"} + "namespace": {resource: "namespace"} + "name": + "matches": "seldon_model_infer_total" + "as": "infer_rps" + "metricsQuery": | + sum by (<<.GroupBy>>) ( + rate ( + <<.Series>>{<<.LabelMatchers>>}[{{ seldon_prometheus_adapter_config_rate }}] + ) + ) \ No newline at end of file diff --git a/ansible/roles/strimzi/defaults/main.yaml b/ansible/roles/strimzi/defaults/main.yaml index 9a82f686e4..cf0abd38fa 100644 --- a/ansible/roles/strimzi/defaults/main.yaml +++ b/ansible/roles/strimzi/defaults/main.yaml @@ -1,7 +1,7 @@ --- strimzi_kafka_operator_namespace: "strimzi-system" -strimzi_kafka_operator_version: "0.33.2" +strimzi_kafka_operator_version: "0.44.0" strimzi_kafka_operator_values: "{{ lookup('template', 'strimzi-operator-values.yaml.j2') | from_yaml }}" strimzi_kafka_operator_wait_timeout: "300s" -strimzi_kafka_operator_feature_gates: "+UseKRaft,+UseStrimziPodSets" +strimzi_kafka_operator_feature_gates: "" diff --git a/docs-gb/kubernetes/kafka.md b/docs-gb/kubernetes/kafka.md index 0b4632fcae..157691439a 100644 --- a/docs-gb/kubernetes/kafka.md +++ b/docs-gb/kubernetes/kafka.md @@ -53,7 +53,6 @@ You can enable `featureGates` during Helm installation via: helm upgrade --install strimzi-kafka-operator \ strimzi/strimzi-kafka-operator \ --namespace seldon-mesh --create-namespace \ - --set featureGates='+UseKRaft\,+UseStrimziPodSets' ``` {% hint style="warning" %} @@ -89,5 +88,5 @@ ansible-playbook playbooks/setup-ecosystem.yaml -e full_install=no -e install_ka ## Notes - You can check [kafka-examples](https://github.com/strimzi/strimzi-kafka-operator/tree/main/examples/kafka) for more details. -- As we are using [KRaft](https://kafka.apache.org/documentation/#kraft), use Kafka version 3.3 or above. +- As we are using [KRaft](https://kafka.apache.org/documentation/#kraft), use Kafka version 3.4 or above. - For security settings check [here](../getting-started/kubernetes-installation/security.md#kafka). diff --git a/k8s/helm-charts/seldon-core-v2-setup/values.yaml b/k8s/helm-charts/seldon-core-v2-setup/values.yaml index 1e1c0c75ab..3617fa44c6 100644 --- a/k8s/helm-charts/seldon-core-v2-setup/values.yaml +++ b/k8s/helm-charts/seldon-core-v2-setup/values.yaml @@ -265,7 +265,7 @@ serverConfig: pullPolicy: IfNotPresent registry: docker.io repository: seldonio/mlserver - tag: 1.6.0 + tag: 1.6.1 serverCapabilities: "mlserver,alibi-detect,alibi-explain,huggingface,lightgbm,mlflow,python,sklearn,spark-mlib,xgboost" modelVolumeStorage: 1Gi resources: diff --git a/k8s/helm-charts/seldon-core-v2-setup/values.yaml.template b/k8s/helm-charts/seldon-core-v2-setup/values.yaml.template index e4337192dd..dec1079ac7 100644 --- a/k8s/helm-charts/seldon-core-v2-setup/values.yaml.template +++ b/k8s/helm-charts/seldon-core-v2-setup/values.yaml.template @@ -265,7 +265,7 @@ serverConfig: pullPolicy: IfNotPresent registry: docker.io repository: seldonio/mlserver - tag: 1.6.0 + tag: 1.6.1 serverCapabilities: "mlserver,alibi-detect,alibi-explain,huggingface,lightgbm,mlflow,python,sklearn,spark-mlib,xgboost" modelVolumeStorage: 1Gi resources: diff --git a/k8s/yaml/components.yaml b/k8s/yaml/components.yaml index 3ed31a762f..a49a752018 100644 --- a/k8s/yaml/components.yaml +++ b/k8s/yaml/components.yaml @@ -1107,7 +1107,7 @@ spec: value: "false" - name: MLSERVER_GRPC_MAX_MESSAGE_LENGTH value: "1048576000" - image: 'docker.io/seldonio/mlserver:1.6.0' + image: 'docker.io/seldonio/mlserver:1.6.1' imagePullPolicy: 'IfNotPresent' lifecycle: preStop: diff --git a/kafka/strimzi/README.md b/kafka/strimzi/README.md index 02cb28f991..89460470b0 100644 --- a/kafka/strimzi/README.md +++ b/kafka/strimzi/README.md @@ -23,7 +23,6 @@ You can enable `featureGates` during Helm installation via: helm upgrade --install strimzi-kafka-operator \ strimzi/strimzi-kafka-operator \ --namespace seldon-mesh --create-namespace \ - --set featureGates='+UseKRaft\,+UseStrimziPodSets' ``` ```{warning} diff --git a/kafka/strimzi/templates/cluster.yaml b/kafka/strimzi/templates/cluster.yaml index a6ae168dae..0c50791080 100644 --- a/kafka/strimzi/templates/cluster.yaml +++ b/kafka/strimzi/templates/cluster.yaml @@ -2,11 +2,15 @@ apiVersion: kafka.strimzi.io/v1beta2 kind: Kafka metadata: name: {{ .Values.cluster.name }} + annotations: + strimzi.io/node-pools: enabled + strimzi.io/kraft: enabled spec: entityOperator: userOperator: {} kafka: version: {{ .Values.cluster.version }} + metadataVersion: {{ .Values.cluster.metadataVersion }} replicas: {{ .Values.broker.replicas }} listeners: {{- if .Values.broker.plaintext.enabled }} @@ -29,12 +33,6 @@ spec: livenessProbe: initialDelaySeconds: {{ .Values.broker.liveness.initialDelaySeconds }} timeoutSeconds: {{ .Values.broker.liveness.timeoutSeconds }} - resources: - requests: - cpu: '{{ .Values.broker.resources.cpu }}' - memory: '{{ .Values.broker.resources.memory }}' - limits: - memory: '{{ .Values.broker.resources.memory }}' config: auto.create.topics.enable: {{ .Values.topic.autoCreate }} offsets.topic.replication.factor: {{ .Values.topic.offsetReplicationFactor }} @@ -42,40 +40,10 @@ spec: transaction.state.log.min.isr: {{ .Values.topic.txStateMinISR }} default.replication.factor: {{ .Values.topic.defaultReplicationFactor }} min.insync.replicas: {{ .Values.topic.minISR }} - inter.broker.protocol.version: {{ .Values.broker.interBrokerProtocolVersion }} message.max.bytes: {{ .Values.broker.messageMaxBytes }} - template: - pod: - tmpDirSizeLimit: {{ .Values.broker.tmpDirSizeLimit }} - storage: - type: jbod - volumes: - - id: 0 - type: persistent-claim - size: {{ .Values.broker.pvcSize }} - deleteClaim: false metricsConfig: type: jmxPrometheusExporter valueFrom: configMapKeyRef: name: kafka-metrics key: kafka-metrics-config.yml - # zookeeper settings should not be use in case of STRIMZI_FEATURE_GATES=+UseStrimziPodSets,+UseKRaft (raft) - # to enable raft run: - # `kubectl set env deployment/strimzi-cluster-operator STRIMZI_FEATURE_GATES=+UseStrimziPodSets,+UseKRaft -n kafka` - # which is the default with ansible install - zookeeper: - replicas: {{ .Values.zookeeper.replicas }} - readinessProbe: - initialDelaySeconds: {{ .Values.zookeeper.readiness.initialDelaySeconds }} - timeoutSeconds: {{ .Values.zookeeper.readiness.timeoutSeconds }} - livenessProbe: - initialDelaySeconds: {{ .Values.zookeeper.liveness.initialDelaySeconds }} - timeoutSeconds: {{ .Values.zookeeper.liveness.timeoutSeconds }} - storage: - type: persistent-claim - size: {{ .Values.zookeeper.pvcSize }} - deleteClaim: false - kafkaExporter: - topicRegex: ".*" - groupRegex: ".*" diff --git a/kafka/strimzi/templates/pool.yaml b/kafka/strimzi/templates/pool.yaml new file mode 100644 index 0000000000..be7ceb51d3 --- /dev/null +++ b/kafka/strimzi/templates/pool.yaml @@ -0,0 +1,32 @@ +apiVersion: kafka.strimzi.io/v1beta2 +kind: KafkaNodePool +metadata: + name: kafka + labels: + strimzi.io/cluster: {{ .Values.cluster.name }} +spec: + replicas: {{ .Values.broker.replicas }} + roles: + - broker + - controller + + resources: + requests: + cpu: '{{ .Values.broker.resources.cpu }}' + memory: '{{ .Values.broker.resources.memory }}' + limits: + memory: '{{ .Values.broker.resources.memory }}' + template: + pod: + tmpDirSizeLimit: {{ .Values.broker.tmpDirSizeLimit }} + storage: + type: jbod + volumes: + - id: 0 + type: ephemeral + sizeLimit: {{ .Values.broker.kraftMetadataSizeLimit }} + kraftMetadata: shared + - id: 1 + type: persistent-claim + size: {{ .Values.broker.pvcSize }} + deleteClaim: false diff --git a/kafka/strimzi/values.yaml b/kafka/strimzi/values.yaml index 223213a3fd..cdf5bb445a 100644 --- a/kafka/strimzi/values.yaml +++ b/kafka/strimzi/values.yaml @@ -2,7 +2,8 @@ cluster: name: "seldon" - version: "3.3.1" + version: "3.8.0" + metadataVersion: "3.8.0" metrics: enabled: true @@ -30,10 +31,10 @@ broker: initialDelaySeconds: 15 timeoutSeconds: 5 - interBrokerProtocolVersion: "3.3" - tmpDirSizeLimit: 100Mi + kraftMetadataSizeLimit: 1Gi + pvcSize: 100Gi messageMaxBytes: 1000000000 @@ -50,15 +51,3 @@ topic: defaultReplicationFactor: 1 minISR: 1 -zookeeper: - replicas: 1 - - readiness: - initialDelaySeconds: 15 - timeoutSeconds: 5 - - liveness: - initialDelaySeconds: 15 - timeoutSeconds: 5 - - pvcSize: 100Gi diff --git a/operator/Makefile b/operator/Makefile index 43ce171711..f58a30e9f7 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -5,7 +5,7 @@ IMG ?= ${DOCKERHUB_USERNAME}/seldonv2-controller:${CUSTOM_IMAGE_TAG} IMG_CLI ?= ${DOCKERHUB_USERNAME}/seldon-cli:${CUSTOM_IMAGE_TAG} AGENT_IMG ?= ${DOCKERHUB_USERNAME}/seldon-agent:${CUSTOM_IMAGE_TAG} RCLONE_IMG ?= ${DOCKERHUB_USERNAME}/seldon-rclone:${CUSTOM_IMAGE_TAG} -MLSERVER_IMG ?= seldonio/mlserver:1.6.0 +MLSERVER_IMG ?= seldonio/mlserver:1.6.1 TRITON_IMG ?= nvcr.io/nvidia/tritonserver:23.03-py3 # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.22 diff --git a/operator/config/serverconfigs/kustomization.yaml b/operator/config/serverconfigs/kustomization.yaml index 1c751358e6..9f80734795 100644 --- a/operator/config/serverconfigs/kustomization.yaml +++ b/operator/config/serverconfigs/kustomization.yaml @@ -9,7 +9,7 @@ images: newTag: latest - name: mlserver newName: seldonio/mlserver - newTag: 1.6.0 + newTag: 1.6.1 - name: rclone newName: seldonio/seldon-rclone newTag: latest diff --git a/samples/smoke-tests.sh b/samples/smoke-tests.sh index e18997359b..8c1f84d810 100755 --- a/samples/smoke-tests.sh +++ b/samples/smoke-tests.sh @@ -28,13 +28,13 @@ function load() { else if [ $1 == "model" ] then - seldon model load -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon model load -f $2 elif [ $1 == "pipeline" ] then - seldon pipeline load -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon pipeline load -f $2 elif [ $1 == "experiment" ] then - seldon experiment start -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon experiment start -f $2 fi fi } @@ -47,13 +47,13 @@ function unload() { else if [ $1 == "model" ] then - seldon model unload $2 + SELDON_FORCE_CONTROL_PLANE=true seldon model unload $2 elif [ $1 == "pipeline" ] then - seldon pipeline unload $2 + SELDON_FORCE_CONTROL_PLANE=true seldon pipeline unload $2 elif [ $1 == "experiment" ] then - seldon experiment stop $2 + SELDON_FORCE_CONTROL_PLANE=true seldon experiment stop $2 fi fi } @@ -74,10 +74,10 @@ function status() { else if [ $1 == "model" ] then - seldon model status $2 -w ModelAvailable | jq -M . + seldon model status $2 -w ModelAvailable -t 10 | jq -M . elif [ $1 == "pipeline" ] then - seldon pipeline status $2 -w PipelineReady | jq -M . + seldon pipeline status $2 -w PipelineReady -t 10 | jq -M . elif [ $1 == "experiment" ] then seldon experiment status $2 -w | jq -M . @@ -189,9 +189,9 @@ unload pipeline trigger-joins ./pipelines/trigger-joins.yaml unload model mul10 ./models/mul10.yaml unload model add10 ./models/add10.yaml +sleep $sleepTime # MLServer -sleep $sleepTime load model ./models/sklearn-iris-gs.yaml status model iris seldon model infer iris '{"inputs": [{"name": "predict", "shape": [1, 4], "datatype": "FP32", "data": [[1, 2, 3, 4]]}]}' @@ -199,6 +199,7 @@ seldon model infer iris --inference-mode grpc \ '{"model_name":"iris","inputs":[{"name":"input","contents":{"fp32_contents":[1,2,3,4]},"datatype":"FP32","shape":[1,4]}]}' | jq -M . unload model iris ./models/sklearn-iris-gs.yaml +sleep $sleepTime # Experiments load model ./models/sklearn1.yaml diff --git a/samples/stress-tests.sh b/samples/stress-tests.sh index 5aa55c9288..30d0e93de5 100755 --- a/samples/stress-tests.sh +++ b/samples/stress-tests.sh @@ -40,13 +40,13 @@ function load() { else if [ $1 == "model" ] then - seldon model load -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon model load -f $2 elif [ $1 == "pipeline" ] then - seldon pipeline load -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon pipeline load -f $2 elif [ $1 == "experiment" ] then - seldon experiment start -f $2 + SELDON_FORCE_CONTROL_PLANE=true seldon experiment start -f $2 fi fi } @@ -59,13 +59,13 @@ function unload() { else if [ $1 == "model" ] then - seldon model unload $2 + SELDON_FORCE_CONTROL_PLANE=true seldon model unload $2 elif [ $1 == "pipeline" ] then - seldon pipeline unload $2 + SELDON_FORCE_CONTROL_PLANE=true seldon pipeline unload $2 elif [ $1 == "experiment" ] then - seldon experiment stop $2 + SELDON_FORCE_CONTROL_PLANE=true seldon experiment stop $2 fi fi } @@ -86,13 +86,13 @@ function status() { else if [ $1 == "model" ] then - seldon model status $2 -w ModelAvailable | jq -M . + seldon model status $2 -w ModelAvailable -t 10 | jq -M . elif [ $1 == "pipeline" ] then - seldon pipeline status $2 -w PipelineReady | jq -M . + seldon pipeline status $2 -w PipelineReady -t 10 | jq -M . elif [ $1 == "experiment" ] then - seldon experiment status $2 -w | jq -M . + seldon experiment status $2 -w -t 10 | jq -M . fi fi } diff --git a/scheduler/Makefile b/scheduler/Makefile index b118dcbc6e..c7bb8d5e9e 100644 --- a/scheduler/Makefile +++ b/scheduler/Makefile @@ -8,9 +8,9 @@ ENVOY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-envoy:${CUSTOM_IMAGE_TAG} GRAFANA_IMG ?= ${DOCKERHUB_USERNAME}/seldon-grafana:${CUSTOM_IMAGE_TAG} PROMETHEUS_IMG ?= prom/prometheus:latest HODOMETER_IMG ?= ${DOCKERHUB_USERNAME}/seldon-hodometer:${CUSTOM_IMAGE_TAG} -KAFKA_IMG ?= docker.io/bitnami/kafka:3.3.1 +KAFKA_IMG ?= docker.io/bitnami/kafka:3.8.1 ZOOKEEPER_IMG ?= docker.io/bitnami/zookeeper:3.8 -MLSERVER_IMG ?= seldonio/mlserver:1.6.0 +MLSERVER_IMG ?= seldonio/mlserver:1.6.1 MODELGATEWAY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-modelgateway:${CUSTOM_IMAGE_TAG} OTELCOL_IMG ?= otel/opentelemetry-collector-contrib-dev:latest JAEGER_IMG ?= jaegertracing/all-in-one:latest diff --git a/scheduler/env.all b/scheduler/env.all index 57cb872a21..10decc8a97 100644 --- a/scheduler/env.all +++ b/scheduler/env.all @@ -41,7 +41,7 @@ PIPELINEGATEWAY_IMAGE_AND_TAG=seldonio/seldon-pipelinegateway:latest SERVER_MLSERVER_IMAGE_AND_TAG=seldonio/mlserver:1.3.5 SERVER_TRITON_IMAGE_AND_TAG=nvcr.io/nvidia/tritonserver:23.03-py3 SCHEDULER_IMAGE_AND_TAG=seldonio/seldon-scheduler:latest -KAFKA_IMAGE_AND_TAG=docker.io/bitnami/kafka:3.3 +KAFKA_IMAGE_AND_TAG=docker.io/bitnami/kafka:3.8 GRAFANA_IMAGE_AND_TAG=seldonio/seldon-grafana:latest AGENT_OVERCOMMIT_PERCENTAGE=20