Add infrastructure-as-code for MLflow tracking server
kouddy committed Jul 26, 2020
1 parent f2c8c1f commit aff38ed
Showing 6 changed files with 161 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.iml
*.idea
Empty file added .local-notes/commands.md
Empty file.
25 changes: 25 additions & 0 deletions Dockerfile
@@ -0,0 +1,25 @@
FROM continuumio/miniconda3:latest

RUN apt-get -y update
RUN apt-get -y upgrade

RUN pip install --upgrade pip
RUN pip install mlflow==1.10.0 boto3 awscli

# Install the MySQL connector required by SQLAlchemy (https://docs.sqlalchemy.org/en/13/dialects/mysql.html#module-sqlalchemy.dialects.mysql.mysqlconnector)
RUN apt-get -y install default-libmysqlclient-dev build-essential
RUN pip install mysqlclient

# If you want to use the file store as the backend store and want to improve the tracking server's performance, uncomment the following lines. See https://mlflow.org/docs/latest/tracking.html#id53
#RUN apt-get -y install libyaml-cpp-dev libyaml-dev
#RUN pip --no-cache-dir install --force-reinstall -I pyyaml

# Create the app directory in one step (a cd in its own RUN layer does not persist)
RUN mkdir -p /app/mlflow

COPY run.sh /app/mlflow/run.sh

RUN chmod 777 /app/mlflow/run.sh

ENTRYPOINT ["/app/mlflow/run.sh"]
37 changes: 36 additions & 1 deletion README.md
@@ -1 +1,36 @@
# mlflow-openshift-infrastructure
# mlflow-tracking-server-openshift-infrastructure

## Introduction
This repository contains MLflow's [tracking server](https://www.mlflow.org/docs/latest/tracking.html) Dockerfile as well as infrastructure-as-code needed to run on OpenShift.

## Quick start
### Trying it out locally
The fastest way to try it out is to start a container locally.
First, build the container image by running:
```
docker build .
```

You can then start the container by running:
```
docker run -it -p 5000:5000 \
-e BACKEND_STORE_URI="/app/mlflow/mlruns" \
<container image ID>
```

Note that if the file store is used as the backend store, MLflow cannot use an artifact store, so functionality such as uploading artifacts and the Model Registry will be disabled.
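
For a quick smoke test you can point an MLflow client at the local container and log a few values. The snippet below is a minimal sketch: the experiment name is arbitrary, and it assumes the `docker run` command above is still running on port 5000.
```
import mlflow

# Assumes the container started above is reachable on localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("local-smoke-test")  # hypothetical experiment name

with mlflow.start_run():
    mlflow.log_param("alpha", 0.5)
    mlflow.log_metric("rmse", 0.73)
```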

### Running it on OpenShift for production use case
After running `docker build .` and pushing the image to an image registry such as Docker Hub, you can deploy with the following command:
```
oc process -f deployment.yaml \
-p IMAGE_URL=<image URL> \
-p BACKEND_STORE_URI=mysql://user:password@mysql:3306/sampledb \
-p DEFAULT_ARTIFACT_ROOT="s3://<your bucket>/artifacts" \
-p AWS_ACCESS_KEY_ID=<key> \
-p AWS_SECRET_ACCESS_KEY=<password> \
| oc apply -f-
```
Because the tracking server is now deployed remotely (and is typically used in production), a database-backed backend store is recommended over the file store.
Likewise, store artifacts in AWS S3, Azure Blob Storage, or Google Cloud Storage rather than on the local filesystem.
Note that dependencies such as the backend store (e.g. a MySQL database) and the artifact store (e.g. an S3 bucket) must be provisioned before running the command above.
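
Once the route is up, clients simply point their tracking URI at it. The snippet below is a minimal sketch: the route hostname is hypothetical, and because the MLflow client uploads artifacts to S3 directly, it also needs credentials for the bucket configured in `DEFAULT_ARTIFACT_ROOT`.
```
import os
import mlflow

# Hypothetical route URL; replace with the host exposed by the OpenShift Route.
mlflow.set_tracking_uri("http://mlflow-tracking-server-route-default.apps.example.com")

# The client uploads artifacts straight to S3, so it needs credentials too.
os.environ["AWS_ACCESS_KEY_ID"] = "<key>"
os.environ["AWS_SECRET_ACCESS_KEY"] = "<password>"

with mlflow.start_run():
    mlflow.log_metric("rmse", 0.73)
    with open("report.txt", "w") as f:
        f.write("training report")
    # Works because the backend store is a database and artifacts go to S3.
    mlflow.log_artifact("report.txt")
```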
88 changes: 88 additions & 0 deletions deployment.yaml
@@ -0,0 +1,88 @@
apiVersion: v1
kind: Template
labels:
  app: ${APP_NAME}
metadata:
  name: ${APP_NAME}
objects:
- apiVersion: v1
  kind: Route
  metadata:
    name: ${APP_NAME}-route
    labels:
      app: ${APP_NAME}
  spec:
    to:
      kind: Service
      name: ${APP_NAME}-svc

- apiVersion: v1
  kind: Service
  metadata:
    name: ${APP_NAME}-svc
    namespace: default
    labels:
      app: ${APP_NAME}
  spec:
    ports:
    - port: 80
      targetPort: 5000
      protocol: TCP
    selector:
      app: ${APP_NAME}

- apiVersion: v1
  kind: DeploymentConfig
  metadata:
    name: ${APP_NAME}-dep
    labels:
      app: ${APP_NAME}
  spec:
    replicas: 1
    strategy:
      type: Recreate
    template:
      metadata:
        labels:
          app: ${APP_NAME}
      spec:
        containers:
        - name: ${APP_NAME}
          imagePullPolicy: Always
          image: ${IMAGE_URL}
          ports:
          - containerPort: 5000
            name: plaintext-port
          resources:
            requests:
              cpu: 100m
              memory: 300Mi
            limits:
              cpu: 200m
              memory: 600Mi
          env:
          - name: BACKEND_STORE_URI
            value: ${BACKEND_STORE_URI}
          - name: DEFAULT_ARTIFACT_ROOT
            value: ${DEFAULT_ARTIFACT_ROOT}
          - name: AWS_ACCESS_KEY_ID
            value: ${AWS_ACCESS_KEY_ID}
          - name: AWS_SECRET_ACCESS_KEY
            value: ${AWS_SECRET_ACCESS_KEY}
parameters:
- name: IMAGE_URL
  description: URL of the Docker image.
- name: APP_NAME
  description: Application name to use in OpenShift.
  value: mlflow-tracking-server
- name: BACKEND_STORE_URI
  description: URI of the backend store for MLflow. If the tracking server will be accessed remotely by other people, a database-backed store is recommended.
  value: mysql://user:password@mysql:3306/sampledb
- name: DEFAULT_ARTIFACT_ROOT
  description: Path of the artifact store. If the tracking server will be accessed remotely by other people, the file store is not recommended.
  value: "s3://bucket/artifacts"
- name: AWS_ACCESS_KEY_ID
  description: AWS access key ID with access to the S3 bucket above.
- name: AWS_SECRET_ACCESS_KEY
  description: AWS secret access key with access to the S3 bucket above.
10 changes: 10 additions & 0 deletions run.sh
@@ -0,0 +1,10 @@
#!/bin/sh

set -e

cd /app/mlflow
mlflow server \
--backend-store-uri=${BACKEND_STORE_URI} \
--default-artifact-root=${DEFAULT_ARTIFACT_ROOT} \
--host 0.0.0.0 \
--port 5000
