diff --git a/.github/workflows/deploy_docs_to_gh_pages.yaml b/.github/workflows/deploy_docs_to_gh_pages.yaml
new file mode 100644
index 00000000..7bf5a377
--- /dev/null
+++ b/.github/workflows/deploy_docs_to_gh_pages.yaml
@@ -0,0 +1,54 @@
+# Deploy CMF mkdocs-based documentation to GitHub pages. The CMF docs need to be built first (because we automatically
+# build API documentation pages).
+# https://github.com/peaceiris/actions-gh-pages
+
+name: Build CMF Docs & Deploy to GitHub pages
+
+on:
+ push:
+ branches:
+ - master
+ paths:
+ - '.github/workflows/deploy_docs_to_gh_pages.yaml' # The workflow file itself.
+ - 'docs/**' # Documentation files
+ - '!docs/_src/**' # but exclude this one (raw resources for docs).
+ - 'cmflib/cmf.py' # Public API documentation.
+
+jobs:
+ deploy-docs-to-gh-pages:
+ # Do not run on forked repositories.
+ if: github.repository_owner == 'HewlettPackard'
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write
+ steps:
+ - name: Checkout Repository
+ uses: actions/checkout@v3
+
+ - name: Setup Python Environment
+ uses: actions/setup-python@v3
+ with:
+ python-version: '3.8'
+
+ - name: Install Python Dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r docs/requirements.txt
+
+ - name: Build Docs
+ working-directory: './'
+      # First build the documentation, then remove the directory that contains raw source files for documentation
+      # assets; these raw files are not required for serving the CMF documentation site.
+ run: |
+ mkdocs build --theme material --site-dir ../site/
+ rm -r ../site/_src
+
+      - name: Deploy Docs To GitHub Pages
+ # This step will deploy the generated documentation to `gh-pages` branch.
+ uses: peaceiris/actions-gh-pages@v3.9.0
+ with:
+ user_name: 'github-actions[bot]'
+ user_email: 'github-actions[bot]@users.noreply.github.com'
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ publish_dir: ../site
+ allow_empty_commit: true
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..0cd4d772
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,88 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+./build/
+mlcommons_box/build
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+*.ipynb
+# pyenv
+.python-version
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# PyCharm, IDEA
+.idea
+
diff --git a/Dockerfile b/Dockerfile
index 6de84301..58c1cebe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,50 +12,61 @@ FROM jupyter/tensorflow-notebook
#FROM nvcr.io/partners/chainer:4.0.0b1
#Following: https://jupyter-docker-stacks.readthedocs.io/en/latest/using/recipes.html#
-ARG NB_USER
+ARG NB_USER
ARG NB_UID
ARG NB_GID
+
# name your environment and choose the python version
-ARG conda_env=python37
-ARG py_ver=3.7
+#ARG conda_env=python37
+#ARG py_ver=3.7
+
+#https://stackoverflow.com/questions/53004311/how-to-add-conda-environment-to-jupyter-lab
+#https://gist.github.com/James-Leslie/734babcbc1dd3f5fbc828af773922497
+RUN conda install -c conda-forge nb_conda_kernels
# you can add additional libraries you want mamba to install by listing them below the first line and ending with "&& \"
-RUN mamba create --quiet --yes -p "${CONDA_DIR}/envs/${conda_env}" python=${py_ver} ipython ipykernel && \
- mamba clean --all -f -y
+#RUN mamba create --quiet --yes -p "${CONDA_DIR}/envs/${conda_env}" python=${py_ver} ipython ipykernel && \
+# mamba clean --all -f -y
+
+#RUN conda activate ${conda_env}
# create Python kernel and link it to jupyter
-RUN "${CONDA_DIR}/envs/${conda_env}/bin/python" -m ipykernel install --user --name="${conda_env}" && \
- fix-permissions "${CONDA_DIR}" && \
- fix-permissions "/home/${NB_USER}"
+#RUN "${CONDA_DIR}/envs/${conda_env}/bin/python" -m ipykernel install --user --name="${conda_env}" && \
+# fix-permissions "${CONDA_DIR}" && \
+# fix-permissions "/home/${NB_USER}"
# any additional pip installs can be added by uncommenting the following line
#RUN "${CONDA_DIR}/envs/${conda_env}/bin/pip" install --quiet --no-cache-dir
# if you want this environment to be the default one, uncomment the following line:
-RUN echo "conda activate ${conda_env}" >> "${HOME}/.bashrc"
#RUN apt-get update; apt-get install -y build-essential
USER ${NB_USER}
-RUN "${CONDA_DIR}/envs/${conda_env}/bin/pip" install --quiet --no-cache-dir 'flake8==3.9.2' && \
- fix-permissions "${CONDA_DIR}" && \
- fix-permissions "/home/${NB_USER}" && \
- mkdir /home/${NB_USER}/cmflib/
+#RUN "${CONDA_DIR}/envs/${conda_env}/bin/pip" install --quiet --no-cache-dir 'flake8==3.9.2' && \
+
+# fix-permissions "${CONDA_DIR}" && \
+# fix-permissions "/home/${NB_USER}" && \
+RUN mkdir /home/${NB_USER}/cmflib/
-COPY --chown=${NB_UID}:${NB_GID} cmflib/Requirements.txt /home/${NB_USER}/cmflib/
-RUN "${CONDA_DIR}/envs/${conda_env}/bin/pip" install --no-cache-dir --requirement /home/${NB_USER}/cmflib/Requirements.txt && \
+COPY --chown=${NB_UID}:${NB_GID} Requirements.txt /home/${NB_USER}/cmflib/
+RUN "${CONDA_DIR}/bin/pip" install --no-cache-dir --requirement /home/${NB_USER}/cmflib/Requirements.txt && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
-COPY --chown=${NB_UID}:${NB_GID} cmflib/cmflib /home/${NB_USER}/cmflib/cmflib
-COPY --chown=${NB_UID}:${NB_GID} cmflib/setup.py /home/${NB_USER}/cmflib/setup.py
-RUN cd /home/${NB_USER}/cmflib && "${CONDA_DIR}/envs/${conda_env}/bin/pip" install --no-cache . && \
+COPY --chown=${NB_UID}:${NB_GID} cmflib /home/${NB_USER}/cmflib/cmflib
+COPY --chown=${NB_UID}:${NB_GID} setup.py /home/${NB_USER}/cmflib/setup.py
+RUN cd /home/${NB_USER}/cmflib && "${CONDA_DIR}/bin/pip" install --no-cache . && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
#ENV PYTHONPATH "${PYTHONPATH}:/home/${NB_USER}/cmflib"
-COPY --chown=${NB_UID}:${NB_GID} cmflib/example-get-started /home/${NB_USER}/example-get-started
+COPY --chown=${NB_UID}:${NB_GID} examples/example-get-started /home/${NB_USER}/example-get-started
+
+
+# if you want this environment to be the default one, uncomment the following line:
+#RUN echo "conda activate ${conda_env}" >> "${HOME}/.bashrc"
diff --git a/Query_Tester-base_mlmd.ipynb b/Query_Tester-base_mlmd.ipynb
new file mode 100644
index 00000000..69ec3afa
--- /dev/null
+++ b/Query_Tester-base_mlmd.ipynb
@@ -0,0 +1,1569 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "92316a38",
+ "metadata": {},
+ "source": [
+    "### This Jupyter notebook provides examples of how pipeline metadata stored with the CMF tracking layer can be queried with the CMF query layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "d8cecabd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from cmflib import cmfquery"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ae54008",
+ "metadata": {},
+ "source": [
+ "### Initialize the library and get all the stages in the pipeline\n",
+    "Point the library to the metadata file.\n",
+    "The `get_pipeline_stages` call returns the different stages in the pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "8b735117",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Prepare', 'Featurize', 'Train', 'Evaluate']"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "query = cmfquery.CmfQuery(\"./mlmd\")\n",
+ "pipelines = query.get_pipeline_names()\n",
+ "stages = query.get_pipeline_stages(pipelines[0])\n",
+ "display(stages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ac1591f",
+ "metadata": {},
+ "source": [
+ "### Query the Executions in each stage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "da0ee66d",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Prepare\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 2 Prepare \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/parse.py', 'artifacts/data.xml.gz', 'artifacts/parsed'] \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 1 \n",
+ "\n",
+ " seed split \n",
+ "0 20170428 0.2 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Featurize\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 3 Featurize-execution \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/featurize.py', 'artifacts/parsed', 'artifacts/features'] \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 2 \n",
+ "\n",
+ " max_features ngrams \n",
+ "0 3000 2 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Train\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 4 Train-execution \n",
+ "\n",
+ " Execution Git_End_Commit \\\n",
+ "0 ['src/train.py', 'artifacts/features', 'artifacts/model'] \n",
+ "\n",
+ " Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 3 \n",
+ "\n",
+ " min_split n_est seed \n",
+ "0 64 100 20170428 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Evaluate\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 5 Evaluate-execution \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/test.py', 'artifacts/model', 'artifacts/features', 'artifacts/tes... \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for stage in stages:\n",
+ " executions = query.get_all_executions_in_stage(stage)\n",
+ " print(f\"Displaying execution for stage {stage}\")\n",
+ " display(executions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a3599af",
+ "metadata": {},
+ "source": [
+ "### Get all artifacts of an execution. \n",
+    "input parameter - execution_id\n",
+    "output parameter - artifacts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "6fa93876",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying the artifacts for execution with id 1 belonging to Prepare\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "1 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c 1667255774532 \n",
+ "2 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 1667255776391 \n",
+ "\n",
+ " event git_repo id \\\n",
+ "0 INPUT /tmp/cmf/example_get_started/git_remote 1 \n",
+ "1 OUTPUT /tmp/cmf/example_get_started/git_remote 2 \n",
+ "2 OUTPUT /tmp/cmf/example_get_started/git_remote 3 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255778222 \n",
+ "1 1667255774532 \n",
+ "2 1667255776391 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "1 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 Dataset \n",
+ "2 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value \n",
+ "1 22ec7737f442cfc81e8c701fb58d1007 NaN NaN \n",
+ "2 03e3627bda150c8cf51a55ef96ab3ede NaN NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "\n",
+ "executions = query.get_all_executions_in_stage(stages[0])\n",
+ "print(f\"Displaying the artifacts for execution with id {executions.iloc[0]['id']} belonging to {stages[0]}\")\n",
+ "artifacts = query.get_all_artifacts_for_execution(executions.iloc[0][\"id\"])\n",
+ "display(artifacts)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d15b7386",
+ "metadata": {},
+ "source": [
+ "### Get all executions for an artifact (pass the artifact full name as the input parameter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "f1632d60",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " Type execution_id execution_name pipeline stage\n",
+ "0 INPUT 1 Test-env Prepare"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#Provide the artifact in name:hash format\n",
+ "artifacts = query.get_all_artifacts_for_execution(executions.iloc[0]['id'])\n",
+ "for index, art in artifacts.iterrows():\n",
+ " if art[\"event\"] == \"INPUT\":\n",
+ " artifact_name = art[\"name\"]\n",
+ " break\n",
+ "linked = query.get_all_executions_for_artifact(artifact_name)\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7ad864e3",
+ "metadata": {},
+ "source": [
+ "### Get all the parent artifacts of an artifact. Provides the artifact lineage chain"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "09652709",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parent artifact of artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c 1667255774532 \n",
+ "1 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 1667255776391 \n",
+ "2 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "\n",
+ " git_repo id last_update_time_since_epoch \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote 2 1667255774532 \n",
+ "1 /tmp/cmf/example_get_started/git_remote 3 1667255776391 \n",
+ "2 /tmp/cmf/example_get_started/git_remote 1 1667255778222 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 Dataset \n",
+ "1 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede Dataset \n",
+ "2 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 22ec7737f442cfc81e8c701fb58d1007 NaN NaN \n",
+ "1 03e3627bda150c8cf51a55ef96ab3ede NaN NaN \n",
+ "2 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "artifacts = query.get_all_artifacts_for_execution(2)\n",
+ "for index, art in artifacts.iterrows():\n",
+ " if art[\"event\"] == \"OUTPUT\":\n",
+ " artifact_name = art[\"name\"]\n",
+ " break\n",
+ "print(f\"Parent artifact of {artifact_name}\")\n",
+ "linked = query.get_all_parent_artifacts(artifact_name)\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "63b615f1",
+ "metadata": {},
+ "source": [
+ "### Get all child artifacts of an artifact. It provides the lineage chain of successors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "57b85ea6",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Child artifact of artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "1 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch id last_update_time_since_epoch \\\n",
+ "0 1667255811813 6 1667255811813 \n",
+ "1 1667255818388 7 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN SKlearn \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "1 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "1 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "print(f\"Child artifact of {artifact_name}\")\n",
+ "linked = query.get_all_child_artifacts(artifact_name)\n",
+ "display(linked)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cf2f2e6",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Get all the parent artifacts of an artifact. Provides the artifact lineage chain of predecessors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "493bd571",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "\n",
+ " git_repo id last_update_time_since_epoch \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote 1 1667255778222 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "\n",
+ "linked = query.get_all_parent_artifacts(linked.iloc[0][\"name\"])\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "4eb35ec5",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c NaN \n",
+ "1 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 NaN \n",
+ "2 commit 4546b0679bcae18bd85893c69581db91da40495c NaN \n",
+ "3 commit d67bedaa20e64e45fe9f553935d9ff0726f19b59 NaN \n",
+ "4 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "5 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch git_repo id \\\n",
+ "0 1667255774532 /tmp/cmf/example_get_started/git_remote 2 \n",
+ "1 1667255776391 /tmp/cmf/example_get_started/git_remote 3 \n",
+ "2 1667255800206 /tmp/cmf/example_get_started/git_remote 4 \n",
+ "3 1667255802382 /tmp/cmf/example_get_started/git_remote 5 \n",
+ "4 1667255811813 NaN 6 \n",
+ "5 1667255818388 NaN 7 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255774532 \n",
+ "1 1667255776391 \n",
+ "2 1667255800206 \n",
+ "3 1667255802382 \n",
+ "4 1667255811813 \n",
+ "5 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN SKlearn \n",
+ "5 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 RandomForestClassifier:default RandomForestClassifier \n",
+ "5 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 NaN \n",
+ "1 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede NaN \n",
+ "2 artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725 NaN \n",
+ "3 artifacts/features/test.pkl:b1f98b4ebd09a0bdc72f1a8c102065dd NaN \n",
+ "4 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "5 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Dataset 22ec7737f442cfc81e8c701fb58d1007 \n",
+ "1 Dataset 03e3627bda150c8cf51a55ef96ab3ede \n",
+ "2 Dataset 5de5e987eadb4b86fc47604b59cb3725 \n",
+ "3 Dataset b1f98b4ebd09a0bdc72f1a8c102065dd \n",
+ "4 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "5 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "linked = query.get_all_child_artifacts(linked.iloc[0][\"name\"])\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29060ba9",
+ "metadata": {},
+ "source": [
+ "### Get immediate child artifacts of an artifact. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "93bd401d",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a 1667255811813 \n",
+ "\n",
+ " id last_update_time_since_epoch model_framework \\\n",
+ "0 6 1667255811813 SKlearn \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 Model \n",
+ "\n",
+ " uri \n",
+ "0 5f6e4aa57cce9e3a0b2f12e5766d19be "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "linked = query.get_one_hop_child_artifacts(artifact_name)\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "529817a7",
+ "metadata": {},
+ "source": [
+ "### Get all child artifacts "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "496ee2bc",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "1 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch id last_update_time_since_epoch \\\n",
+ "0 1667255811813 6 1667255811813 \n",
+ "1 1667255818388 7 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN SKlearn \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "1 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "1 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "linked = query.get_all_child_artifacts(artifact_name)\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "1ae1d868",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Provide Execution id corresponding to a Evaluate stage\n",
+ "linked = query.get_all_artifacts_for_execution(4)\n",
+ "for index, row in linked.iterrows():\n",
+ " if row[\"type\"] == \"Metrics\":\n",
+ " break\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae3109b7",
+ "metadata": {},
+ "source": [
+ "### Get artifact "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "8bf8a41d",
+ "metadata": {
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " id type uri \\\n",
+ "0 7 Metrics 878d492e-596c-11ed-99a3-b47af137252e \n",
+ "\n",
+ " name create_time_since_epoch \\\n",
+ "0 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 1667255818388 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255818388 \n",
+ "\n",
+ " metrics_name avg_prec roc_auc \n",
+ "0 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.526754 0.959238 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "artifact = query.get_artifact(row[\"name\"])\n",
+ "display(artifact)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/README.md b/README.md
index 793a4366..428d939c 100644
--- a/README.md
+++ b/README.md
@@ -1,134 +1,113 @@
-# Common Metadata Framework(CMF)
-CMF library helps to collect and store information associated with ML pipelines.
-It provides api's to record and query the metadata associated with ML pipelines.
-The framework adopts a data first approach and all artifacts recorded in the framework are versioned and identified by the content hash.
-[Detailed documentation of the API's](docs/API.md)
-
-# Getting Started
-### Install the library
-
-#### Creating the wheel file
-
-```
-python setup.py bdist_wheel
-cd dist
-pip install
-
-```
-or Install directly,
-pip install .
-
-[Quick start](example-get-started/README.md)
-### Pre-Requisite
-1. Python Version - Needs Python version >=3.6 and <3.9 (Not compatible with Python3.9)
-2. Set Environment variables.
- Library uses DVC for artifact versioning and git for code versioning. It requires a set of environment variables to operate DVC and git. The list of environment variables needed can be found in example-get-started/sample_env.
- Copy sample_env from example-get-started directory to local directory.
- Modify sample_env and run 'source sample_env' to set the environment variables.
-3. Copy initialize.sh from example-get-started folder to your local directory.
- Run command - sh initialize.sh
- Before running the script, please ensure that required environment variables are set.
- This configures DVC and git with the provided variables in Step 1.
-
-
-## Logging metadata with CMF
-#### Import of the library to record the metadata
-```python
-from cmflib import cmf
-
-```
-
-### Create the metadata writer
-The metadata writer is responsible to manage the backend to record the metadata.
-It also creates a pipeline abstraction, which helps to group individual stages and execution.
-```python
-cmf = cmf.Cmf(filename="mlmd",
- pipeline_name="Test-env")
-
-```
-### Create the stage in pipeline.
-An ML pipeline can have multiple stages. This context abstraction tracks the stage and its metadata.
-A dictionary can be passed to hold the user given metadata. The custom properties is an optional argument
- ```python
-context = cmf.create_context(pipeline_stage="Prepare",
- custom_properties={"user-metadata1":"metadata_value"})
-```
-
-#### Create the execution
-A stage in ML pipeline can have multiple executions. Every run is marked as an execution.
-This API helps to track the metadata associated with the execution
-```python
-execution = cmf.create_execution(execution_type="Prepare-1", custom_properties = {"user-metadata1":"metadata_value"})
-```
-#### Log artifacts
-An Execution could have multiple artifacts associated with it as Input or Output. The path of the artifact provided should be path from root of the repo.
-The metadata associated with the artifact could be logged as an optional argument which takes in a dictionary
-```python
-cmf.log_dataset(input, "input", custom_properties={"user-metadata1":"metadata_value"})
-cmf.log_dataset(output_train, "output", custom_properties={"user-metadata1":"metadata_value"})
-cmf.log_dataset(output_test, "output", custom_properties={"user-metadata1":"metadata_value"})
-```
-#### Log model
-A model developed as part of training step or used in a evaluation or inference step can be logged. It can be input or output
-The metadata associated with the artifact could be logged as an optional argument which takes in a dictionary
-```python
-cmf.log_model(path="model.pkl", event="output", model_framework="SKlearn", model_type="RandomForestClassifier", model_name="RandomForestClassifier:default" )
-cmf.log_model(path="model.pkl", event="input", model_framework="SKlearn", model_type="RandomForestClassifier", model_name="RandomForestClassifier:default" )
-```
-#### Log metrics
-Metrics of each step can be logged
-The metadata associated with the artifact could be logged as argument which takes in a dictionary
-```python
-#Can be called at every epoch or every step in the training. This is logged to a parquet file and commited at the commit stage.
-while True: #Inside training loop
- cmf.log_metric("training_metrics", {"loss":loss})
-cmf.commit_metrics("training_metrics")
-```
-#### Log Stage metrics
-Metrics for each stage.
-```python
-cmf.log_execution_metrics("metrics", {"avg_prec":avg_prec, "roc_auc":roc_auc})
-```
-#### Creating the dataslices
-This helps to track a subset of the data. For eg- Accuracy of the model for a slice of data(gender, ethnicity etc)
-```python
-dataslice = cmf.create_dataslice("slice-a")
-for i in range(1,20,1):
- j = random.randrange(100)
- dataslice.add_data("data/raw_data/"+str(j)+".xml")
-dataslice.commit()
-```
-### To use Graph Layer
-CMF library has an optional Graph layer which stores the relationships in a Graph Database(NEO4J)
-To use the graph layer, the "graph" parameter in the library init call should be set to true. This is set as false by default.
-The library reads the configuration parameters of the Database from the environment variables.
-The variables "NEO4J_URI", "NEO4J_USER_NAME", "NEO4J_PASSWD" should be either set as environment variables.
-
-```
- export NEO4J_URI="bolt://10.93.244.219:7687"
- export NEO4J_USER_NAME=neo4j
- export NEO4J_PASSWD=neo4j
+# cmf
+## Common Metadata Framework
+[Getting Started](docs/index.md)
+[Detailed documentation of the APIs](https://hewlettpackard.github.io/cmf/api/public/cmf)
+
+Interactions in data pipelines can be complex. The different stages in the pipeline (which may not be adjacent to each other) may have to interact to produce or transform artifacts. As artifacts navigate and undergo transformations through the pipeline, they can take a complicated path, which might also involve bidirectional movement across stages. There could also be dependencies between stages, where the metrics produced by one stage influence the metrics of a subsequent stage. Tracking metadata across a pipeline is therefore important to provide features such as lineage tracking, provenance and reproducibility.
+
+Tracking metadata through these complex pipelines poses multiple challenges, some of them being:
+
+- Each stage in the pipeline could be executed in a different datacenter or at an edge site with only intermittent connectivity to the core datacenter.
+
+- Each stage in the pipeline could be managed by a different team.
+
+- The artifacts (input or output) need to be uniquely identified across different sites and across multiple pipelines.
+
+
+The Common Metadata Framework (CMF) addresses the problems associated with tracking pipeline metadata from distributed sites, and tracks code, data and metadata together for end-to-end traceability.
+
+The framework automatically tracks the code version as part of the execution metadata. Data artifacts are also versioned automatically using a data versioning framework (such as DVC), and the metadata regarding the data version is stored along with the code. The framework records the Git commit id of the metadata file associated with the artifact and the content hash of the artifact as metadata. It also provides APIs to track the hyperparameters and other metadata of pipelines. From the stored metadata, users can therefore zero in on the hyperparameters, code version and artifact version used for an experiment.
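+
+As a minimal sketch of the logging API (the file paths, property values and the `metawriter` variable name below are placeholders), metadata for a stage and its artifacts can be recorded roughly as follows:
+
+```python
+from cmflib import cmf
+
+# Create the metadata writer; metadata is stored in the "mlmd" sqlite file for the named pipeline.
+metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env")
+
+# Create a stage (context) and an execution within that stage.
+context = metawriter.create_context(pipeline_stage="Prepare",
+                                    custom_properties={"user-metadata1": "metadata_value"})
+execution = metawriter.create_execution(execution_type="Prepare-1",
+                                        custom_properties={"user-metadata1": "metadata_value"})
+
+# Log the artifacts consumed and produced by the execution (paths are relative to the repo root).
+metawriter.log_dataset("artifacts/data.xml.gz", "input",
+                       custom_properties={"user-metadata1": "metadata_value"})
+metawriter.log_dataset("artifacts/parsed/train.tsv", "output")
+metawriter.log_model(path="model.pkl", event="output", model_framework="SKlearn",
+                     model_type="RandomForestClassifier",
+                     model_name="RandomForestClassifier:default")
+```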
+
+Identifying artifacts by content hash allows the framework to uniquely identify an artifact anywhere across the distributed sites. This enables metadata from the distributed sites to be precisely merged into a central repository, providing a single global view of the metadata from the distributed sites.
+
+On this backbone, we build a Git-like experience for metadata, enabling users to push their local metadata to the remote repository, where it is merged to create the global metadata, and to pull from the global metadata into the local repository to create a local view containing only the metadata of interest.
+
+The framework can be used to track various types of pipelines such as data pipelines or AI pipelines.
+
+
+
+
+
-'''
-Create the metadata writer
-The metadata writer is responsible to manage the backend to record the metadata.
-It also creates a pipeline abstraction, which helps to group individual stages and execution.
-'''
-cmf = cmf.Cmf(filename="mlmd",
- pipeline_name="Test-env", graph=True)
-
-```
-
-### Use a Jupyterlab environment with CMF pre-installed
-- CMF is preinstalled in a JupyterLab Notebook Environment.
-- Accessible at http://[HOST.IP.AD.DR]:8888 (default token: `docker`)
-- Within the Jupyterlab environment, a startup script switches context to `$USER:$GROUP` as specified in `.env`
-- `example-get-started` from this repo is bind mounted into `/home/jovyan/example-get-started`
-- Update `docker-compose.yml` as needed. For example to bind mount another volume on the host: `/lustre/data/dataspaces/dataspaces_testbed/:/home/jovyan/work`
-
-```
-#create .env file in current folder using env-example as a template. #These are used by docker-compose.yml
-docker-compose up --build -d
-#To Shutdown/Remove (Remove Volumes as well)
-docker-compose down -v
-```
+
+### The Common Metadata Framework (CMF) has the following components:
+
+CMF metadata library - Exposes APIs to track pipeline metadata. It also provides APIs to query the stored metadata.
+
+CMF local client - Interacts with the server to pull metadata from, or push metadata to, the remote store.
+
+CMF central server - Interacts with all the remote clients and is responsible for merging the metadata transferred by the remote clients and managing the consolidated metadata.
+
+CMF central repositories - Host the code, data and metadata.
+
+
+
+
+### Metadata library
+
+The APIs and abstractions provided by the library enable tracking of pipeline metadata. The library tracks the stages in the pipeline, the input and output artifacts of each stage, and metrics. Metrics can be tracked at both coarse and fine-grained intervals: stage metrics are captured at the end of a stage, while fine-grained metrics are tracked per step (epoch) or at regular intervals during the execution of a stage.
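+
+Continuing the logging sketch from the overview above (metric values here are placeholders), fine-grained metrics are logged per step and committed once, while a stage metric is logged once for the execution:
+
+```python
+# Fine-grained metrics: log at every step/epoch; values are buffered and committed in one go.
+for loss in [0.9, 0.5, 0.2]:  # placeholder training loop
+    metawriter.log_metric("training_metrics", {"loss": loss})
+metawriter.commit_metrics("training_metrics")
+
+# Coarse-grained (stage) metrics: logged once at the end of the stage.
+metawriter.log_execution_metrics("metrics", {"avg_prec": 0.52, "roc_auc": 0.95})
+```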
+
+The metadata logged through the APIs is written to a backend relational database. The library also provides APIs to query the metadata stored in the relational database, allowing users to inspect their pipelines.
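+
+For example (a sketch using the bundled `cmfquery` module; the "./mlmd" path is a placeholder for the metadata file produced by the logging APIs):
+
+```python
+from cmflib import cmfquery
+
+# Point the query layer at the metadata store and walk pipelines -> stages -> executions -> artifacts.
+query = cmfquery.CmfQuery("./mlmd")
+pipelines = query.get_pipeline_names()                     # e.g. ['Test-env']
+stages = query.get_pipeline_stages(pipelines[0])           # e.g. ['Prepare', 'Featurize', 'Train', 'Evaluate']
+executions = query.get_all_executions_in_stage(stages[0])  # pandas DataFrame of executions
+artifacts = query.get_all_artifacts_for_execution(executions.iloc[0]["id"])
+```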
+
+In addition to explicit tracking through the APIs, the library also provides implicit tracking. Implicit tracking automatically records the software version used in the pipelines. Function arguments and return values can be tracked automatically by adding the metadata tracker class decorators to the functions.
+
+Before the metadata is written to the relational database, the metadata operations are journaled in the metadata journal log. This enables the framework to transfer the local metadata to the central server.
+
+All artifacts are versioned with a data versioning framework (e.g., DVC). The content hash of each artifact is generated and stored along with the user-provided metadata. A special artifact metadata file, the ".dvc" file, is created for every artifact (file or folder) added to the data version management system. The .dvc file contains the content hash of the artifact.
+
+For every new execution, the metadata tracker creates a new branch to track the code. The ".dvc" metadata file created for each artifact is also committed to Git, and its commit id is tracked as metadata. Artifacts are versioned through the versioning of their metadata files: whenever an artifact changes, its metadata file is modified to reflect the current content hash, and the file is committed as a new version of the metadata file.
+
+The metadata tracker automatically records the start commit when the library was initialized and creates a separate commit for each change to an artifact during the experiment. This helps track the transformations on the artifacts across the different stages in the pipeline.
+
+For more details, see the documentation [here](docs/README.md).
+
+### Metadata client
+
+The metadata client interacts with the metadata server and communicates with it to synchronize metadata.
+
+After the experiment is completed, the user invokes the “Cmf push” command to push the collected metadata to the remote. This transfers the existing metadata journal to the server.
+
+Metadata from the central repository can be pulled to the local repository using the artifacts, the project, or both as the identifier.
+
+When an artifact is used as the identifier, all metadata associated with the artifacts currently present in the branch of the cloned Git repository is pulled from the central repository to the local repository. The pulled metadata consists not only of the immediate metadata associated with the artifacts, but also of the metadata of all artifacts in their chain of lineage.
+
+When the project is used as the identifier, all metadata associated with the currently checked-out branch of the pipeline code is pulled to the local repository.
+
+### Central server
+
+The central server exposes REST APIs that can be called from the remote clients. This helps in situations where the connectivity between the core datacenter and the remote client is robust. The remote client calls the APIs exposed by the central server to log the metadata directly to the central metadata repository.
+
+Where the connectivity with the central server is intermittent, the remote clients log the metadata to the local repository. The journaled metadata is then pushed by the remote client to the central server. The central server replays the journal and merges the incoming metadata with the metadata already in the central repository. The ability to accurately identify artifacts anywhere by their content hash makes this merge robust.
+
+### Central Repositories
+
+The common metadata framework consists of three central repositories: for the code, the data and the metadata.
+
+#### Central Metadata repository
+
+The central metadata repository holds the metadata pushed from the distributed sites. It holds metadata about all the pipelines that were tracked using the common metadata tracker. The consolidated view of the metadata stored in the central repository helps users learn across the various pipeline stages executed at different locations. Using a query layer pointed at the central repository, users get a global view of the metadata, which provides a deeper understanding of the pipelines and their metadata. The metadata helps to understand non-obvious results, such as the performance of one dataset with respect to other datasets, or the performance of a particular pipeline with respect to other pipelines.
+
+#### Central Artifact storage repository
+
+The central artifact storage repository stores all the artifacts related to an experiment. The data versioning framework (DVC) stores the artifacts in a content-addressable layout: each artifact is stored in a folder named after the first two characters of its content hash, with the remaining characters of the hash used as the file name. This helps in efficient retrieval of the artifacts.
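+
+For example (an illustrative sketch, not CMF or DVC code), an artifact with content hash `236d9502e0283d91f689d7038b8508a2` ends up under a path derived like this:
+
+```python
+# Split the content hash into a two-character directory name and the remaining file name.
+content_hash = "236d9502e0283d91f689d7038b8508a2"
+cache_path = f"{content_hash[:2]}/{content_hash[2:]}"
+print(cache_path)  # -> 23/6d9502e0283d91f689d7038b8508a2
+```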
+
+#### Git Repository
+
+The Git repository is used to track the code. Along with the code, the artifact metadata files containing the content hashes of the artifacts are also stored in Git. The data versioning framework (DVC) uses these files to retrieve the artifacts from the artifact storage repository.
+
+### Advantages
+
+1. Tracking of metadata for distributed pipelines, thereby enabling efficient pipeline management.
+
+2. Enables tracking of code, data and metadata in a single framework.
+
+3. Provides Git-like ease of management for metadata.
+
+4. Provides collaboration across teams.
+
+### Talks and Papers
+1. Monterey data conference 2022
+
+### Community
+
+ Slack: Common Metadata Framework
diff --git a/Requirements.txt b/Requirements.txt
index 35adf4af..979136db 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1,9 +1,9 @@
-ml-metadata==1.3.0
+ml-metadata==1.11.0
dvc
pandas
retrying
pyarrow
neo4j
-sklearn
+scikit-learn
tabulate
click
diff --git a/cmflib/cmf.py b/cmflib/cmf.py
index e9941c29..e4a62d06 100644
--- a/cmflib/cmf.py
+++ b/cmflib/cmf.py
@@ -1,3 +1,4 @@
+"""This module contains all the public API for CMF"""
###
# Copyright (2022) Hewlett Packard Enterprise Development LP
#
@@ -15,34 +16,74 @@
###
import time
-import pandas as pd
import uuid
import re
import os
-import dvc
import sys
+import pandas as pd
import typing as t
+# This import is needed for the JupyterLab environment
+import dvc
from ml_metadata.proto import metadata_store_pb2 as mlpb, metadata_store_pb2
from ml_metadata.metadata_store import metadata_store
-from cmflib.dvc_wrapper import dvc_get_url, dvc_get_hash, git_get_commit, commit_output, git_get_repo, commit_dvc_lock_file
-import cmflib.graph_wrapper as graph_wrapper
-from cmflib.metadata_helper import get_or_create_parent_context, get_or_create_run_context, \
- associate_child_to_parent_context, create_new_execution_in_existing_run_context, link_execution_to_artifact, \
- create_new_artifact_event_and_attribution, get_artifacts_by_id, put_artifact, link_execution_to_input_artifact
+from cmflib.dvc_wrapper import dvc_get_url, dvc_get_hash, git_get_commit, \
+ commit_output, git_get_repo, commit_dvc_lock_file, \
+ git_checkout_new_branch, \
+ check_git_repo, check_default_remote, check_git_remote
+from cmflib import graph_wrapper
+from cmflib.metadata_helper import get_or_create_parent_context, \
+ get_or_create_run_context, associate_child_to_parent_context, \
+ create_new_execution_in_existing_run_context, link_execution_to_artifact, \
+ create_new_artifact_event_and_attribution, get_artifacts_by_id, \
+ put_artifact, link_execution_to_input_artifact
+
+
+class Cmf:
+ """This class provides methods to log metadata for distributed AI pipelines.
+
+    The class instance creates an ML metadata store to store the metadata. It creates a driver to store nodes and their
+    relationships in Neo4j. The user has to provide the name of the pipeline that needs to be recorded.
+
+ ```python
+ cmflib.cmf.Cmf(
+ filename="mlmd",
+ pipeline_name="test_pipeline",
+ custom_properties={"owner": "user_a"},
+ graph=False
+ )
+ ```
+
+ Args:
+ filename: Path to the sqlite file to store the metadata
+        pipeline_name: Name to uniquely identify the pipeline. Note that the name is the unique identifier for a
+            pipeline. If a pipeline already exists with the same name, the existing pipeline object is reused.
+        custom_properties: Additional properties of the pipeline that need to be stored.
+        graph: If set to true, the library also stores the relationships in the provided graph database. The following
+ environment variables should be set: `NEO4J_URI` (graph server URI), `NEO4J_USER_NAME` (user name) and
+ `NEO4J_PASSWD` (user password), e.g.:
+ ```bash
+ export NEO4J_URI="bolt://ip:port"
+ export NEO4J_USER_NAME=neo4j
+ export NEO4J_PASSWD=neo4j
+ ```
+ """
+ # pylint: disable=too-many-instance-attributes
-class Cmf(object):
__neo4j_uri = os.getenv('NEO4J_URI', "")
__neo4j_user = os.getenv('NEO4J_USER_NAME', "")
__neo4j_password = os.getenv('NEO4J_PASSWD', "")
def __init__(self, filename: str = "mlmd",
- pipeline_name="", custom_properties=None, graph: bool = False):
+ pipeline_name: str = "", custom_properties: t.Optional[t.Dict] = None,
+ graph: bool = False):
+ Cmf.__prechecks()
if custom_properties is None:
custom_properties = {}
- config = metadata_store_pb2.ConnectionConfig()
+ config = mlpb.ConnectionConfig()
config.sqlite.filename_uri = filename
self.store = metadata_store.MetadataStore(config)
+ self.filename = filename
self.child_context = None
self.execution = None
self.execution_name = ""
@@ -51,25 +92,114 @@ def __init__(self, filename: str = "mlmd",
self.input_artifacts = []
self.execution_label_props = {}
self.graph = graph
- self.parent_context = get_or_create_parent_context(store=self.store, pipeline=pipeline_name,
- custom_properties=custom_properties)
- if graph:
- self.driver = graph_wrapper.GraphDriver(Cmf.__neo4j_uri, Cmf.__neo4j_user, Cmf.__neo4j_password)
- self.driver.create_pipeline_node(pipeline_name, self.parent_context.id, custom_properties)
+ self.branch_name = filename.rsplit('/',1)[-1]
+
+ git_checkout_new_branch(self.branch_name)
+ self.parent_context = get_or_create_parent_context(
+ store=self.store, pipeline=pipeline_name, custom_properties=custom_properties)
+ if graph is True:
+ self.driver = graph_wrapper.GraphDriver(
+ Cmf.__neo4j_uri, Cmf.__neo4j_user, Cmf.__neo4j_password)
+ self.driver.create_pipeline_node(
+ pipeline_name, self.parent_context.id, custom_properties)
+
+ @staticmethod
+ def __prechecks():
+ """Pre checks for cmf
+
+        1. The directory needs to be a git repository and
+           a git remote should be set.
+        2. The directory needs to be a dvc repository and
+           a default dvc remote should be set.
+ """
+ Cmf.__check_git_init()
+ Cmf.__check_default_remote()
+ Cmf.__check_git_remote()
+
+ @staticmethod
+ def __check_git_remote():
+ """Executes precheck for git remote"""
+ if not check_git_remote():
+ print("*** Error git remote not set ***\n"
+ "*** Run the command "
+ "`git remote add origin ` ***\n"
+ " or \n"
+ " After Updating the sample_env file,"
+ " run `source sample_env`\n"
+ " Then run 'sh initialize.sh'")
+ sys.exit(1)
+
+ @staticmethod
+ def __check_default_remote():
+ """Executes precheck for default dvc remote"""
+ if not check_default_remote():
+ print("*** DVC not configured correctly***\n"
+ "Initialize dvc and add a default dvc remote\n"
+ "Run commands\n"
+ "dvc init\n"
+ "dvc remote add -d \n")
+ sys.exit(1)
+
+ @staticmethod
+ def __check_git_init():
+ """Verifies that the directory is a git repo"""
+ if not check_git_repo():
+ print("*** Not a git repo, Please do the following ***\n"
+ " Initialize git\n"
+ " Initialize dvc and add a default dvc remote\n"
+ " or \n"
+ " After Updating the sample_env file,"
+ " run `source sample_env`\n"
+ " Then run 'sh initialize.sh'")
+ sys.exit(1)
def __del__(self):
- if self.graph:
+ """Destructor - Cleans up the connection to neo4j"""
+ # if self.execution is not None:
+ # commit_output(self.filename, self.execution.id)
+ if hasattr(self, 'driver'):
self.driver.close()
- def create_context(self, pipeline_stage: str, custom_properties: {} = None) -> mlpb.Context:
+ def create_context(self, pipeline_stage: str,
+ custom_properties: t.Optional[t.Dict] = None) -> mlpb.Context:
+ """Creates a stage in the pipeline.
+
+ If it already exists, it is reused and not created again.
+
+ Example:
+ ```python
+ # Import CMF
+ from cmflib.cmf import Cmf
+ from ml_metadata.proto import metadata_store_pb2 as mlpb
+
+ # Create CMF logger
+ cmf = Cmf(filename="mlmd", pipeline_name="test_pipeline")
+
+ # Create or reuse context for this stage
+ context: mlmd.proto.Context = cmf.create_context(
+ pipeline_stage="prepare",
+ custom_properties ={"user-metadata1": "metadata_value"}
+ )
+ ```
+
+ Args:
+ pipeline_stage: Name of the pipeline stage.
+ custom_properties: Developers can provide key value pairs with additional properties of the stage that
+ need to be stored.
+ Returns:
+ Context object from ML Metadata library associated with the new stage.
+ """
custom_props = {} if custom_properties is None else custom_properties
- pipeline_stage = pipeline_stage
- ctx = get_or_create_run_context(self.store, pipeline_stage, custom_props)
+ ctx = get_or_create_run_context(
+ self.store, pipeline_stage, custom_props)
self.child_context = ctx
- associate_child_to_parent_context(store=self.store, parent_context=self.parent_context,
- child_context=ctx)
+ associate_child_to_parent_context(
+ store=self.store,
+ parent_context=self.parent_context,
+ child_context=ctx)
if self.graph:
- self.driver.create_stage_node(pipeline_stage, self.parent_context, ctx.id, custom_props)
+ self.driver.create_stage_node(
+ pipeline_stage, self.parent_context, ctx.id, custom_props)
return ctx
def create_execution(self, execution_type: str,
@@ -142,34 +272,89 @@ def create_execution(self, execution_type: str,
, self.execution.id, custom_props)
return self.execution
- def update_execution(self, execution_id: int):
+ def update_execution(self, execution_id: int, custom_properties: t.Optional[t.Dict] = None):
+ """Updates an existing execution.
+
+        The custom properties can be updated after the creation of the execution.
+        The new custom properties are merged with the earlier custom properties.
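+
+        Example (a minimal sketch; the execution id and property below are hypothetical values):
+        ```python
+        cmf.update_execution(
+            execution_id=1,  # hypothetical id of an existing execution
+            custom_properties={"owner": "user_a"}
+        )
+        ```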
+ """
self.execution = self.store.get_executions_by_id([execution_id])[0]
if self.execution is None:
- print("Error no execution id")
- exit()
- execution_type = self.store.get_execution_types_by_id([self.execution.type_id])[0]
-
- self.execution_name = str(self.execution.id) + "," + execution_type.name
+ print("Error - no execution id")
+ sys.exit(1)
+ execution_type = self.store.get_execution_types_by_id(
+ [self.execution.type_id])[0]
+
+ if custom_properties:
+ for key, value in custom_properties.items():
+ if isinstance(value, int):
+ self.execution.custom_properties[key].int_value = value
+ else:
+ self.execution.custom_properties[key].string_value = str(
+ value)
+ self.store.put_executions([self.execution])
+ c_props = {}
+ for k, v in self.execution.custom_properties.items():
+ key = re.sub('-', '_', k)
+ val_type = str(v).split(':', maxsplit=1)[0]
+ if val_type == "string_value":
+ val = self.execution.custom_properties[k].string_value
+ else:
+ val = str(v).split(':')[1]
+            # The property values are stored in the format type:value, hence
+            # taking only the value.
+ self.execution_label_props[key] = val
+ c_props[key] = val
+ self.execution_name = str(
+ self.execution.id) + "," + execution_type.name
self.execution_command = self.execution.properties["Execution"]
- self.execution_label_props["Execution_Name"] = execution_type.name + ":" + str(self.execution.id)
+ self.execution_label_props["Execution_Name"] = execution_type.name + \
+ ":" + str(self.execution.id)
self.execution_label_props["execution_command"] = self.execution.properties["Execution"].string_value
if self.graph:
- self.driver.create_execution_node(self.execution_name, self.child_context.id, self.parent_context,
- self.execution.properties["Execution"].string_value, self.execution.id, {})
+ self.driver.create_execution_node(
+ self.execution_name,
+ self.child_context.id,
+ self.parent_context,
+ self.execution.properties["Execution"].string_value,
+ self.execution.id,
+ c_props)
return self.execution
-
- def log_dvc_lock(self, file_path:str):
+
+ def log_dvc_lock(self, file_path: str):
+ """Used to update the dvc lock file created with dvc run command."""
return commit_dvc_lock_file(file_path, self.execution.id)
- def log_dataset(self, url: str, event: str, custom_properties: {} = None) -> mlpb.Artifact:
+ def log_dataset(self, url: str, event: str, custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact:
+ """Logs a dataset as artifact.
+
+        This call adds the dataset to dvc. The created dvc metadata file (.dvc) is added to git and committed. The
+        version of the dataset is automatically obtained from the versioning software (DVC) and tracked as metadata.
+
+ Example:
+ ```python
+ artifact: mlmd.proto.Artifact = cmf.log_dataset(
+ url="/repo/data.xml",
+ event="input",
+ custom_properties={"source":"kaggle"}
+ )
+ ```
+
+ Args:
+ url: The path to the dataset.
+            event: Takes the value `INPUT` or `OUTPUT`.
+ custom_properties: Dataset properties (key/value pairs).
+
+ Returns:
+ Artifact object from ML Metadata library associated with the new dataset artifact.
+ """
custom_props = {} if custom_properties is None else custom_properties
git_repo = git_get_repo()
name = re.split('/', url)[-1]
- event_type = metadata_store_pb2.Event.Type.OUTPUT
+ event_type = mlpb.Event.Type.OUTPUT
existing_artifact = []
- c_hash = ""
if event.lower() == "input":
- event_type = metadata_store_pb2.Event.Type.INPUT
+ event_type = mlpb.Event.Type.INPUT
dataset_commit = commit_output(url, self.execution.id)
c_hash = dvc_get_hash(url)
@@ -182,68 +367,94 @@ def log_dataset(self, url: str, event: str, custom_properties: {} = None) -> mlp
if existing_artifact and len(existing_artifact) != 0:
existing_artifact = existing_artifact[0]
- #Quick fix- Updating only the name
+ # Quick fix- Updating only the name
if custom_properties is not None:
- self.update_existing_artifact(existing_artifact, custom_properties)
+ self.update_existing_artifact(
+ existing_artifact, custom_properties)
uri = c_hash
- artifact = link_execution_to_artifact(store=self.store,
- execution_id=self.execution.id,
- uri=uri,
- input_name=url,
- event_type=event_type)
+ artifact = link_execution_to_artifact(
+ store=self.store,
+ execution_id=self.execution.id,
+ uri=uri,
+ input_name=url,
+ event_type=event_type)
else:
# if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""):
# url = url + ":" + str(self.execution.id)
uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1())
- artifact = create_new_artifact_event_and_attribution \
- (store=self.store,
- execution_id=self.execution.id,
- context_id=self.child_context.id,
- uri=uri,
- name=url,
- type_name="Dataset",
- event_type=event_type,
- properties={"git_repo": str(git_repo), "Commit": str(dataset_commit)},
- artifact_type_properties={"git_repo": metadata_store_pb2.STRING,
- "Commit": metadata_store_pb2.STRING
- },
- custom_properties=custom_props,
- milliseconds_since_epoch=int(time.time() * 1000),
- )
+ artifact = create_new_artifact_event_and_attribution(
+ store=self.store,
+ execution_id=self.execution.id,
+ context_id=self.child_context.id,
+ uri=uri,
+ name=url,
+ type_name="Dataset",
+ event_type=event_type,
+ properties={
+ "git_repo": str(git_repo),
+ "Commit": str(dataset_commit)},
+ artifact_type_properties={
+ "git_repo": mlpb.STRING,
+ "Commit": mlpb.STRING},
+ custom_properties=custom_props,
+ milliseconds_since_epoch=int(
+ time.time() * 1000),
+ )
custom_props["git_repo"] = git_repo
custom_props["Commit"] = dataset_commit
self.execution_label_props["git_repo"] = git_repo
self.execution_label_props["Commit"] = dataset_commit
if self.graph:
- self.driver.create_dataset_node(name, url, uri, event, self.execution.id, self.parent_context, custom_props)
+ self.driver.create_dataset_node(
+ name,
+ url,
+ uri,
+ event,
+ self.execution.id,
+ self.parent_context,
+ custom_props)
if event.lower() == "input":
- self.input_artifacts.append({"Name": name, "Path": url, "URI": uri, "Event": event.lower(),
+ self.input_artifacts.append({"Name": name,
+ "Path": url,
+ "URI": uri,
+ "Event": event.lower(),
"Execution_Name": self.execution_name,
- "Type": "Dataset", "Execution_Command": self.execution_command,
+ "Type": "Dataset",
+ "Execution_Command": self.execution_command,
"Pipeline_Id": self.parent_context.id,
"Pipeline_Name": self.parent_context.name})
self.driver.create_execution_links(uri, name, "Dataset")
else:
- child_artifact = {"Name": name, "Path": url, "URI": uri, "Event": event.lower(),
- "Execution_Name": self.execution_name,
- "Type": "Dataset", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id, "Pipeline_Name": self.parent_context.name}
- self.driver.create_artifact_relationships(self.input_artifacts, child_artifact,
- self.execution_label_props)
+ child_artifact = {
+ "Name": name,
+ "Path": url,
+ "URI": uri,
+ "Event": event.lower(),
+ "Execution_Name": self.execution_name,
+ "Type": "Dataset",
+ "Execution_Command": self.execution_command,
+ "Pipeline_Id": self.parent_context.id,
+ "Pipeline_Name": self.parent_context.name}
+ self.driver.create_artifact_relationships(
+ self.input_artifacts, child_artifact, self.execution_label_props)
return artifact
- def log_dataset_with_version(self, url: str, version:str, event: str, custom_properties: {} = None) -> mlpb.Artifact:
+
+ def log_dataset_with_version(self, url: str, version: str, event: str,
+ custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact:
+ """Logs a dataset when the version(hash) is known"""
+
custom_props = {} if custom_properties is None else custom_properties
git_repo = git_get_repo()
name = re.split('/', url)[-1]
- event_type = metadata_store_pb2.Event.Type.OUTPUT
+ event_type = mlpb.Event.Type.OUTPUT
existing_artifact = []
c_hash = version
if event.lower() == "input":
- event_type = metadata_store_pb2.Event.Type.INPUT
+ event_type = mlpb.Event.Type.INPUT
- #dataset_commit = commit_output(url, self.execution.id)
+ # dataset_commit = commit_output(url, self.execution.id)
dataset_commit = version
url = url + ":" + c_hash
@@ -254,88 +465,141 @@ def log_dataset_with_version(self, url: str, version:str, event: str, custom_pr
if existing_artifact and len(existing_artifact) != 0:
existing_artifact = existing_artifact[0]
- #Quick fix- Updating only the name
+ # Quick fix- Updating only the name
if custom_properties is not None:
- self.update_existing_artifact(existing_artifact, custom_properties)
+ self.update_existing_artifact(
+ existing_artifact, custom_properties)
uri = c_hash
- artifact = link_execution_to_artifact(store=self.store,
- execution_id=self.execution.id,
- uri=uri,
- input_name=url,
- event_type=event_type)
+ artifact = link_execution_to_artifact(
+ store=self.store,
+ execution_id=self.execution.id,
+ uri=uri,
+ input_name=url,
+ event_type=event_type)
else:
# if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""):
# url = url + ":" + str(self.execution.id)
uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1())
- artifact = create_new_artifact_event_and_attribution \
- (store=self.store,
- execution_id=self.execution.id,
- context_id=self.child_context.id,
- uri=uri,
- name=url,
- type_name="Dataset",
- event_type=event_type,
- properties={"git_repo": str(git_repo), "Commit": str(dataset_commit)},
- artifact_type_properties={"git_repo": metadata_store_pb2.STRING,
- "Commit": metadata_store_pb2.STRING
- },
- custom_properties=custom_props,
- milliseconds_since_epoch=int(time.time() * 1000),
- )
+ artifact = create_new_artifact_event_and_attribution(
+ store=self.store,
+ execution_id=self.execution.id,
+ context_id=self.child_context.id,
+ uri=uri,
+ name=url,
+ type_name="Dataset",
+ event_type=event_type,
+ properties={
+ "git_repo": str(git_repo),
+ "Commit": str(dataset_commit)},
+ artifact_type_properties={
+ "git_repo": mlpb.STRING,
+ "Commit": mlpb.STRING},
+ custom_properties=custom_props,
+ milliseconds_since_epoch=int(
+ time.time() * 1000),
+ )
custom_props["git_repo"] = git_repo
custom_props["Commit"] = dataset_commit
self.execution_label_props["git_repo"] = git_repo
self.execution_label_props["Commit"] = dataset_commit
if self.graph:
- self.driver.create_dataset_node(name, url, uri, event, self.execution.id, self.parent_context, custom_props)
+ self.driver.create_dataset_node(
+ name,
+ url,
+ uri,
+ event,
+ self.execution.id,
+ self.parent_context,
+ custom_props)
if event.lower() == "input":
- self.input_artifacts.append({"Name": name, "Path": url, "URI": uri, "Event": event.lower(),
+ self.input_artifacts.append({"Name": name,
+ "Path": url,
+ "URI": uri,
+ "Event": event.lower(),
"Execution_Name": self.execution_name,
- "Type": "Dataset", "Execution_Command": self.execution_command,
+ "Type": "Dataset",
+ "Execution_Command": self.execution_command,
"Pipeline_Id": self.parent_context.id,
"Pipeline_Name": self.parent_context.name})
self.driver.create_execution_links(uri, name, "Dataset")
else:
- child_artifact = {"Name": name, "Path": url, "URI": uri, "Event": event.lower(),
- "Execution_Name": self.execution_name,
- "Type": "Dataset", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id, "Pipeline_Name": self.parent_context.name}
- self.driver.create_artifact_relationships(self.input_artifacts, child_artifact,
- self.execution_label_props)
+ child_artifact = {
+ "Name": name,
+ "Path": url,
+ "URI": uri,
+ "Event": event.lower(),
+ "Execution_Name": self.execution_name,
+ "Type": "Dataset",
+ "Execution_Command": self.execution_command,
+ "Pipeline_Id": self.parent_context.id,
+ "Pipeline_Name": self.parent_context.name}
+ self.driver.create_artifact_relationships(
+ self.input_artifacts, child_artifact,
+ self.execution_label_props)
return artifact
# Add the model to dvc do a git commit and store the commit id in MLMD
- def log_model(self, path: str, event: str, model_framework: str, model_type: str, model_name: str,
- custom_properties=None) -> object:
+ def log_model(self, path: str, event: str, model_framework: str = "Default",
+ model_type: str = "Default", model_name: str = "Default",
+ custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact:
+ """Logs a model.
+
+ The model is added to dvc and the metadata file (.dvc) gets committed to git.
+
+ Example:
+ ```python
+ artifact: mlmd.proto.Artifact= cmf.log_model(
+ path="path/to/model.pkl",
+ event="output",
+ model_framework="SKlearn",
+ model_type="RandomForestClassifier",
+ model_name="RandomForestClassifier:default"
+ )
+ ```
+
+ Args:
+ path: Path to the model file.
+            event: Takes the value `INPUT` or `OUTPUT`.
+ model_framework: Framework used to create the model.
+ model_type: Type of model algorithm used.
+ model_name: Name of the algorithm used.
+ custom_properties: The model properties.
+
+ Returns:
+ Artifact object from ML Metadata library associated with the new model artifact.
+ """
if custom_properties is None:
custom_properties = {}
custom_props = {} if custom_properties is None else custom_properties
- name = re.split('/', path)[-1]
- event_type = metadata_store_pb2.Event.Type.OUTPUT
+ # name = re.split('/', path)[-1]
+ event_type = mlpb.Event.Type.OUTPUT
existing_artifact = []
if event.lower() == "input":
- event_type = metadata_store_pb2.Event.Type.INPUT
+ event_type = mlpb.Event.Type.INPUT
model_commit = commit_output(path, self.execution.id)
c_hash = dvc_get_hash(path)
- # If connecting to an existing artifact - The name of the artifact is used as path/steps/key
+ # If connecting to an existing artifact - The name of the artifact is
+ # used as path/steps/key
model_uri = path + ":" + c_hash
- uri = ""
+ # uri = ""
if c_hash and c_hash.strip():
uri = c_hash.strip()
existing_artifact.extend(self.store.get_artifacts_by_uri(uri))
else:
raise RuntimeError("Model commit failed, Model uri empty")
- if existing_artifact and len(existing_artifact) != 0 and event_type == metadata_store_pb2.Event.Type.INPUT:
- artifact = link_execution_to_artifact(store=self.store,
- execution_id=self.execution.id,
- uri=c_hash,
- input_name=model_uri,
- event_type=event_type)
+ if existing_artifact and len(
+ existing_artifact) != 0 and event_type == mlpb.Event.Type.INPUT:
+ artifact = link_execution_to_artifact(
+ store=self.store,
+ execution_id=self.execution.id,
+ uri=c_hash,
+ input_name=model_uri,
+ event_type=event_type)
model_uri = artifact.name
else:
@@ -349,42 +613,77 @@ def log_model(self, path: str, event: str, model_framework: str, model_type: str
name=model_uri,
type_name="Model",
event_type=event_type,
- properties={"model_framework": str(model_framework),
- "model_type": str(model_type),
- "model_name": str(model_name),
- "Commit": str(model_commit)},
- artifact_type_properties={"model_framework": metadata_store_pb2.STRING,
- "model_type": metadata_store_pb2.STRING,
- "model_name": metadata_store_pb2.STRING,
- "Commit": metadata_store_pb2.STRING,
- },
+ properties={
+ "model_framework": str(model_framework),
+ "model_type": str(model_type),
+ "model_name": str(model_name),
+ "Commit": str(model_commit)},
+ artifact_type_properties={
+ "model_framework": mlpb.STRING,
+ "model_type": mlpb.STRING,
+ "model_name": mlpb.STRING,
+ "Commit": mlpb.STRING,
+ },
custom_properties=custom_props,
- milliseconds_since_epoch=int(time.time() * 1000),
+ milliseconds_since_epoch=int(
+ time.time() * 1000),
)
# custom_properties["Commit"] = model_commit
self.execution_label_props["Commit"] = model_commit
if self.graph:
- self.driver.create_model_node(model_uri, uri, event, self.execution.id, self.parent_context, custom_props)
+ self.driver.create_model_node(
+ model_uri,
+ uri,
+ event,
+ self.execution.id,
+ self.parent_context,
+ custom_props)
if event.lower() == "input":
self.input_artifacts.append(
- {"Name": model_uri, "URI": uri, "Event": event.lower(), "Execution_Name": self.execution_name,
+ {"Name": model_uri, "URI": uri, "Event": event.lower(),
+ "Execution_Name": self.execution_name,
"Type": "Model", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id, "Pipeline_Name": self.parent_context.name})
+ "Pipeline_Id": self.parent_context.id,
+ "Pipeline_Name": self.parent_context.name})
self.driver.create_execution_links(uri, model_uri, "Model")
else:
- child_artifact = {"Name": model_uri, "URI": uri, "Event": event.lower(),
- "Execution_Name": self.execution_name,
- "Type": "Model", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id, "Pipeline_Name": self.parent_context.name}
- self.driver.create_artifact_relationships(self.input_artifacts, child_artifact,
- self.execution_label_props)
+ child_artifact = {
+ "Name": model_uri,
+ "URI": uri,
+ "Event": event.lower(),
+ "Execution_Name": self.execution_name,
+ "Type": "Model",
+ "Execution_Command": self.execution_command,
+ "Pipeline_Id": self.parent_context.id,
+ "Pipeline_Name": self.parent_context.name}
+ self.driver.create_artifact_relationships(
+ self.input_artifacts, child_artifact, self.execution_label_props)
return artifact
- def log_execution_metrics(self, metrics_name: str, custom_properties: {} = None) -> object:
+ def log_execution_metrics(self, metrics_name: str, custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact:
+ """Log the metadata associated with the execution (coarse-grained tracking).
+        It is stored as a metrics artifact. Unlike other artifacts, it does not have a backing physical file.
+
+ Example:
+ ```python
+ exec_metrics: mlpb.Artifact = cmf.log_execution_metrics(
+ metrics_name="Training_Metrics",
+            custom_properties={"auc": auc, "loss": loss}
+ )
+ ```
+
+ Args:
+ metrics_name: Name to identify the metrics.
+ custom_properties: Dictionary with metric values.
+
+ Returns:
+ Artifact object from ML Metadata library associated with the new coarse-grained metrics artifact.
+ """
custom_props = {} if custom_properties is None else custom_properties
uri = str(uuid.uuid1())
metrics_name = metrics_name + ":" + uri + ":" + str(self.execution.id)
@@ -395,41 +694,80 @@ def log_execution_metrics(self, metrics_name: str, custom_properties: {} = None)
uri=uri,
name=metrics_name,
type_name="Metrics",
- event_type=metadata_store_pb2.Event.Type.OUTPUT,
- properties={"metrics_name": metrics_name},
- artifact_type_properties={"metrics_name": metadata_store_pb2.STRING},
+ event_type=mlpb.Event.Type.OUTPUT,
+ properties={
+ "metrics_name": metrics_name},
+ artifact_type_properties={
+ "metrics_name": mlpb.STRING},
custom_properties=custom_props,
- milliseconds_since_epoch=int(time.time() * 1000),
+ milliseconds_since_epoch=int(
+ time.time() * 1000),
)
if self.graph:
# To do create execution_links
- self.driver.create_metrics_node(metrics_name, uri, "output", self.execution.id, self.parent_context,
- custom_props)
- child_artifact = {"Name": metrics_name, "URI": uri, "Event": "output",
- "Execution_Name": self.execution_name,
- "Type": "Metrics", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id, "Pipeline_Name": self.parent_context.name}
- self.driver.create_artifact_relationships(self.input_artifacts, child_artifact, self.execution_label_props)
+ self.driver.create_metrics_node(
+ metrics_name,
+ uri,
+ "output",
+ self.execution.id,
+ self.parent_context,
+ custom_props)
+ child_artifact = {
+ "Name": metrics_name,
+ "URI": uri,
+ "Event": "output",
+ "Execution_Name": self.execution_name,
+ "Type": "Metrics",
+ "Execution_Command": self.execution_command,
+ "Pipeline_Id": self.parent_context.id,
+ "Pipeline_Name": self.parent_context.name}
+ self.driver.create_artifact_relationships(
+ self.input_artifacts, child_artifact, self.execution_label_props)
return metrics
- # Log to parquet file
- def log_metric(self, metrics_name: str, custom_properties: {} = None):
- if metrics_name in self.metrics.keys():
+ def log_metric(self, metrics_name: str, custom_properties: t.Optional[t.Dict] = None) -> None:
+ """Stores the fine-grained (per step or per epoch) metrics to memory.
+
+        The metrics provided are stored in a parquet file. The `commit_metrics` call adds the parquet file to the version
+        control framework. The metrics written in the parquet file can be retrieved using the `read_metrics` call.
+
+ Example:
+ ```python
+ # Can be called at every epoch or every step in the training. This is logged to a parquet file and committed
+ # at the commit stage.
+
+ # Inside training loop
+ while True:
+ cmf.log_metric("training_metrics", {"train_loss": train_loss})
+ cmf.commit_metrics("training_metrics")
+ ```
+
+ Args:
+ metrics_name: Name to identify the metrics.
+ custom_properties: Dictionary with metrics.
+ """
+ if metrics_name in self.metrics:
key = max((self.metrics[metrics_name]).keys()) + 1
self.metrics[metrics_name][key] = custom_properties
else:
self.metrics[metrics_name] = {}
self.metrics[metrics_name][1] = custom_properties
- # Commit the metrics file associated with the metrics id to dvc and git and
- # store the artifact in mlmd
+
def commit_metrics(self, metrics_name: str):
- metrics_df = pd.DataFrame.from_dict(self.metrics[metrics_name], orient='index')
+        """Writes the in-memory metrics to a parquet file.
+
+        Commits the metrics file associated with the metrics id to dvc and git and
+        stores the artifact in mlmd.
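+
+        Example (a minimal sketch; "training_metrics" is the metrics name used with `log_metric`):
+        ```python
+        cmf.commit_metrics("training_metrics")
+        ```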
+ """
+ metrics_df = pd.DataFrame.from_dict(
+ self.metrics[metrics_name], orient='index')
metrics_df.index.names = ['SequenceNumber']
metrics_df.to_parquet(metrics_name)
metrics_commit = commit_output(metrics_name, self.execution.id)
uri = dvc_get_hash(metrics_name)
- name = metrics_name + ":" + uri + ":" + str(self.execution.id)
+ name = metrics_name + ":" + uri + ":" + \
+ str(self.execution.id) + ":" + str(uuid.uuid1())
custom_props = {"Name": metrics_name, "Commit": metrics_commit}
metrics = create_new_artifact_event_and_attribution(
store=self.store,
@@ -438,20 +776,32 @@ def commit_metrics(self, metrics_name: str):
uri=uri,
name=name,
type_name="Step_Metrics",
- event_type=metadata_store_pb2.Event.Type.OUTPUT,
+ event_type=mlpb.Event.Type.OUTPUT,
custom_properties=custom_props,
milliseconds_since_epoch=int(time.time() * 1000),
)
if self.graph:
- self.driver.create_metrics_node(name, uri, "output", self.execution.id, self.parent_context,
- custom_props)
- child_artifact = {"Name": name, "URI": uri, "Event": "output", "Execution_Name": self.execution_name,
- "Type": "Metrics", "Execution_Command": self.execution_command,
- "Pipeline_Id": self.parent_context.id}
- self.driver.create_artifact_relationships(self.input_artifacts, child_artifact, self.execution_label_props)
+ self.driver.create_metrics_node(
+ name,
+ uri,
+ "output",
+ self.execution.id,
+ self.parent_context,
+ custom_props)
+ child_artifact = {
+ "Name": name,
+ "URI": uri,
+ "Event": "output",
+ "Execution_Name": self.execution_name,
+ "Type": "Metrics",
+ "Execution_Command": self.execution_command,
+ "Pipeline_Id": self.parent_context.id}
+ self.driver.create_artifact_relationships(
+ self.input_artifacts, child_artifact, self.execution_label_props)
return metrics
- def log_validation_output(self, version: str, custom_properties: {} = None) -> object:
+
+ def log_validation_output(self, version: str, custom_properties: t.Optional[t.Dict] = None) -> object:
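+        """Logs a validation output; it is stored in mlmd as an internal-output artifact with no backing file."""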
uri = str(uuid.uuid1())
return create_new_artifact_event_and_attribution(
store=self.store,
@@ -460,87 +810,155 @@ def log_validation_output(self, version: str, custom_properties: {} = None) -> o
uri=uri,
name=uri,
type_name="Validation_output",
- event_type=metadata_store_pb2.Event.Type.INTERNAL_OUTPUT,
+ event_type=mlpb.Event.Type.INTERNAL_OUTPUT,
properties={"version": version},
- artifact_type_properties={"version": metadata_store_pb2.STRING},
+ artifact_type_properties={"version": mlpb.STRING},
custom_properties=custom_properties,
milliseconds_since_epoch=int(time.time() * 1000),
)
- def update_existing_artifact(self, artifact: mlpb.Artifact, custom_props: {}):
- for key, value in custom_props.items():
- if isinstance(value,int):
+
+ def update_existing_artifact(self, artifact: mlpb.Artifact, custom_properties: t.Dict):
+ """Updates an existing artifact and stores it back to mlmd"""
+ for key, value in custom_properties.items():
+ if isinstance(value, int):
artifact.custom_properties[key].int_value = value
else:
artifact.custom_properties[key].string_value = str(value)
put_artifact(self.store, artifact)
- def get_artifact(self, artifact_id: int) -> metadata_store_pb2.Artifact:
+
+ def get_artifact(self, artifact_id: int) -> mlpb.Artifact:
+ """Gets the artifact object from mlmd"""
return get_artifacts_by_id(self.store, [artifact_id])[0]
- def update_model_output(self, artifact: metadata_store_pb2.Artifact):
+
+ def update_model_output(self, artifact: mlpb.Artifact):
+        """Updates an artifact in mlmd."""
put_artifact(self.store, artifact)
- def create_dataslice(self, name: str) -> object:
- return Cmf.dataslice(name, self)
+ def create_dataslice(self, name: str) -> "Cmf.DataSlice":
+ """Creates a dataslice object.
+
+ Once created, users can add data instances to this data slice with [add_data][cmflib.cmf.Cmf.DataSlice.add_data]
+ method. Users are also responsible for committing data slices by calling the
+ [commit][cmflib.cmf.Cmf.DataSlice.commit] method.
+
+ Example:
+ ```python
+ dataslice = cmf.create_dataslice("slice-a")
+ ```
+
+ Args:
+ name: Name to identify the dataslice.
+
+ Returns:
+ Instance of a newly created [DataSlice][cmflib.cmf.Cmf.DataSlice].
+ """
+ return Cmf.DataSlice(name, self)
+
def read_dataslice(self, name: str) -> pd.DataFrame:
+ """Reads the dataslice"""
# To do checkout if not there
df = pd.read_parquet(name)
return df
- # To do - Once update the hash and the new version should be updated in the mlmd
- def update_dataslice(self, name: str, record: str, custom_props: {}):
+    # To do - Once updated, the hash and the new version should be updated in
+    # the mlmd.
+ def update_dataslice(self, name: str, record: str, custom_properties: t.Dict):
df = pd.read_parquet(name)
temp_dict = df.to_dict('index')
- temp_dict[record].update(custom_props)
+ temp_dict[record].update(custom_properties)
dataslice_df = pd.DataFrame.from_dict(temp_dict, orient='index')
dataslice_df.index.names = ['Path']
dataslice_df.to_parquet(name)
- class dataslice(object):
- def __init__(self, name: str, writer, props: {} = None):
+ class DataSlice:
+ """A data slice represents a named subset of data.
+
+ It can be used to track performance of an ML model on different slices of the training or testing dataset
+ splits. This can be useful from different perspectives, for instance, to mitigate model bias.
+
+ > Instances of data slices are not meant to be created manually by users. Instead, use
+ [Cmf.create_dataslice][cmflib.cmf.Cmf.create_dataslice] method.
+
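+        Example (a short sketch combining the calls documented below; the file path is hypothetical and its
+        parent folder is assumed to be already versioned):
+        ```python
+        dataslice = cmf.create_dataslice("slice-a")
+        dataslice.add_data("data/raw_data/1.xml")
+        dataslice.commit()
+        ```
+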
+ """
+ def __init__(self, name: str, writer, props: t.Optional[t.Dict] = None):
self.props = {} if props is None else props
self.name = name
self.writer = writer
- def add_data(self, path, custom_props: {} = None):
+ def add_data(self, path: str, custom_properties: t.Optional[t.Dict] = None) -> None:
+ """Add data to create the dataslice.
+
+            Currently supported only for file abstractions. Pre-condition: the parent folder containing the file
+            should already be versioned.
+
+ Example:
+ ```python
+            dataslice.add_data(f"data/raw_data/{j}.xml")
+ ```
+ Args:
+ path: Name to identify the file to be added to the dataslice.
+ custom_properties: Properties associated with this datum.
+ """
+
self.props[path] = {}
self.props[path]['hash'] = dvc_get_hash(path)
- if custom_props:
- for k, v in custom_props.items():
+ if custom_properties:
+ for k, v in custom_properties.items():
self.props[path][k] = v
- """
- Place holder for updating back to mlmd
+# """
+# Place holder for updating back to mlmd
- def update_data(self, path, custom_props:{}):
- for k ,v in custom_props.items():
- self.props[path][k] = v
- """
+# def update_data(self, path, custom_props:{}):
+# for k ,v in custom_props.items():
+# self.props[path][k] = v
+# """
+
+ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None:
+ """Commit the dataslice.
- def commit(self, custom_props: {} = None):
+            The created dataslice is versioned and added to the underlying data versioning software.
+
+ Example:
+ ```python
+ dataslice.commit()
+ ```
+
+ Args:
+ custom_properties: Properties associated with this data slice.
+ """
git_repo = git_get_repo()
dataslice_df = pd.DataFrame.from_dict(self.props, orient='index')
dataslice_df.index.names = ['Path']
dataslice_df.to_parquet(self.name)
existing_artifact = []
- dataslice_commit = commit_output(self.name, self.writer.execution.id)
+ dataslice_commit = commit_output(
+ self.name, self.writer.execution.id)
c_hash = dvc_get_hash(self.name)
remote = dvc_get_url(self.name)
if c_hash and c_hash.strip():
- existing_artifact.extend(self.writer.store.get_artifacts_by_uri(c_hash))
+ existing_artifact.extend(
+ self.writer.store.get_artifacts_by_uri(c_hash))
if existing_artifact and len(existing_artifact) != 0:
print("Adding to existing data slice")
- _ = link_execution_to_input_artifact(store=self.writer.store,
- execution_id=self.writer.execution.id,
- uri=c_hash,
- input_name=self.name + ":" + c_hash)
+ _ = link_execution_to_input_artifact(
+ store=self.writer.store,
+ execution_id=self.writer.execution.id,
+ uri=c_hash,
+ input_name=self.name + ":" + c_hash)
else:
- props = {"Commit": dataslice_commit, "git_repo": git_repo, "Remote": remote}
- custom_properties = props.update(custom_props) if custom_props else props
+ props = {
+ "Commit": dataslice_commit,
+ "git_repo": git_repo,
+ "Remote": remote}
+                # dict.update() returns None, so merge in place and then use props.
+                props.update(custom_properties or {})
+                custom_properties = props
create_new_artifact_event_and_attribution(
store=self.writer.store,
execution_id=self.writer.execution.id,
@@ -548,20 +966,20 @@ def commit(self, custom_props: {} = None):
uri=c_hash,
name=self.name + ":" + c_hash,
type_name="Dataslice",
- event_type=metadata_store_pb2.Event.Type.OUTPUT,
+ event_type=mlpb.Event.Type.OUTPUT,
custom_properties=custom_properties,
milliseconds_since_epoch=int(time.time() * 1000),
)
- return None
-
- """Temporary code"""
-
- def materialize(self, name):
- slicedir = name + "-" + "dir"
- os.mkdir(slicedir)
- df = pd.read_parquet(name)
- for index, row in df.iterrows():
- print(index)
- first, middle, last = str(index).split("/")
- print(last)
- os.symlink(str(index), slicedir + "/ " + last)
+
+
+# """Temporary code"""
+#
+# def materialize(self, name):
+# slicedir = name + "-" + "dir"
+# os.mkdir(slicedir)
+# df = pd.read_parquet(name)
+# for index, row in df.iterrows():
+# print(index)
+# first, middle, last = str(index).split("/")
+# print(last)
+# os.symlink(str(index), slicedir + "/ " + last)
diff --git a/cmflib/cmfquery.py b/cmflib/cmfquery.py
index 701c2b92..3e99815f 100644
--- a/cmflib/cmfquery.py
+++ b/cmflib/cmfquery.py
@@ -28,9 +28,21 @@ def __init__(self, filepath: str = "mlmd"):
def _transform_to_dataframe(self, node):
d = {"id": node.id}
for k, v in node.properties.items():
- d[k] = v.string_value if v.HasField('string_value') else v.int_value
+ if v.HasField('string_value'):
+ d[k] = v.string_value
+ elif v.HasField('int_value'):
+ d[k] = v.int_value
+ else:
+ d[k] = v.double_value
+
for k, v in node.custom_properties.items():
- d[k] = v.string_value if v.HasField('string_value') else v.int_value
+ if v.HasField('string_value'):
+ d[k] = v.string_value
+ elif v.HasField('int_value'):
+ d[k] = v.int_value
+ else:
+ d[k] = v.double_value
+
df = pd.DataFrame(d, index=[0, ])
return df
@@ -79,9 +91,19 @@ def get_artifact_df(self, node):
"name": node.name, "create_time_since_epoch": node.create_time_since_epoch,
"last_update_time_since_epoch": node.last_update_time_since_epoch}
for k, v in node.properties.items():
- d[k] = v.string_value if v.HasField('string_value') else v.double_value
+ if v.HasField('string_value'):
+ d[k] = v.string_value
+ elif v.HasField('int_value'):
+ d[k] = v.int_value
+ else:
+ d[k] = v.double_value
for k, v in node.custom_properties.items():
- d[k] = v.string_value if v.HasField('string_value') else v.double_value
+ if v.HasField('string_value'):
+ d[k] = v.string_value
+ elif v.HasField('int_value'):
+ d[k] = v.int_value
+ else:
+ d[k] = v.double_value
df = pd.DataFrame(d, index=[0, ])
return df
@@ -210,6 +232,24 @@ def get_all_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
df = df.drop_duplicates(subset=None, keep='first', inplace=False)
return df
+    def get_all_parent_executions(self, artifact_name: str) -> pd.DataFrame:
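+        """Returns a dataframe of executions that produced the parent artifacts of the given artifact."""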
+ df = self.get_all_parent_artifacts(artifact_name)
+ artifact_ids = df.id.values.tolist()
+
+ executions_ids = set(
+ event.execution_id
+ for event in self.store.get_events_by_artifact_ids(artifact_ids)
+ if event.type == mlpb.Event.OUTPUT)
+ executions = self.store.get_executions_by_id(executions_ids)
+
+ df = pd.DataFrame()
+ for exe in executions:
+ d1 = self._transform_to_dataframe(exe)
+ # df = df.append(d1, sort=True, ignore_index=True)
+ df = pd.concat([df, d1], sort=True, ignore_index=True)
+ return df
+
+
def find_producer_execution(self, artifact_name: str) -> object:
artifact = None
artifacts = self.store.get_artifacts()
diff --git a/cmflib/dvc_wrapper.py b/cmflib/dvc_wrapper.py
index 477eb9da..4db864ed 100644
--- a/cmflib/dvc_wrapper.py
+++ b/cmflib/dvc_wrapper.py
@@ -20,6 +20,46 @@
import dvc.exceptions
+def check_git_remote() -> bool:
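+    """Returns True if a git remote is configured (checked via `git remote show`)."""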
+ process = ""
+ commit = ""
+ git_remote_configured = False
+ try:
+ process = subprocess.Popen(['git', 'remote', 'show'],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
+ # output = process.stdout.readline()
+ output, error = process.communicate(timeout=60)
+
+ remote = output.strip()
+ if remote:
+ git_remote_configured = True
+ except Exception as err:
+ process.kill()
+ outs, errs = process.communicate()
+ return git_remote_configured
+
+
+def check_default_remote() -> bool:
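+    """Returns True if a default dvc remote is configured (checked via `dvc config core.remote`)."""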
+ process = ""
+ commit = ""
+ dvc_configured = False
+ try:
+ process = subprocess.Popen(['dvc', 'config', 'core.remote'],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
+ # output = process.stdout.readline()
+ output, error = process.communicate(timeout=60)
+
+ remote = output.strip()
+ if remote:
+ dvc_configured = True
+ except Exception as err:
+ process.kill()
+ outs, errs = process.communicate()
+ return dvc_configured
+
+
def dvc_get_url(folder: str, retry: bool = False, repo: str = "") -> str:
url = ""
try:
@@ -54,7 +94,57 @@ def dvc_get_hash(folder: str, repo: str = "") -> str:
return c_hash
+def check_git_repo() -> bool:
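+    """Checks whether the current directory is inside a git work tree (via `git rev-parse --is-inside-work-tree`)."""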
+
+ process = ""
+ commit = ""
+ is_git_repo = False
+ try:
+ process = subprocess.Popen(['git',
+ 'rev-parse',
+ '--is-inside-work-tree'],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
+ # output = process.stdout.readline()
+ output, error = process.communicate(timeout=60)
+
+ is_git_repo = output.strip()
+ except Exception as err:
+ process.kill()
+ outs, errs = process.communicate()
+ return is_git_repo
+
+
+def git_checkout_new_branch(branch_name: str):
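+    """Checks out the given git branch, creating or resetting it if needed (`git checkout -q -B <branch_name>`)."""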
+
+ process = ""
+ commit = ""
+ try:
+ process = subprocess.Popen(['git',
+ 'checkout',
+ '-q',
+ '-B',
+ branch_name],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
+ # output = process.stdout.readline()
+ output, error = process.communicate(timeout=60)
+
+ commit = output.strip()
+ print(f"*** Note: CMF will check out a new branch in git to commit the metadata files ***\n"
+ f"*** The checked out branch is {branch_name}. ***")
+ except Exception as err:
+ process.kill()
+ outs, errs = process.communicate()
+ print(f"Unexpected {err}, {type(err)}")
+ print(f"Unexpected {outs}")
+ print(f"Unexpected {errs}")
+ print(f"Checking out new branch for the execution failed, continuing in the default branch.")
+
+
def git_get_commit() -> str:
+ process = ""
+ commit = ""
try:
process = subprocess.Popen(['git', 'rev-parse', 'HEAD'],
stdout=subprocess.PIPE,
@@ -70,17 +160,27 @@ def git_get_commit() -> str:
print(f"Unexpected {errs}")
return commit
+
def commit_dvc_lock_file(file_path: str, execution_id) -> str:
commit = ""
+ process = ""
try:
process = subprocess.Popen(['git', 'add', file_path],
stdout=subprocess.PIPE,
universal_newlines=True)
# To-Do : Parse the output and report if error
_, _ = process.communicate(timeout=60)
- process = subprocess.Popen(['git', 'commit', '-m ' + 'commiting ' + str(file_path) + "-" + str(execution_id)],
- stdout=subprocess.PIPE,
- universal_newlines=True)
+ process = subprocess.Popen(
+ [
+ 'git',
+ 'commit',
+ '-m ' +
+                'committing ' +
+ str(file_path) +
+ "-" +
+ str(execution_id)],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
output, errs = process.communicate(timeout=60)
commit = output.strip()
@@ -101,6 +201,7 @@ def commit_dvc_lock_file(file_path: str, execution_id) -> str:
def commit_output(folder: str, execution_id: str) -> str:
commit = ""
+ process = ""
try:
process = subprocess.Popen(['dvc', 'add', folder],
stdout=subprocess.PIPE,
@@ -114,9 +215,17 @@ def commit_output(folder: str, execution_id: str) -> str:
universal_newlines=True)
# To-Do : Parse the output and report if error
_, _ = process.communicate(timeout=60)
- process = subprocess.Popen(['git', 'commit', '-m ' + 'commiting ' + str(folder) + "-" + str(execution_id)],
- stdout=subprocess.PIPE,
- universal_newlines=True)
+ process = subprocess.Popen(
+ [
+ 'git',
+ 'commit',
+ '-m ' +
+                'committing dvc metadata file for ' +
+ str(folder) +
+ "-" +
+ str(execution_id)],
+ stdout=subprocess.PIPE,
+ universal_newlines=True)
output, errs = process.communicate(timeout=60)
commit = output.strip()
@@ -139,6 +248,9 @@ def commit_output(folder: str, execution_id: str) -> str:
# Get the remote repo
def git_get_repo() -> str:
commit = ""
+ process = ""
+ output = ""
+ errs = ""
try:
process = subprocess.Popen(['git', 'remote', '-v'],
stdout=subprocess.PIPE,
diff --git a/cmflib/graph_wrapper.py b/cmflib/graph_wrapper.py
index 75465a54..8f796166 100644
--- a/cmflib/graph_wrapper.py
+++ b/cmflib/graph_wrapper.py
@@ -14,6 +14,7 @@
# limitations under the License.
###
from neo4j import GraphDatabase
+import typing as t
import re
from ml_metadata.proto import metadata_store_pb2 as mlpb
@@ -34,7 +35,8 @@ def create_pipeline_node(self, name: str, uri: int, props=None):
props = {}
pipeline_syntax = self._create_pipeline_syntax(name, props, uri)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, pipeline_syntax)
+ node = session.write_transaction(
+ self._run_transaction, pipeline_syntax)
self.pipeline_id = node[0]["node_id"]
def create_stage_node(self, name: str, parent_context: mlpb.Context, stage_id: int, props=None):
@@ -42,12 +44,14 @@ def create_stage_node(self, name: str, parent_context: mlpb.Context, stage_id: i
props = {}
parent_id = parent_context.id
parent_name = parent_context.name
- stage_syntax = self._create_stage_syntax(name, props, stage_id, parent_id, parent_name)
+ stage_syntax = self._create_stage_syntax(
+ name, props, stage_id, parent_id, parent_name)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, stage_syntax)
+ node = session.write_transaction(
+ self._run_transaction, stage_syntax)
self.stage_id = node[0]["node_id"]
- pc_syntax = self._create_parent_child_syntax("Pipeline", "Stage", self.pipeline_id, self.stage_id,
- "contains")
+ pc_syntax = self._create_parent_child_syntax(
+ "Pipeline", "Stage", self.pipeline_id, self.stage_id, "contains")
_ = session.write_transaction(self._run_transaction, pc_syntax)
def create_execution_node(self, name: str, parent_id: int, pipeline_context: mlpb.Context, command: str,
@@ -57,11 +61,14 @@ def create_execution_node(self, name: str, parent_id: int, pipeline_context: mlp
props = {}
pipeline_id = pipeline_context.id
pipeline_name = pipeline_context.name
- execution_syntax = self._create_execution_syntax(name, command, props, execution_id, pipeline_id, pipeline_name)
+ execution_syntax = self._create_execution_syntax(
+ name, command, props, execution_id, pipeline_id, pipeline_name)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, execution_syntax)
+ node = session.write_transaction(
+ self._run_transaction, execution_syntax)
self.execution_id = node[0]["node_id"]
- pc_syntax = self._create_parent_child_syntax("Stage", "Execution", self.stage_id, self.execution_id, "runs")
+ pc_syntax = self._create_parent_child_syntax(
+ "Stage", "Execution", self.stage_id, self.execution_id, "runs")
_ = session.write_transaction(self._run_transaction, pc_syntax)
def create_dataset_node(self, name: str, path: str, uri: str, event: str, execution_id: int,
@@ -71,13 +78,14 @@ def create_dataset_node(self, name: str, path: str, uri: str, event: str, execut
custom_properties = {}
pipeline_id = pipeline_context.id
pipeline_name = pipeline_context.name
- dataset_syntax = self._create_dataset_syntax(name, path, uri, pipeline_id, pipeline_name,
- custom_properties)
+ dataset_syntax = self._create_dataset_syntax(
+ name, path, uri, pipeline_id, pipeline_name, custom_properties)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, dataset_syntax)
+ node = session.write_transaction(
+ self._run_transaction, dataset_syntax)
node_id = node[0]["node_id"]
- pc_syntax = self._create_execution_artifacts_link_syntax("Execution", "Dataset", self.execution_id, node_id,
- event)
+ pc_syntax = self._create_execution_artifacts_link_syntax(
+ "Execution", "Dataset", self.execution_id, node_id, event)
_ = session.write_transaction(self._run_transaction, pc_syntax)
def create_model_node(self, name: str, uri: str, event: str, execution_id: str, pipeline_context: mlpb.Context,
@@ -86,12 +94,14 @@ def create_model_node(self, name: str, uri: str, event: str, execution_id: str,
custom_properties = {}
pipeline_id = pipeline_context.id
pipeline_name = pipeline_context.name
- model_syntax = self._create_model_syntax(name, uri, pipeline_id, pipeline_name, custom_properties)
+ model_syntax = self._create_model_syntax(
+ name, uri, pipeline_id, pipeline_name, custom_properties)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, model_syntax)
+ node = session.write_transaction(
+ self._run_transaction, model_syntax)
node_id = node[0]["node_id"]
- pc_syntax = self._create_execution_artifacts_link_syntax("Execution", "Model", self.execution_id, node_id,
- event)
+ pc_syntax = self._create_execution_artifacts_link_syntax(
+ "Execution", "Model", self.execution_id, node_id, event)
_ = session.write_transaction(self._run_transaction, pc_syntax)
def create_metrics_node(self, name: str, uri: str, event: str, execution_id: int, pipeline_context: mlpb.Context,
@@ -100,16 +110,21 @@ def create_metrics_node(self, name: str, uri: str, event: str, execution_id: int
custom_properties = {}
pipeline_id = pipeline_context.id
pipeline_name = pipeline_context.name
- metrics_syntax = self._create_metrics_syntax(name, uri, event, execution_id, pipeline_id, pipeline_name,
- custom_properties)
+ metrics_syntax = self._create_metrics_syntax(
+ name, uri, event, execution_id, pipeline_id, pipeline_name, custom_properties)
with self.driver.session() as session:
- node = session.write_transaction(self._run_transaction, metrics_syntax)
+ node = session.write_transaction(
+ self._run_transaction, metrics_syntax)
node_id = node[0]["node_id"]
- pc_syntax = self._create_execution_artifacts_link_syntax("Execution", "Metrics", self.execution_id, node_id,
- event)
+ pc_syntax = self._create_execution_artifacts_link_syntax(
+ "Execution", "Metrics", self.execution_id, node_id, event)
_ = session.write_transaction(self._run_transaction, pc_syntax)
- def create_artifact_relationships(self, parent_artifacts, child_artifact, relation_properties):
+ def create_artifact_relationships(
+ self,
+ parent_artifacts,
+ child_artifact,
+ relation_properties):
# f = lambda d: d['Event']
# res = {k:list(v) for k,v in groupby(sorted(artifacts, key=f), f)}
@@ -123,45 +138,58 @@ def create_artifact_relationships(self, parent_artifacts, child_artifact, relati
parent_artifact_type = k["Type"]
parent_artifact_uri = k["URI"]
parent_name = k["Name"]
- relation = re.sub('\W+', '', re.split(",", k["Execution_Name"])[-1])
- pc_syntax = self._create_parent_child_artifacts_syntax(parent_artifact_type, child_artifact_type,
- parent_artifact_uri, child_artifact_uri, parent_name,
- child_name, pipeline_id, relation,
- relation_properties)
+ relation = re.sub(
+                r'\W+', '', re.split(",", k["Execution_Name"])[-1])
+ pc_syntax = self._create_parent_child_artifacts_syntax(
+ parent_artifact_type,
+ child_artifact_type,
+ parent_artifact_uri,
+ child_artifact_uri,
+ parent_name,
+ child_name,
+ pipeline_id,
+ relation,
+ relation_properties)
with self.driver.session() as session:
_ = session.write_transaction(self._run_transaction, pc_syntax)
- def create_execution_links(self, parent_artifact_uri, parent_artifact_name, parent_artifact_type):
+ def create_execution_links(
+ self,
+ parent_artifact_uri,
+ parent_artifact_name,
+ parent_artifact_type):
parent_execution_query = "MATCH (n:{}".format(
- parent_artifact_type) + "{uri: '" + parent_artifact_uri + "'}) <-[:output]-(f:Execution) Return ID(f)as id, f.uri as uri"
+ parent_artifact_type) + "{uri: '" + parent_artifact_uri + "'}) " \
+ "<-[:output]-(f:Execution) Return ID(f)as id, f.uri as uri"
- already_linked_execution_query = "MATCH (f)-[r:linked]->(e2:Execution) WHERE r.uri = '{}' RETURN ID(f)as id, f.uri as uri".format(
- parent_artifact_uri)
+ already_linked_execution_query = "MATCH (f)-[r:linked]->(e2:Execution) " \
+ "WHERE r.uri = '{}' RETURN ID(f)as id, f.uri as uri".format(parent_artifact_uri)
with self.driver.session() as session:
- execution_parent = session.read_transaction(self._run_transaction, parent_execution_query)
+ execution_parent = session.read_transaction(
+ self._run_transaction, parent_execution_query)
executions = {}
for record in execution_parent:
p_id = record["id"]
executions[str(p_id)] = str(record["uri"])
- linked_executions = session.read_transaction(self._run_transaction, already_linked_execution_query)
+ linked_executions = session.read_transaction(
+ self._run_transaction, already_linked_execution_query)
linked = {}
for record in linked_executions:
linked_id = record["id"]
linked[str(linked_id)] = str(record["uri"])
- unlinked_executions = [i for i in executions.keys() if i not in linked.keys()]
+ unlinked_executions = [
+ i for i in executions.keys() if i not in linked.keys()]
if not unlinked_executions:
return
execution_id_to_link = unlinked_executions[0]
execution_uri = executions[execution_id_to_link]
- pc_syntax = self._create_execution_link_syntax("Execution", "Execution", execution_uri,
- execution_id_to_link,
- self.execution_id,
- "linked", {"Artifact_Name": parent_artifact_name,
- "uri": parent_artifact_uri})
+ pc_syntax = self._create_execution_link_syntax(
+ "Execution", "Execution", execution_uri, execution_id_to_link, self.execution_id, "linked", {
+ "Artifact_Name": parent_artifact_name, "uri": parent_artifact_uri})
_ = session.write_transaction(self._run_transaction, pc_syntax)
@staticmethod
@@ -173,7 +201,7 @@ def _run_transaction(tx, message):
return values
@staticmethod
- def _create_pipeline_syntax(name: str, props: {}, uri: int) -> str:
+ def _create_pipeline_syntax(name: str, props: t.Dict, uri: int) -> str:
props["Name"] = name
props["uri"] = str(uri)
props["pipeline_id"] = str(uri)
@@ -186,7 +214,8 @@ def _create_pipeline_syntax(name: str, props: {}, uri: int) -> str:
syntax_str = syntax_str + "}) RETURN ID(a) as node_id"
return syntax_str
- # Todo - Verify what is considered as unique node . is it a combination of all properties
+ # Todo - Verify what is considered as unique node . is it a combination of
+ # all properties
@staticmethod
def _create_dataset_syntax(name: str, path: str, uri: str, pipeline_id: int, pipeline_name: str,
@@ -196,11 +225,11 @@ def _create_dataset_syntax(name: str, path: str, uri: str, pipeline_id: int, pip
custom_properties["pipeline_id"] = str(pipeline_id)
custom_properties["pipeline_name"] = pipeline_name
syntax_str = "MERGE (a:Dataset {uri:\"" + uri + "\"}) SET "
- props_str = ""
+ # props_str = ""
for k, v in custom_properties.items():
k = re.sub('\W+', '', k)
- props_str = "a." + k + " = coalesce([x in a." + k + " where x <>\"" + str(v) + "\"], []) + \"" + str(
- v) + "\","
+ props_str = "a." + k + \
+ " = coalesce([x in a." + k + " where x <>\"" + str(v) + "\"], []) + \"" + str(v) + "\","
syntax_str = syntax_str + props_str
syntax_str = syntax_str.rstrip(",")
syntax_str = syntax_str + " RETURN ID(a) as node_id"
@@ -239,7 +268,7 @@ def _create_metrics_syntax(name: str, uri: str, event: str, execution_id: int, p
return syntax_str
@staticmethod
- def _create_stage_syntax(name: str, props: {}, uri: int, pipeline_id: int, pipeline_name: str) -> str:
+ def _create_stage_syntax(name: str, props: t.Dict, uri: int, pipeline_id: int, pipeline_name: str) -> str:
props["Name"] = name
props["uri"] = str(uri)
props["pipeline_id"] = str(pipeline_id)
@@ -274,7 +303,7 @@ def _create_execution_artifacts_link_syntax(parent_label: str, child_label: str,
@staticmethod
def _create_execution_link_syntax(parent_label: str, child_label: str, parent_uri: str, parent_id: str,
child_id: int,
- relation: str, relation_properties: {}):
+ relation: str, relation_properties: t.Dict):
"""
MATCH
(a:Person),
@@ -284,25 +313,22 @@ def _create_execution_link_syntax(parent_label: str, child_label: str, parent_ur
RETURN type(r)
"""
parent_child_syntax_1 = "MATCH (a:{}), (b:{}) WHERE a.uri = '{}' AND ID(a) = {} AND ID(b) = {} ".format(
- parent_label,
- child_label,
- parent_uri,
- parent_id,
- child_id)
+ parent_label, child_label, parent_uri, parent_id, child_id)
parent_child_syntax_2 = "MERGE (a)-[r:{}".format(relation)
parent_child_syntax_3 = "{"
for k, v in relation_properties.items():
parent_child_syntax_3 = parent_child_syntax_3 + k + ":" + "\"" + v + "\"" + ","
- parent_child_syntax_3 = parent_child_syntax_3.rstrip(parent_child_syntax_3[-1])
+ parent_child_syntax_3 = parent_child_syntax_3.rstrip(
+ parent_child_syntax_3[-1])
parent_child_syntax_4 = "}]->(b) RETURN type(r)"
parent_child_syntax = parent_child_syntax_1 + parent_child_syntax_2 \
- + parent_child_syntax_3 + parent_child_syntax_4
+ + parent_child_syntax_3 + parent_child_syntax_4
return parent_child_syntax
@staticmethod
def _create_parent_child_artifacts_syntax(parent_label: str, child_label: str, parent_uri: str, child_uri: str,
parent_name: str, child_name: str, pipeline_id: int, relation: str,
- relation_properties: {}):
+ relation_properties: t.Dict):
"""
MATCH
(a:Person),
@@ -316,15 +342,17 @@ def _create_parent_child_artifacts_syntax(parent_label: str, child_label: str, p
parent_child_syntax_2 = "MERGE (a)-[r:{}".format(relation)
parent_child_syntax_3 = "{"
for k, v in relation_properties.items():
- parent_child_syntax_3 = parent_child_syntax_3 + k + ":" + "\"" + str(v) + "\"" + ","
- parent_child_syntax_3 = parent_child_syntax_3.rstrip(parent_child_syntax_3[-1])
+ parent_child_syntax_3 = parent_child_syntax_3 + \
+ k + ":" + "\"" + str(v) + "\"" + ","
+ parent_child_syntax_3 = parent_child_syntax_3.rstrip(
+ parent_child_syntax_3[-1])
parent_child_syntax_4 = "}]->(b) RETURN type(r)"
- parent_child_syntax = parent_child_syntax_1 + parent_child_syntax_2 + parent_child_syntax_3 + \
- parent_child_syntax_4
+ parent_child_syntax = parent_child_syntax_1 + parent_child_syntax_2 + \
+ parent_child_syntax_3 + parent_child_syntax_4
return parent_child_syntax
@staticmethod
- def _create_execution_syntax(name: str, command: str, props: {}, uri: int, pipeline_id: int,
+ def _create_execution_syntax(name: str, command: str, props: t.Dict, uri: int, pipeline_id: int,
pipeline_name: str) -> str:
props["Name"] = name
props["Command"] = command
diff --git a/cmflib/metadata_helper.py b/cmflib/metadata_helper.py
index feffc0a7..43fbcefc 100644
--- a/cmflib/metadata_helper.py
+++ b/cmflib/metadata_helper.py
@@ -13,15 +13,16 @@
# limitations under the License.
###
-import json
+
import os
import sys
-import ml_metadata
+import typing as t
from time import sleep
from ml_metadata.proto import metadata_store_pb2
from ml_metadata.metadata_store import metadata_store
from ipaddress import ip_address, IPv4Address
from typing import List
+import functools
def value_to_mlmd_value(value) -> metadata_store_pb2.Value:
@@ -50,7 +51,8 @@ def connect_to_mlmd() -> metadata_store.MetadataStore:
try:
mlmd_store = metadata_store.MetadataStore(mlmd_connection_config)
# All get requests fail when the DB is empty, so we have to use a put request.
- # TODO: Replace with _ = mlmd_store.get_context_types() when https://github.com/google/ml-metadata/issues/28 is fixed
+ # TODO: Replace with _ = mlmd_store.get_context_types()
+ # when https://github.com/google/ml-metadata/issues/28 is fixed
_ = mlmd_store.put_execution_type(
metadata_store_pb2.ExecutionType(
name="DummyExecutionType",
@@ -85,7 +87,7 @@ def get_or_create_artifact_type(store, type_name, properties: dict = None) -> me
try:
artifact_type = store.get_artifact_type(type_name=type_name)
return artifact_type
- except:
+ except BaseException:
artifact_type = metadata_store_pb2.ArtifactType(
name=type_name,
properties=properties,
@@ -98,12 +100,13 @@ def get_or_create_execution_type(store, type_name, properties: dict = None) -> m
try:
execution_type = store.get_execution_type(type_name=type_name)
return execution_type
- except:
+ except BaseException:
execution_type = metadata_store_pb2.ExecutionType(
name=type_name,
properties=properties,
)
- execution_type.id = store.put_execution_type(execution_type) # Returns ID
+ execution_type.id = store.put_execution_type(
+ execution_type) # Returns ID
return execution_type
@@ -111,7 +114,7 @@ def get_or_create_context_type(store, type_name, properties: dict = None) -> met
try:
context_type = store.get_context_type(type_name=type_name)
return context_type
- except:
+ except BaseException:
context_type = metadata_store_pb2.ContextType(
name=type_name,
properties=properties,
@@ -190,18 +193,17 @@ def create_context_with_type(
return context
-import functools
-
-
@functools.lru_cache(maxsize=128)
def get_context_by_name(
store,
context_name: str,
) -> metadata_store_pb2.Context:
- matching_contexts = [context for context in store.get_contexts() if context.name == context_name]
+ matching_contexts = [
+ context for context in store.get_contexts() if context.name == context_name]
assert len(matching_contexts) <= 1
if len(matching_contexts) == 0:
- raise ValueError('Context with name "{}" was not found'.format(context_name))
+ raise ValueError(
+ 'Context with name "{}" was not found'.format(context_name))
return matching_contexts[0]
@@ -215,7 +217,7 @@ def get_or_create_context_with_type(
) -> metadata_store_pb2.Context:
try:
context = get_context_by_name(store, context_name)
- except:
+ except BaseException:
context = create_context_with_type(
store=store,
context_name=context_name,
@@ -231,8 +233,8 @@ def get_or_create_context_with_type(
assert len(context_types) == 1
if context_types[0].name != type_name:
raise RuntimeError(
- 'Context "{}" was found, but it has type "{}" instead of "{}"'.format(context_name, context_types[0].name,
- type_name))
+ 'Context "{}" was found, but it has type "{}" instead of "{}"'.format(
+ context_name, context_types[0].name, type_name))
return context
@@ -291,11 +293,12 @@ def create_new_execution_in_existing_context(
def get_or_create_parent_context(
store,
pipeline: str,
- custom_properties: {} = None
+ custom_properties: t.Optional[t.Dict] = None
) -> metadata_store_pb2.Context:
mlmd_custom_properties = {}
for property_name, property_value in (custom_properties or {}).items():
- mlmd_custom_properties[property_name] = value_to_mlmd_value(property_value)
+ mlmd_custom_properties[property_name] = value_to_mlmd_value(
+ property_value)
context = get_or_create_context_with_type(
store=store,
@@ -305,21 +308,21 @@ def get_or_create_parent_context(
PARENT_CONTEXT_NAME: metadata_store_pb2.STRING,
},
properties={
- PARENT_CONTEXT_NAME: metadata_store_pb2.Value(string_value=pipeline)
- },
- custom_properties=mlmd_custom_properties
- )
+ PARENT_CONTEXT_NAME: metadata_store_pb2.Value(
+ string_value=pipeline)},
+ custom_properties=mlmd_custom_properties)
return context
def get_or_create_run_context(
store,
pipeline_stage: str,
- custom_properties: {} = None,
+ custom_properties: t.Optional[t.Dict] = None,
) -> metadata_store_pb2.Context:
mlmd_custom_properties = {}
for property_name, property_value in (custom_properties or {}).items():
- mlmd_custom_properties[property_name] = value_to_mlmd_value(property_value)
+ mlmd_custom_properties[property_name] = value_to_mlmd_value(
+ property_value)
context = get_or_create_context_with_type(
store=store,
@@ -329,19 +332,20 @@ def get_or_create_run_context(
PIPELINE_STAGE: metadata_store_pb2.STRING,
},
properties={
- PIPELINE_STAGE: metadata_store_pb2.Value(string_value=pipeline_stage)
- },
- custom_properties=mlmd_custom_properties
- )
+ PIPELINE_STAGE: metadata_store_pb2.Value(
+ string_value=pipeline_stage)},
+ custom_properties=mlmd_custom_properties)
return context
def associate_child_to_parent_context(store, parent_context: metadata_store_pb2.Context,
child_context: metadata_store_pb2.Context):
try:
- associate = metadata_store_pb2.ParentContext(child_id=child_context.id, parent_id=parent_context.id)
+ associate = metadata_store_pb2.ParentContext(
+ child_id=child_context.id, parent_id=parent_context.id)
store.put_parent_contexts([associate])
except Exception as e:
+ # print(e)
# print('Warning: Exception:{}'.format(str(e)), file=sys.stderr)
sys.stderr.flush()
@@ -356,11 +360,12 @@ def create_new_execution_in_existing_run_context(
git_repo: str = None,
git_start_commit: str = None,
git_end_commit: str = "",
- custom_properties: {} = None,
+ custom_properties: t.Optional[t.Dict] = None,
) -> metadata_store_pb2.Execution:
mlmd_custom_properties = {}
for property_name, property_value in (custom_properties or {}).items():
- mlmd_custom_properties[property_name] = value_to_mlmd_value(property_value)
+ mlmd_custom_properties[property_name] = value_to_mlmd_value(
+ property_value)
return create_new_execution_in_existing_context(
store=store,
@@ -413,7 +418,8 @@ def create_new_artifact_event_and_attribution(
mlmd_custom_properties = {}
for property_name, property_value in (custom_properties or {}).items():
- mlmd_custom_properties[property_name] = value_to_mlmd_value(property_value)
+ mlmd_custom_properties[property_name] = value_to_mlmd_value(
+ property_value)
artifact = create_artifact_with_type(
store=store,
@@ -494,7 +500,7 @@ def link_execution_to_artifact(
artifact = artifacts[-1]
- #Check if event already exist
+ # Check if event already exist
events = store.get_events_by_artifact_ids([artifact.id])
for evt in events:
if evt.execution_id == execution_id:
diff --git a/docker-compose.yml b/docker-compose.yml
index 4fe37efc..5d9cee8f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,5 @@
-#Following https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html
-#And https://jupyter-docker-stacks.readthedocs.io/en/latest/using/common.html#docker-options
+# Following https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html
+# And https://jupyter-docker-stacks.readthedocs.io/en/latest/using/common.html#docker-options
version: '3.8'
services:
jupyter-cmf-notebook:
@@ -8,8 +8,26 @@ services:
context: ./
container_name: jupyter-cmf-notebook
hostname: jupyter-cmf-notebook
+
+ #workspace is the directory from your home folder that
+ #will be mounted inside the docker container with cmf pre-installed
+ #dvc_remote is the remote data store for dvc
+ #your .ssh folder is mounted inside the docker container
+ #to enable you to push and pull code from git
+ #To-Do
+ # Create these directories in your home folder
+ #1. mkdir $HOME/workspace
+ #2. mkdir $HOME/dvc_remote
+ #or
+ #Change the below lines to reflect the appropriate directories
+ #1. If your workspace is named "experiment" change the below line
+ #$HOME/workspace:/home/jovyan/workspace to
+ #$HOME/experiment:/home/jovyan/workspace
+ #2. If your remote is /extmount/data change the line
+ #$HOME/dvc_remote:/home/jovyan/dvc_remote to
+ #/extmount/data:/home/jovyan/dvc_remote
volumes:
- - $HOME/workspace:/home/jovyan/workspace
+ - $HOME/workspace:/home/jovyan/workspace
- $HOME/dvc_remote:/home/jovyan/dvc_remote
- $HOME/.ssh:/home/jovyan/.ssh
ports:
@@ -36,6 +54,7 @@ services:
- GIT_BRANCH=${GIT_BRANCH} #from .env file
- DB_URL=mysql://db:33060
- DB_PASSWORD=${MYSQL_ROOT_PASSWORD} #from .env file
+ - NEO4J=TRUE
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER_NAME=${NEO4J_USER_NAME} #from .env file
- NEO4J_PASSWD=${NEO4J_PASSWD} #from .env file
@@ -48,7 +67,7 @@ services:
container_name: neo4j
hostname: neo4j
ports:
- - 7475:7474
- - 7688:7687
+ - "7475:7474"
+ - "7688:7687"
environment:
- NEO4J_AUTH=${NEO4J_USER_NAME}/${NEO4J_PASSWD}
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..90289040
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,9 @@
+# CMF Documentation
+
+CMF uses [MkDocs](https://www.mkdocs.org/) for documentation. To build the documentation locally, install the
+dependencies and run the mkdocs HTTP server:
+
+```shell
+pip install -r docs/requirements.txt
+mkdocs serve
+```
diff --git a/docs/_src/README.md b/docs/_src/README.md
new file mode 100644
index 00000000..d7e96969
--- /dev/null
+++ b/docs/_src/README.md
@@ -0,0 +1,13 @@
+# CMF docs development resources
+
+This directory contains files that are used to create some content for the CMF documentation. This process is not
+automated yet. Files in this directory are not supposed to be referenced from documentation pages.
+
+> Changes limited to this directory should not trigger automatic redeployment of the documentation (e.g., via GitHub
+> actions).
+
+- The [diagrams.drawio](./diagrams.drawio) file is created with [PyCharm](https://www.jetbrains.com/pycharm/)'s
+  [Diagram.NET](https://app.diagrams.net/) plugin. It contains a number of diagrams used in the documentation. To
+  update those diagrams, edit them in this file, then take a screenshot, edit it with an image editor, and then
+  overwrite the corresponding files (e.g., [ML Pipeline Definition](../assets/ml_pipeline_def.png)) used on the main page.
+
diff --git a/docs/_src/diagrams.drawio b/docs/_src/diagrams.drawio
new file mode 100644
index 00000000..3a1b95cb
--- /dev/null
+++ b/docs/_src/diagrams.drawio
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/API.md b/docs/api/public/API.md
similarity index 100%
rename from docs/API.md
rename to docs/api/public/API.md
diff --git a/docs/api/public/cmf.md b/docs/api/public/cmf.md
new file mode 100644
index 00000000..cc60c906
--- /dev/null
+++ b/docs/api/public/cmf.md
@@ -0,0 +1,16 @@
+# cmflib.cmf.Cmf
+
+::: cmflib.cmf.Cmf
+ options:
+ show_root_toc_entry: false
+ merge_init_into_class: true
+ docstring_style: google
+ members:
+ - __init__
+ - create_context
+ - create_execution
+ - log_dataset
+ - log_model
+ - log_execution_metrics
+ - log_metric
+ - create_dataslice
diff --git a/docs/api/public/dataslice.md b/docs/api/public/dataslice.md
new file mode 100644
index 00000000..3e37a8ee
--- /dev/null
+++ b/docs/api/public/dataslice.md
@@ -0,0 +1,10 @@
+# cmflib.cmf.Cmf.DataSlice
+
+::: cmflib.cmf.Cmf.DataSlice
+ options:
+ show_root_toc_entry: false
+ merge_init_into_class: true
+ docstring_style: google
+ members:
+ - add_data
+ - commit
diff --git a/docs/architecture/advantages.md b/docs/architecture/advantages.md
new file mode 100644
index 00000000..75b42aa2
--- /dev/null
+++ b/docs/architecture/advantages.md
@@ -0,0 +1,6 @@
+# Advantages
+
+1. Tracks metadata for distributed pipelines, thereby enabling efficient pipeline management.
+2. Enables tracking of code, data and metadata in a single framework.
+3. Provides a git-like ease of management for metadata.
+4. Enables collaboration across teams.
diff --git a/docs/architecture/components.md b/docs/architecture/components.md
new file mode 100644
index 00000000..acc4fe2e
--- /dev/null
+++ b/docs/architecture/components.md
@@ -0,0 +1,93 @@
+# CMF Components
+The common metadata framework has the following components:
+
+- [Metadata Library](#metadata-library) exposes APIs to track the pipeline metadata. It also provides APIs to query
+  the stored metadata.
+- [Local Client](#local-client) interacts with the server to pull or push metadata from or to the remote store.
+- [Central Server](#central-server) interacts with all the remote clients and is responsible for merging the metadata
+  transferred by the remote clients and managing the consolidated metadata.
+- [Central Repositories](#central-repositories) host the code, data and metadata.
+
+
+
+
+## Metadata Library
+The APIs and abstractions provided by the library enable tracking of pipeline metadata. The library tracks the stages in
+the pipeline, the input and output artifacts at each stage, and metrics. Metrics can be tracked at both coarse and
+fine-grained intervals: stage metrics are captured at the end of a stage, while fine-grained metrics are tracked per
+step (epoch) or at regular intervals during the execution of the stage.
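+
+A minimal sketch of the two granularities using the public CMF API (the pipeline name, stage name and metric values
+below are purely illustrative):
+
+```python
+from cmflib import cmf
+
+metawriter = cmf.Cmf(filename="mlmd", pipeline_name="example_pipeline")
+_ = metawriter.create_context(pipeline_stage="train")
+_ = metawriter.create_execution(execution_type="train")
+
+# Fine-grained metrics: log at every step (epoch), commit once when the stage finishes.
+for epoch in range(5):
+    loss = 1.0 / (epoch + 1)  # placeholder for a real training loss
+    metawriter.log_metric("training_metrics", {"loss": loss})
+metawriter.commit_metrics("training_metrics")
+
+# Coarse-grained (stage) metrics: log final values once for the stage.
+metawriter.log_execution_metrics("metrics", {"avg_prec": 0.91, "roc_auc": 0.95})
+```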
+
+The metadata logged through the APIs is written to a backend relational database. The library also provides APIs to
+query the metadata stored in the relational database so that users can inspect their pipelines.
+
+In addition to explicit tracking through the APIs, the library also provides implicit tracking. Implicit tracking
+automatically records the software version used in the pipelines. Function arguments and return values can be
+automatically tracked by adding the metadata tracker class decorators to the functions.
+
+Before the metadata is written to the relational database, the metadata operations are journaled in the metadata
+journal log. This enables the framework to transfer the local metadata to the central server.
+
+All artifacts are versioned with a data versioning framework (e.g., DVC). The content hash of each artifact is
+generated and stored along with the user-provided metadata. A special artifact metadata file called a ".dvc" file is
+created for every artifact (file / folder) that is added to the data version management system. The .dvc file contains
+the content hash of the artifact.
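+
+For illustration, such a ".dvc" file is a small YAML document; with a made-up content hash it might look roughly like:
+
+```yaml
+outs:
+- md5: a304afb96060aad90176268345e10355
+  path: artifacts/data.xml
+```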
+
+For every new execution, the metadata tracker creates a new branch to track the code. The special metadata file created
+for each artifact, the ".dvc" file, is also committed to Git and its commit id is tracked as metadata.
+Artifacts are versioned through the versioning of their metadata files. Whenever an artifact changes,
+the metadata file is modified to reflect its current content hash, and the file is tracked as a new version of the
+metadata file.
+
+The metadata tracker automatically records the start commit when the library was initialized and creates a separate
+commit for each change in an artifact during the experiment. This helps to track the transformations of the artifacts
+across the different stages in the pipeline.
+
+## Local Client
+The metadata client interacts with the metadata server and communicates with it to synchronize metadata.
+
+After the experiment is completed, the user invokes the “Cmf push” command to push the collected metadata to the remote.
+This transfers the existing metadata journal to the server.
+
+The metadata from the central repository can be pulled to the local repository, either using the artifacts or using the
+project as the identifier or both.
+
+When an artifact is used as the identifier, all metadata associated with the artifacts currently present in the branch of
+the cloned Git repository is pulled from the central repository to the local repository. The pulled metadata consists not
+only of the immediate metadata associated with the artifacts, but also of the metadata of all the artifacts in their
+chain of lineage.
+
+When the project is used as the identifier, all the metadata associated with the current branch of the pipeline code that
+is checked out is pulled to the local repository.
+
+## Central Server
+The central server exposes REST APIs that can be called from the remote clients. This helps in situations where the
+connectivity between the core datacenter and the remote client is robust. The remote client calls the APIs exposed by
+the central server to log the metadata directly to the central metadata repository.
+
+Where the connectivity with the central server is intermittent, the remote clients log the metadata to the local
+repository. The journaled metadata is then pushed by the remote client to the central server. The central server
+replays the journal and merges the incoming metadata with the metadata already existing in the central repository. The
+ability to accurately identify artifacts anywhere by their content hash makes this merge robust.
+
+## Central Repositories
+The common metadata framework consists of three central repositories: one each for the code, the data and the metadata.
+
+#### Central Metadata repository
+The central metadata repository holds the metadata pushed from the distributed sites. It holds metadata about all the
+different pipelines that were tracked using the common metadata tracker. The consolidated view of the metadata stored
+in the central repository helps users learn across the various stages of pipelines executed at different
+locations. Using the query layer pointed at the central repository, users get a global view of the
+metadata, which provides a deeper understanding of the pipelines and their metadata. The metadata helps to
+understand non-obvious results such as the performance of a dataset with respect to other datasets, the performance of
+a particular pipeline with respect to other pipelines, etc.
+
+#### Central Artifact storage repository
+The central artifact storage repository stores all the artifacts related to the experiments. The data versioning
+framework (DVC) stores the artifacts in a content-addressable layout: each artifact is stored inside a folder whose name
+is the first two characters of the content hash, with the remaining characters of the content hash used as the file
+name. This layout allows efficient retrieval of the artifacts.
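+
+For example, an artifact whose content hash is `a304afb96060aad90176268345e10355` (a made-up value) would be stored
+under the artifact storage root roughly as:
+
+```
+<artifact-storage-root>/a3/04afb96060aad90176268345e10355
+```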
+
+#### Git Repository
+The Git repository is used to track the code. Along with the code, the metadata files of the artifacts, which contain
+the content hashes of the artifacts, are also stored in Git. The data versioning framework (DVC) uses these files to
+retrieve the artifacts from the artifact storage repository.
diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md
new file mode 100644
index 00000000..a51b0017
--- /dev/null
+++ b/docs/architecture/overview.md
@@ -0,0 +1,37 @@
+# Architecture Overview
+
+Interactions in data pipelines can be complex. The different stages in the pipeline (which may not be next to each
+other) may have to interact to produce or transform artifacts. As the artifacts navigate and undergo transformations
+through the pipeline, they can take complicated paths, which might also involve bidirectional movement across these
+stages.
+Also, there can be dependencies between stages, where the metrics produced by one stage influence the
+metrics at a subsequent stage. It is important to track the metadata across a pipeline to provide features like
+lineage tracking, provenance and reproducibility.
+
+Tracking metadata through these complex pipelines poses multiple challenges, some of them being:
+
+- Each stage in the pipeline could be executed in a different datacenter or an edge site having intermittent connection
+ to the core datacenter.
+- Each stage in the pipeline could be managed by a different team.
+- The artifacts (input or output) need to be uniquely identified across different sites and across multiple pipelines.
+
+Common metadata framework (CMF) addresses the problems associated with tracking of pipeline metadata from distributed
+sites and tracks code, data and metadata together for end-to-end traceability.
+
+The framework automatically tracks the code version as part of the metadata for an execution. Additionally, the data
+artifacts are also versioned automatically using a data versioning framework (like DVC), and the metadata regarding the
+data version is stored along with the code. The framework stores the Git commit id of the metadata file associated with
+the artifact and the content hash of the artifact as metadata. The framework provides APIs to track the hyperparameters
+and other metadata of pipelines. Therefore, from the metadata stored, users can zero in on the hyperparameters, code
+version and artifact version used for the experiment.
+
+Identifying the artifacts by content hash allows the framework to uniquely identify an artifact anywhere across the
+distributed sites. This enables the metadata from the distributed sites to be precisely merged into a central
+repository, thereby providing a single global view of the metadata from the distributed sites.
+
+On this backbone, we build a Git-like experience for metadata, enabling users to push their local metadata to the
+remote repository, where it is merged to create the global metadata, and to pull metadata from the global store to the
+local repository, creating a local view that contains only the metadata of interest.
+
+The framework can be used to track various types of pipelines such as data pipelines or AI pipelines.
+
diff --git a/docs/assets/Metadata_stored.png b/docs/assets/Metadata_stored.png
new file mode 100644
index 00000000..d1b60dd1
Binary files /dev/null and b/docs/assets/Metadata_stored.png differ
diff --git a/docs/assets/Python_kernel.png b/docs/assets/Python_kernel.png
new file mode 100644
index 00000000..5616b1d9
Binary files /dev/null and b/docs/assets/Python_kernel.png differ
diff --git a/docs/assets/distributed_architecture.png b/docs/assets/distributed_architecture.png
new file mode 100644
index 00000000..38cf8932
Binary files /dev/null and b/docs/assets/distributed_architecture.png differ
diff --git a/docs/assets/framework.png b/docs/assets/framework.png
new file mode 100644
index 00000000..639c5fb2
Binary files /dev/null and b/docs/assets/framework.png differ
diff --git a/docs/assets/jupyter.png b/docs/assets/jupyter.png
new file mode 100644
index 00000000..02638b18
Binary files /dev/null and b/docs/assets/jupyter.png differ
diff --git a/docs/assets/ml_pipeline_def.png b/docs/assets/ml_pipeline_def.png
new file mode 100644
index 00000000..1465784f
Binary files /dev/null and b/docs/assets/ml_pipeline_def.png differ
diff --git a/docs/assets/ml_pipeline_stage_execution.png b/docs/assets/ml_pipeline_stage_execution.png
new file mode 100644
index 00000000..c28b46be
Binary files /dev/null and b/docs/assets/ml_pipeline_stage_execution.png differ
diff --git a/docs/assets/neo4j_output.PNG b/docs/assets/neo4j_output.PNG
new file mode 100644
index 00000000..81494a97
Binary files /dev/null and b/docs/assets/neo4j_output.PNG differ
diff --git a/docs/assets/neo4j_server.png b/docs/assets/neo4j_server.png
new file mode 100644
index 00000000..e544614d
Binary files /dev/null and b/docs/assets/neo4j_server.png differ
diff --git a/docs/assets/python_kernel_broader.png b/docs/assets/python_kernel_broader.png
new file mode 100644
index 00000000..ce2328bc
Binary files /dev/null and b/docs/assets/python_kernel_broader.png differ
diff --git a/docs/assets/slack_logo.png b/docs/assets/slack_logo.png
new file mode 100644
index 00000000..fb5d71a4
Binary files /dev/null and b/docs/assets/slack_logo.png differ
diff --git a/docs/examples/getting_started.md b/docs/examples/getting_started.md
new file mode 100644
index 00000000..4682ab29
--- /dev/null
+++ b/docs/examples/getting_started.md
@@ -0,0 +1,105 @@
+# Getting Started
+
+> This example depends on the following packages: `git`. We also recommend installing
+> [anaconda](https://docs.anaconda.com/anaconda/install/linux/) to manage python virtual environments.
+> This example was tested in the following environments:
+>
+> - `Ubuntu-22.04 with python-3.8.15`
+
+This example demonstrates how CMF tracks the metadata associated with executions of various machine learning (ML)
+pipelines. ML pipelines differ from other pipelines (e.g., data Extract-Transform-Load pipelines) by the presence of
+ML steps, such as training and testing ML models. More comprehensive ML pipelines may include steps such as deploying a
+trained model and tracking its inference parameters (such as response latency, memory consumption etc.). This example,
+located [here](https://github.com/HewlettPackard/cmf/tree/master/examples/example-get-started) implements a simple
+pipeline consisting of four steps:
+
+- The [parse](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/src/parse.py) step splits
+ the [raw data](https://github.com/HewlettPackard/cmf/tree/master/examples/example-get-started/artifacts) into
+ `train` and `test` raw datasets for training and testing a machine learning model. This step registers one
+ input artifact (raw `dataset`) and two output artifacts (train and test `datasets`).
+- The [featurize](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/src/featurize.py)
+ step creates two machine learning splits - train and test splits - that will be used by an ML training algorithm to
+ train ML models. This step registers two input artifacts (raw train and test datasets) and two output artifacts (
+ train and test ML datasets).
+- The next [train](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/src/train.py) step
+ trains an ML model (random forest classifier). It registers one input artifact (train ML dataset) and one
+ output artifact (trained ML model).
+- The fourth [test](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/src/test.py) step
+ tests the ML model trained in the third `train` step. This step registers two input artifacts (ML model and test
+ dataset) and one output artifact (performance metrics).
+- The last [query](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/src/query.py) step
+ is a demonstration that shows how pipeline metadata can be retrieved from CMF. It will print metadata associated with
+ all executions of the above steps. This means that if you rerun the pipeline again, the output will include not only
+ metadata associated with the last run, but also metadata associated with all previous runs.
+
+
+## Pre-requisites
+
+We start by creating (1) a workspace directory that will contain all files for this example and (2) a python virtual
+environment. Then we will clone the CMF project that contains this example project.
+```shell
+# Create workspace directory
+mkdir cmf_getting_started_example
+cd cmf_getting_started_example
+
+# Create and activate Python virtual environment (the Python version may need to be adjusted depending on your system)
+conda create -n cmf_getting_started_example python=3.8
+conda activate cmf_getting_started_example
+
+# Clone the CMF project from GitHub and install CMF
+git clone https://github.com/HewlettPackard/cmf
+pip install ./cmf
+```
+
+## Project initialization
+We need to copy the source tree of the example into its own directory (which must be outside the CMF source tree), and
+initialize `git` and `dvc` for this project.
+
+```shell
+# Create a separate copy of the example project
+cp -r ./cmf/examples/example-get-started/ ./example-get-started
+cd ./example-get-started
+```
+
+Review the content of the
+[sample_env](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/sample_env) file which is
+located in the root directory of the example. For demonstration purposes, you can leave all fields as they are. Once
+this file is reviewed, source it and run
+[initialize.sh](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/initialize.sh)
+to initialize `git` and `dvc` repositories.
+```shell
+# Export environmental variables
+source ./sample_env
+# Initialize the example project
+sh ./initialize.sh
+```
+
+## Project execution
+The `initialize.sh` script executed above has printed some details about the project. To execute the example
+pipeline, run the
+[test_script.sh](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/test_script.sh)
+file (before running it, study its contents). That script runs a sequence of steps
+common to a typical machine learning project: getting raw data, converting it into machine learning train/test splits,
+and training and testing a model. The execution of these steps (and the parent pipeline) will be recorded by CMF.
+```shell
+# Run the example pipeline
+sh ./test_script.sh
+```
+
+This script will run the pipeline and store its metadata in an SQLite file named mlmd. Verify that all stages completed
+using the `git log` command. You should see commits corresponding to the artifacts that were created.
+
+Under normal conditions, the next steps would be to (1) execute the `dvc push` command to push the artifacts to the dvc
+remote and (2) execute the `git push origin` command to track the metadata of the generated artifacts.
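+
+A minimal sketch of these follow-up commands (assuming a DVC remote and a git `origin` remote are already configured):
+
+```shell
+# Push the versioned artifacts to the configured DVC remote.
+dvc push
+
+# Push the code and the artifact metadata (.dvc files) tracked by git.
+git push origin
+```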
+
+
+## Query
+The stored metadata can be explored using the query layer. Example Jupyter notebook
+[Query_Tester-base_mlmd.ipynb](https://github.com/HewlettPackard/cmf/blob/master/examples/example-get-started/Query_Tester-base_mlmd.ipynb)
+can be found in this directory.
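+
+Alternatively, here is a minimal sketch of querying the stored metadata programmatically (the method names below are
+the ones used elsewhere in this repository; adjust the mlmd path if needed):
+
+```python
+from cmflib import cmfquery
+
+query = cmfquery.CmfQuery("mlmd")  # path to the ML Metadata file produced above
+for pipeline in query.get_pipeline_names():
+    for stage in query.get_pipeline_stages(pipeline):
+        executions = query.get_all_executions_in_stage(stage)  # pandas DataFrame of executions
+        print(pipeline, stage, len(executions))
+```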
+
+## Clean Up
+Metadata is stored in an SQLite file named "mlmd". To clean up, delete the "mlmd" file.
+
+## Steps to test dataslice
+Run the following command: `python test-data-slice.py`.
diff --git a/docs/extra.css b/docs/extra.css
new file mode 100644
index 00000000..dc646e02
--- /dev/null
+++ b/docs/extra.css
@@ -0,0 +1,3 @@
+.md-typeset__table {
+ min-width: 100%;
+}
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..3dac5a26
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,339 @@
+# CMF in a nutshell
+
+CMF (Common Metadata Framework) collects and stores information associated with Machine Learning (ML) pipelines. It
+also implements APIs to query this metadata. The CMF adopts a data-first approach: all artifacts (such as datasets, ML
+models and performance metrics) recorded by the framework are versioned and identified by their content hash.
+
+## Installation
+CMF requires Python >= 3.7 and <= 3.9. Create a Python virtual environment:
+
+=== "Conda"
+ ```shell
+ conda create -n cmf python=3.8
+ conda activate cmf
+ ```
+
+=== "VirtualEnv"
+ ```shell
+ virtualenv --python=3.8 .cmf
+ source .cmf/bin/activate
+ ```
+
+Install CMF
+
+=== "Latest version form GitHub"
+ ```shell
+ pip install https://github.com/HewlettPackard/cmf
+ ```
+
+=== "Stable version form PyPI"
+ ```shell
+ # Work in progress: not available yet.
+ # pip install cmflib
+ ```
+### [Jupyter Lab docker container with CMF pre-installed](#docker-section)
+## Introduction
+Complex ML projects rely on `ML pipelines` to train and test ML models. An ML pipeline is a sequence of stages where
+each stage performs a particular task, such as data loading and pre-processing, ML model training and testing. Stages
+
+- consume `inputs` and produce `outputs`.
+- are parametrized by parameters that guide the process of producing outputs.
+
+
+
+CMF uses abstractions defined in the [ML Metadata](https://www.tensorflow.org/tfx/guide/mlmd) (MLMD) library to represent
+CMF concepts. Each pipeline has a name. Users provide it when they initialize the CMF. Each stage is characterized by
+metadata represented as MLMD's `Context` object. When users actually run a stage, this is recorded by MLMD's
+`Execution` object. Inputs and outputs of stages are represented as MLMD's `Artifact` objects, while parameters of stages
+are recorded as properties of executions.
+
+
+
+
+
+
+=== "1 Init"
+ Start tracking the pipeline metadata by initializing the CMF runtime. The metadata will be associated with the
+ pipeline named `test_pipeline`.
+ ```python
+ from cmflib.cmf import Cmf
+ from ml_metadata.proto import metadata_store_pb2 as mlpb
+
+ cmf = Cmf(
+ filename="mlmd",
+ pipeline_name="test_pipeline",
+ )
+ ```
+
+=== "2 Stage type"
+ Before we can start tracking metadata, we need to let CMF know about stage type. This is not yet associated with
+ this particular execution.
+ ```python
+ context: mlmd.proto.Context = cmf.create_context(
+ pipeline_stage="train"
+ )
+ ```
+
+=== "3 New execution"
+ Now we can create a new stage execution associated with the `train` stage. The CMF always creates a new execution,
+ and will adjust its name, so it's unique. This is also the place where we log stage `parameters`.
+ ```python
+ execution: mlmd.proto.Execution = cmf.create_execution(
+ execution_type="train",
+ custom_properties = {"num_epochs": 100, "learning_rate": 0.01}
+ )
+ ```
+
+=== "4 Log Artifacts"
+ Finally, we can log an input (train dataset), and once trained, an output (ML model) artifacts.
+ ```python
+ cmf.log_dataset(
+ 'artifacts/test_dataset.csv', # Dataset path
+ "input" # This is INPUT artifact
+ )
+ cmf.log_model(
+ "artifacts/model.pkl", # Model path
+ event="output" # This is OUTPUT artifact
+ )
+ ```
+
+
+ |
+
+
+ |
+
+
+
+
+## Quick Example
+Simple "getting started" example is described [here](examples/getting_started.md).
+
+## API Overview
+
+**Import CMF**.
+```python
+from cmflib import cmf
+```
+
+**Create the metadata writer**. The [metadata writer][cmflibcmfcmf] is responsible for managing a CMF backend to record
+the pipeline metadata. Internally, it creates a pipeline abstraction that groups individual stages and their executions.
+All stages, their executions and produced artifacts will be associated with a pipeline with the given name.
+```python
+cmf = cmf.Cmf(
+ filename="mlmd", # Path to ML Metadata file.
+ pipeline_name="mnist" # Name of a ML pipeline.
+)
+```
+
+**Define a stage**. An ML pipeline can have multiple stages, and each stage can be associated with multiple executions.
+A stage is like a class in the world of object-oriented programming languages. A context (stage description) defines
+what this stage looks like (name and optional properties), and is created with the
+[create_context][cmflib.cmf.Cmf.create_context] method.
+```python
+context = cmf.create_context(
+ pipeline_stage="download", # Stage name
+ custom_properties={ # Optional properties
+ "uses_network": True, # Downloads from the Internet
+ "disk_space": "10GB" # Needs this much space
+ }
+)
+```
+
+**Create a stage execution**. A stage in ML pipeline can have multiple executions. Every run is marked as an execution.
+This API helps to track the metadata associated with the execution, like stage parameters (e.g., number of epochs and
+learning rate for train stages). The stage execution name does not need to be the same as the name of its context.
+Moreover, the CMF will adjust this name to ensure every execution has a unique name. The CMF will internally associate
+this execution with the context created previously. Stage executions are created by calling the
+[create_execution][cmflib.cmf.Cmf.create_execution] method.
+```python
+execution = cmf.create_execution(
+ execution_type="download", # Execution name.
+ custom_properties = { # Execution parameters
+ "url": "https://a.com/mnist.gz" # Data URL.
+ }
+)
+```
+
+**Log artifacts**. A stage execution can consume (inputs) and produce (outputs) multiple artifacts (datasets, models and
+performance metrics). The path of these artifacts must be relative to the project (repository) root path. Artifacts
+may have optional metadata associated with them. This metadata could include feature statistics for ML datasets, or
+useful parameters for ML models (for instance, the number of trees in a random forest classifier).
+
+- **Datasets** are logged with the [log_dataset][cmflib.cmf.Cmf.log_dataset] method.
+ ```python
+ cmf.log_dataset('data/mnist.gz', "input", custom_properties={"name": "mnist", "type": 'raw'})
+ cmf.log_dataset('data/train.csv', "output", custom_properties={"name": "mnist", "type": "train_split"})
+ cmf.log_dataset('data/test.csv', "output", custom_properties={"name": "mnist", "type": "test_split"})
+ ```
+
+- **ML models** produced by training stages are logged using [log_model][cmflib.cmf.Cmf.log_model] API. ML models can be
+ both input and output artifacts. The metadata associated with the artifact could be logged as an optional argument.
+ ```python
+ # In train stage
+ cmf.log_model(
+ path="model/rf.pkl", event="output", model_framework="scikit-learn", model_type="RandomForestClassifier",
+ model_name="RandomForestClassifier:default"
+ )
+
+ # In test stage
+ cmf.log_model(
+ path="model/rf.pkl", event="input"
+ )
+ ```
+
+- **Metrics** of every optimization step (one epoch of Stochastic Gradient Descent, or one boosting round in
+ Gradient Boosting Trees) are logged using [log_metric][cmflib.cmf.Cmf.log_metric] API.
+ ```python
+ #Can be called at every epoch or every step in the training. This is logged to a parquet file and committed at the
+ # commit stage.
+
+ #Inside training loop
+ while True:
+ cmf.log_metric("training_metrics", {"loss": loss})
+ cmf.commit_metrics("training_metrics")
+ ```
+
+- **Stage metrics**, or final metrics, are logged with the [log_execution_metrics][cmflib.cmf.Cmf.log_execution_metrics]
+ method. These are final metrics of a stage, such as final train or test accuracy.
+ ```python
+ cmf.log_execution_metrics("metrics", {"avg_prec": avg_prec, "roc_auc": roc_auc})
+ ```
+
+**Dataslices** are intended to be used to track subsets of the data. For instance, this can be used to track and compare
+accuracies of ML models on these subsets to identify model bias. [Data slices][cmflibcmfcmfdataslice] are created with
+the [create_dataslice][cmflib.cmf.Cmf.create_dataslice] method.
+```python
+dataslice = cmf.create_dataslice("slice-a")
+for i in range(1, 20, 1):
+ j = random.randrange(100)
+ dataslice.add_data("data/raw_data/"+str(j)+".xml")
+dataslice.commit()
+```
+
+## Graph Layer Overview
+The CMF library has an optional `graph layer` which stores the relationships in a Neo4j graph database. To use the graph
+layer, the `graph` parameter in the library init call must be set to true (it is set to false by default). The
+library reads the configuration parameters of the graph database from the following environment variables: `NEO4J_URI`,
+`NEO4J_USER_NAME` and `NEO4J_PASSWD`. They need to be made available in the user environment, e.g.:
+
+```shell
+export NEO4J_URI="bolt://10.93.244.219:7687"
+export NEO4J_USER_NAME=neo4j
+export NEO4J_PASSWD=neo4j
+```
+
+To use the graph layer, instantiate the CMF with `graph=True` parameter:
+```python
+from cmflib import cmf
+
+cmf = cmf.Cmf(
+ filename="mlmd",
+ pipeline_name="anomaly_detection_pipeline",
+ graph=True
+)
+```
+
+## Use a Jupyterlab Docker environment with CMF pre-installed
+CMF has a docker-compose file which creates two docker containers:
+- JupyterLab Notebook Environment with CMF pre-installed.
+ - Accessible at http://[HOST.IP.AD.DR]:8888 (default token: `docker`)
+ - Within the Jupyterlab environment, a startup script switches context to `$USER:$GROUP` as specified in `.env`
+ - `example-get-started` from this repo is bind mounted into `/home/jovyan/example-get-started`
+- Neo4j Docker container to store and access lineages.
+
+#### Step 1.
+Create a `.env` file in the current folder using `env-example` as a template. Modify the `.env` file to set the
+following variables (they are used by `docker-compose.yml`): `USER`, `UID`, `GROUP`, `GID`, `GIT_USER_NAME`,
+`GIT_USER_EMAIL`, `GIT_REMOTE_URL`.
+#### Step 2.
+**Update `docker-compose.yml` as needed.**
+ Your .ssh folder is mounted inside the docker container to enable you to push and pull code from git.
+ **To-Do**
+ Create these directories in your home folder
+```
+mkdir $HOME/workspace
+mkdir $HOME/dvc_remote
+```
+workspace - workspace will be mounted inside the cmf pre-installed docker container (can be your code directory)
+dvc_remote - remote data store for dvc
+
+***or***
+Change the below lines in docker-compose to reflect the appropriate directories
+```
+If your workspace is named "experiment", change the line
+$HOME/workspace:/home/jovyan/workspace to
+$HOME/experiment:/home/jovyan/workspace
+```
+```
+If your remote is /extmount/data change the line
+$HOME/dvc_remote:/home/jovyan/dvc_remote to
+/extmount/data:/home/jovyan/dvc_remote
+```
+***Start the docker***
+```
+docker-compose up --build -d
+```
+***Access the jupyter notebook***
+http://[HOST.IP.AD.DR]:8888 (default token: `docker`)
+
+Click the terminal icon
+
+***Quick Start***
+```
+cd example-get-started
+sh initialize.sh
+sh test_script.sh
+dvc push
+```
+The above steps will run a pre-coded example pipeline, and the metadata is stored in a file named "mlmd".
+The artifacts created will be pushed to the configured dvc remote (default: /home/dvc_remote).
+The stored metadata is displayed as
+![image](assets/Metadata_stored.png)
+
+Metadata lineage can be accessed in Neo4j.
+Open http://host:7475/browser/
+Connect to the server with the default password neo4j123 (to change this, modify the .env file).
+
+Run the query
+```
+MATCH (a:Execution)-[r]-(b) WHERE (b:Dataset or b:Model or b:Metrics) RETURN a,r, b
+```
+Expected output
+
+
+***Jupyter Lab Notebook***
+Select the kernel as Python[conda env:python37]
+
+
+***Shutdown/remove (Remove volumes as well)***
+```
+docker-compose down -v
+```
+
+## License
+CMF is an open source project hosted on [GitHub](https://github.com/HewlettPackard/cmf) and distributed according to
+the Apache 2.0 [license](https://github.com/HewlettPackard/cmf/blob/master/LICENSE). We welcome user contributions:
+send us a message on the Slack [channel](https://commonmetadata.slack.com/) or open a GitHub
+[issue](https://github.com/HewlettPackard/cmf/issues) or a [pull request](https://github.com/HewlettPackard/cmf/pulls).
+
+## Citation
+```bibtex
+@misc{foltin2022cmf,
+ title={Self-Learning Data Foundation for Scientific AI},
+  author={Martin Foltin and Annmary Justine and Sergey Serebryakov and Cong Xu and Aalap Tripathy and
+    Suparna Bhattacharya and Paolo Faraboschi},
+ year={2022},
+ note = {Presented at the "Monterey Data Conference"},
+ URL={https://drive.google.com/file/d/1Oqs0AN0RsAjt_y9ZjzYOmBxI8H0yqSpB/view},
+}
+```
+
+## Community
+[](https://commonmetadata.slack.com/)
+
+!!! help
+
+    Common Metadata Framework and its documentation are at an early, active stage of development. If anything is
+    unclear or missing, or if you spot a typo, please open an issue or a pull request
+    on [GitHub](https://github.com/HewlettPackard/cmf).
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..b2ed12cc
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,4 @@
+mkdocs>=1.0.4
+mkdocs-material>=4.4.0
+pymdown-extensions>=7.1
+mkdocstrings[python]
\ No newline at end of file
diff --git a/env-example b/env-example
index 0f8ae362..de5cd5c3 100644
--- a/env-example
+++ b/env-example
@@ -1,16 +1,14 @@
-USER=tripataa
-UID=121956566
+USER=first
+UID=123456789
GROUP=users
GID=100
-AWS_ACCESS_KEY_ID=blabla
-AWS_SECRET_ACCESS_KEY=blabla
-DVC_REMOTE_URL=/tmp/myremote
-GIT_USER_NAME=aalap-tripathy
-GIT_USER_EMAIL=aalap.tripathy@hpe.com
-GIT_REMOTE_URL=git@github.hpe.com:aalap-tripathy/cmd-test.git
-GIT_BRANCH=tripataa_experiments
-MYSQL_ROOT_PASSWORD=password
-NEO4J_USER_NAME=neo4j
-NEO4J_PASSWD=neo4j123
+DVC_REMOTE_URL=/home/jovyan/dvc_remote
+GIT_USER_NAME=first.second
+GIT_USER_EMAIL=first.second@hpe.com
+GIT_REMOTE_URL=git@github.hpe.com:first-second/cmf-demo.git
+GIT_BRANCH=first_checkout
+NEO4J_USER_NAME=USER
+NEO4J=TRUE
+NEO4J_PASSWD=XXXXX
HTTP_PROXY=http://proxy.houston.hpecorp.net:8080
-HTTPS_PROXY=http://proxy.houston.hpecorp.net:8080
\ No newline at end of file
+HTTPS_PROXY=http://proxy.houston.hpecorp.net:8080
diff --git a/examples/al_object_detection/dvc_cmf_ingest.py b/examples/active_learning/dvc_cmf_ingest.py
similarity index 97%
rename from examples/al_object_detection/dvc_cmf_ingest.py
rename to examples/active_learning/dvc_cmf_ingest.py
index 215cd282..aa3baf8c 100644
--- a/examples/al_object_detection/dvc_cmf_ingest.py
+++ b/examples/active_learning/dvc_cmf_ingest.py
@@ -46,14 +46,14 @@ def get_cmf_hierarchy(execution_lineage:str):
execution_exist : True if it exeist, False otherwise
metawrite: cmf object
"""
-def ingest_metadata(execution_lineage:str, metadata:dict, execution_exist:bool, metawriter:cmf.Cmf, command:str = "") :
+def ingest_metadata(execution_lineage:str, metadata:dict, execution_exist:bool, metawriter:cmf.Cmf) :
pipeline_name, context_name, execution = get_cmf_hierarchy(execution_lineage)
_ = metawriter.create_context(pipeline_stage=context_name)
if execution_exist:
_ = metawriter.update_execution(int(execution))
else :
- _ = metawriter.create_execution(execution, {}, command)
+ _ = metawriter.create_execution(execution)
for k, v in metadata.items():
if k == "deps":
@@ -123,7 +123,7 @@ def ingest_metadata(execution_lineage:str, metadata:dict, execution_exist:bool,
pipeline_name = "Pipeline"+"-"+str(uuid.uuid4()) if not pipeline_name else pipeline_name
-metawriter = cmf.Cmf(filename="mlmd", pipeline_name=pipeline_name, graph=False)
+metawriter = cmf.Cmf(filename="mlmd", pipeline_name=pipeline_name, graph=True)
"""
Parse the dvc.lock dictionary and get the command section
@@ -146,6 +146,8 @@ def ingest_metadata(execution_lineage:str, metadata:dict, execution_exist:bool,
"""
vvv.pop(0)
cmd = cmd_exe.get(str(vvv), None)
+ print(cmd_exe)
+ print(str(vvv))
if cmd is not None:
"""
cmd(lineage) - eg - '1,eval,active_learning '
@@ -160,6 +162,6 @@ def ingest_metadata(execution_lineage:str, metadata:dict, execution_exist:bool,
context_name = k
execution_name = vvv[0]
lineage = execution_name+","+context_name+","+ pipeline_name
- ingest_metadata(lineage, vv, False, metawriter, str(vvv))
+ ingest_metadata(lineage, vv, False, metawriter)
metawriter.log_dvc_lock("dvc.lock")
diff --git a/examples/active_learning/sample_env b/examples/active_learning/sample_env
index a58e9d43..7892bbdb 100644
--- a/examples/active_learning/sample_env
+++ b/examples/active_learning/sample_env
@@ -1,7 +1,7 @@
export DVC_REMOTE_URL=/tmp/myremote
-export GIT_USER_NAME=annmary-roy@hpe.com
-export GIT_USER_EMAIL=annmary.roy@hpe.com
-export GIT_REMOTE_URL=git@github.hpe.com:annmary-roy/crop-dataset.git
-export NEO4J_USER_NAME=neo4j
-export NEO4J_PASSWD=test
-export NEO4J_URI="bolt://ai-datafoundation.labs.hpecorp.net:7687"
+export GIT_USER_NAME=first-second@hpe.com
+export GIT_USER_EMAIL=first.second@hpe.com
+export GIT_REMOTE_URL=git@github.hpe.com:first-second/crop-dataset.git
+export NEO4J_USER_NAME=user
+export NEO4J_PASSWD=XXXX
+export NEO4J_URI="bolt://xxxx:xxx"
diff --git a/examples/al_object_detection/.gitignore b/examples/al_object_detection/.gitignore
index a6d44692..9d4bce4f 100644
--- a/examples/al_object_detection/.gitignore
+++ b/examples/al_object_detection/.gitignore
@@ -1,3 +1,4 @@
**/__pyccache__
/work_dirs
*.pyc
+/al_cycle_next.json
diff --git a/examples/al_object_detection/al_cycle.json b/examples/al_object_detection/al_cycle.json
new file mode 100644
index 00000000..50184808
--- /dev/null
+++ b/examples/al_object_detection/al_cycle.json
@@ -0,0 +1,5 @@
+{
+ "al_cycle": 1,
+ "next_cycle": 2,
+ "al_seed": 123
+}
diff --git a/examples/al_object_detection/cmf_dvc_ingest.py b/examples/al_object_detection/cmf_dvc_ingest.py
new file mode 100644
index 00000000..459c84ee
--- /dev/null
+++ b/examples/al_object_detection/cmf_dvc_ingest.py
@@ -0,0 +1,187 @@
+###
+# Copyright (2024) Hewlett Packard Enterprise Development LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###
+
+import argparse
+import yaml
+import pandas as pd
+import typing as t
+import uuid
+from ml_metadata.metadata_store import metadata_store
+from ml_metadata.proto import metadata_store_pb2 as mlpb
+from cmflib import cmfquery
+from cmflib import cmf
+
+
+"""
+Ingest the metadata in dvc.lock file into CMF.
+If a cmf mlmd file exists with metadata from metrics logging or logging for other
+metadata not captured in the dvc.lock file, pass that mlmd file as the input file.
+This code queries the file for existing pipelines, stages and executions and stores
+them as a dictionary. If the execution cmd in the stored dict matches the execution command
+in the dvc.lock file, that execution is updated with additional metadata.
+If there is no prior execution captured, a new execution is created.
+"""
+uuid_ = str(uuid.uuid4())
+
+pipeline_name = ""
+
+parser = argparse.ArgumentParser()
+"""
+File name of the cmf metadata file. Pass the file name if the pipeline has been recorded
+with CMF explicit log statements, to record metadata not part of the dvc.lock file
+"""
+parser.add_argument('--cmf_filename', type=str, default="mlmd", help="cmf filename")
+args = parser.parse_args()
+
+"""
+Parses the string and returns, pipeline name, context name and execution name
+"""
+def get_cmf_hierarchy(execution_lineage:str):
+ cmf_levels = execution_lineage.split(',')
+ return cmf_levels[-1], cmf_levels[1], cmf_levels[0]
+
+"""
+Ingest the metadata into cmf
+args
+ execution_lineage: format- execution id(if present)/execution file name/context/pipeline
+ eg - demo_train.py,active_learning_training@2,active_learning -- without existing execution
+ - 3,eval,active_learning -- with existing execution in cmf metadata file
+ metadata: The parsed dictionary from dvc.lock file
+    execution_exist : True if it exists, False otherwise
+    metawriter: cmf object
+"""
+def ingest_metadata(execution_lineage:str, metadata:dict, metawriter:cmf.Cmf, command:str = "") :
+ pipeline_name, context_name, execution = get_cmf_hierarchy(execution_lineage)
+
+ _ = metawriter.create_execution(
+ str(context_name) + '_' + str(execution),
+ {},
+ cmd = str(command),
+ create_new_execution=False
+ )
+
+ for k, v in metadata.items():
+ if k == "deps":
+ for dep in v:
+ metawriter.log_dataset_with_version(dep["path"], dep["md5"], "input")
+ if k == "outs":
+ for out in v:
+ metawriter.log_dataset_with_version(out["path"], out["md5"], "output")
+
+def find_location(string, elements):
+ for index, element in enumerate(elements):
+ if string == element:
+ return index
+ return None
+
+#Query mlmd to get all the executions and its commands
+cmd_exe = {}
+cmf_query = cmfquery.CmfQuery(args.cmf_filename)
+pipelines: t.List[str] = cmf_query.get_pipeline_names()
+for pipeline in pipelines:
+ pipeline_name = pipeline
+ stages: t.List[str] = cmf_query.get_pipeline_stages(pipeline)
+ for stage in stages:
+ exe_df: pd.DataFrame = cmf_query.get_all_executions_in_stage(stage)
+ """
+ Parse all the executions in a stage
+ eg- exe_step = ['demo_eval.py', '--trained_model', 'data/model-1', '--enable_df', 'True', '--round', '1']
+ """
+ for index, row in exe_df.iterrows():
+ exe_step = row['Execution']
+ '''
+ if already same execution command has been captured previously use the latest
+ execution id to associate the new metadata
+ '''
+ if None is cmd_exe.get(exe_step, None):
+ cmd_exe[exe_step] = str(row['id']) + "," + stage + "," + pipeline
+ else:
+ if row['id'] > int(cmd_exe.get(exe_step, None).split(',')[0]):
+ cmd_exe[exe_step] = str(row['id']) + "," + stage + "," + pipeline
+
+"""
+Parse the dvc.lock file.
+"""
+pipeline_dict = {}
+with open("dvc.lock", 'r') as f:
+ valuesYaml = yaml.load(f, Loader=yaml.FullLoader)
+
+for k in valuesYaml['stages']:
+ pipeline_dict[k] = {}
+ commands=[]
+ deps = []
+ outs = []
+ k_dict = {}
+ i = 0
+
+ for kk in valuesYaml['stages'][k]:
+ if kk == 'cmd':
+ cmd_list = valuesYaml['stages'][k][kk].split()
+ commands.append(cmd_list)
+ k_dict['cmd'] = cmd_list
+ i = i + 1
+ if kk == 'deps':
+ deps = valuesYaml['stages'][k][kk]
+ k_dict['deps'] = deps
+ if kk == 'outs':
+ outs = valuesYaml['stages'][k][kk]
+ k_dict['outs'] = outs
+
+ pipeline_dict[k][str(i)] = k_dict
+
+"""
+Create a unique Pipeline name if there is no mlmd file
+"""
+
+
+pipeline_name = "Pipeline"+"-"+str(uuid_) if not pipeline_name else pipeline_name
+metawriter = cmf.Cmf(filename="mlmd", pipeline_name=pipeline_name, graph=True)
+
+"""
+Parse the dvc.lock dictionary and get the command section
+"""
+for k, v in pipeline_dict.items():
+ for kk, vv in v.items():
+ for kkk, vvv in vv.items():
+ if kkk == 'cmd':
+ """
+ Key eg - cmd
+ Value eg - ['python3', 'demo.py', '--enable_df', 'True']
+ cmd_exe eg - {"['demo_eval.py', '--trained_model', 'data/model-3', '--enable_df', 'True', '--round', '3']":
+ '3,eval,active_learning',
+ "['demo_eval.py', '--trained_model', 'data/model-2', '--enable_df', 'True', '--round', '2']": '2,eval,active_learning',
+ "['demo_eval.py', '--trained_model', 'data/model-1', '--enable_df', 'True', '--round', '1']": '1,eval,active_learning'}
+ In the next line pop out the python
+ if the pipeline_dict command is already there in the cmd_exe dict got from parsing the mlmd pop that cmd out
+ and use the stored lineage from the mlmd
+ """
+ vvv.pop(0)
+ pos = find_location('--execution_name', vvv)
+ if pos:
+ execution_name = vvv[pos+1]
+ else:
+ execution_name = uuid_
+
+ context_name = k
+ lineage = execution_name+","+context_name+","+ pipeline_name
+
+ cmd = cmd_exe.get(str(' '.join(vvv)), None)
+ _ = metawriter.create_context(pipeline_stage=context_name)
+
+ ingest_metadata(lineage, vv, metawriter, str(' '.join(vvv)))
+
+
+metawriter.log_dvc_lock("dvc.lock")
diff --git a/examples/al_object_detection/cmf_logger_mmcv/__init__.py b/examples/al_object_detection/cmf_logger_mmcv/__init__.py
new file mode 100644
index 00000000..53c1122e
--- /dev/null
+++ b/examples/al_object_detection/cmf_logger_mmcv/__init__.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_module import BaseModule, ModuleDict, ModuleList, Sequential
+from .base_runner import BaseRunner
+from .builder import RUNNERS, build_runner
+from .checkpoint import (CheckpointLoader, _load_checkpoint,
+ _load_checkpoint_with_prefix, load_checkpoint,
+ load_state_dict, save_checkpoint, weights_to_cpu)
+from .default_constructor import DefaultRunnerConstructor
+from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info,
+ init_dist, master_only)
+from .epoch_based_runner import EpochBasedRunner, Runner
+from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model
+from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook,
+ DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook,
+ Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
+ GradientCumulativeOptimizerHook, Hook, IterTimerHook,
+ LoggerHook, MlflowLoggerHook, NeptuneLoggerHook,
+ OptimizerHook, PaviLoggerHook, SegmindLoggerHook,
+ SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook,
+ WandbLoggerHook, CmfLoggerHook)
+from .hooks.lr_updater import StepLrUpdaterHook # noqa
+from .hooks.lr_updater import (CosineAnnealingLrUpdaterHook,
+ CosineRestartLrUpdaterHook, CyclicLrUpdaterHook,
+ ExpLrUpdaterHook, FixedLrUpdaterHook,
+ FlatCosineAnnealingLrUpdaterHook,
+ InvLrUpdaterHook, LinearAnnealingLrUpdaterHook,
+ LrUpdaterHook, OneCycleLrUpdaterHook,
+ PolyLrUpdaterHook)
+from .hooks.momentum_updater import (CosineAnnealingMomentumUpdaterHook,
+ CyclicMomentumUpdaterHook,
+ LinearAnnealingMomentumUpdaterHook,
+ MomentumUpdaterHook,
+ OneCycleMomentumUpdaterHook,
+ StepMomentumUpdaterHook)
+from .iter_based_runner import IterBasedRunner, IterLoader
+from .log_buffer import LogBuffer
+from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS,
+ DefaultOptimizerConstructor, build_optimizer,
+ build_optimizer_constructor)
+from .priority import Priority, get_priority
+from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed
+
+# initialize ipu to register the ipu runner in RUNNERS
+from mmcv.device import ipu # isort:skip # noqa
+
+__all__ = [
+ 'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer',
+ 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
+ 'FixedLrUpdaterHook', 'StepLrUpdaterHook', 'ExpLrUpdaterHook',
+ 'PolyLrUpdaterHook', 'InvLrUpdaterHook', 'CosineAnnealingLrUpdaterHook',
+ 'FlatCosineAnnealingLrUpdaterHook', 'CosineRestartLrUpdaterHook',
+ 'CyclicLrUpdaterHook', 'OneCycleLrUpdaterHook', 'MomentumUpdaterHook',
+ 'StepMomentumUpdaterHook', 'CosineAnnealingMomentumUpdaterHook',
+ 'CyclicMomentumUpdaterHook', 'OneCycleMomentumUpdaterHook',
+ 'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook',
+ 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
+ 'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook',
+ 'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict',
+ 'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority',
+ 'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict',
+ 'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS',
+ 'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer',
+ 'build_optimizer_constructor', 'IterLoader', 'set_random_seed',
+ 'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook',
+ 'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads',
+ 'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule',
+ '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential',
+ 'ModuleDict', 'ModuleList', 'GradientCumulativeOptimizerHook',
+ 'GradientCumulativeFp16OptimizerHook', 'DefaultRunnerConstructor',
+ 'SegmindLoggerHook', 'LinearAnnealingMomentumUpdaterHook',
+ 'LinearAnnealingLrUpdaterHook', 'CmfLoggerHook'
+]
diff --git a/examples/al_object_detection/cmf_logger_mmcv/hooks/__init__.py b/examples/al_object_detection/cmf_logger_mmcv/hooks/__init__.py
new file mode 100644
index 00000000..501562f8
--- /dev/null
+++ b/examples/al_object_detection/cmf_logger_mmcv/hooks/__init__.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .checkpoint import CheckpointHook
+from .closure import ClosureHook
+from .ema import EMAHook
+from .evaluation import DistEvalHook, EvalHook
+from .hook import HOOKS, Hook
+from .iter_timer import IterTimerHook
+from .logger import (DvcliveLoggerHook, LoggerHook, MlflowLoggerHook,
+ NeptuneLoggerHook, PaviLoggerHook, SegmindLoggerHook,
+ TensorboardLoggerHook, TextLoggerHook, WandbLoggerHook, CmfLoggerHook)
+from .lr_updater import (CosineAnnealingLrUpdaterHook,
+ CosineRestartLrUpdaterHook, CyclicLrUpdaterHook,
+ ExpLrUpdaterHook, FixedLrUpdaterHook,
+ FlatCosineAnnealingLrUpdaterHook, InvLrUpdaterHook,
+ LinearAnnealingLrUpdaterHook, LrUpdaterHook,
+ OneCycleLrUpdaterHook, PolyLrUpdaterHook,
+ StepLrUpdaterHook)
+from .memory import EmptyCacheHook
+from .momentum_updater import (CosineAnnealingMomentumUpdaterHook,
+ CyclicMomentumUpdaterHook,
+ LinearAnnealingMomentumUpdaterHook,
+ MomentumUpdaterHook,
+ OneCycleMomentumUpdaterHook,
+ StepMomentumUpdaterHook)
+from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
+ GradientCumulativeOptimizerHook, OptimizerHook)
+from .profiler import ProfilerHook
+from .sampler_seed import DistSamplerSeedHook
+from .sync_buffer import SyncBuffersHook
+
+__all__ = [
+ 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
+ 'FixedLrUpdaterHook', 'StepLrUpdaterHook', 'ExpLrUpdaterHook',
+ 'PolyLrUpdaterHook', 'InvLrUpdaterHook', 'CosineAnnealingLrUpdaterHook',
+ 'FlatCosineAnnealingLrUpdaterHook', 'CosineRestartLrUpdaterHook',
+ 'CyclicLrUpdaterHook', 'OneCycleLrUpdaterHook', 'OptimizerHook',
+ 'Fp16OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook',
+ 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook',
+ 'TextLoggerHook', 'TensorboardLoggerHook', 'NeptuneLoggerHook',
+ 'WandbLoggerHook', 'DvcliveLoggerHook', 'MomentumUpdaterHook',
+ 'StepMomentumUpdaterHook', 'CosineAnnealingMomentumUpdaterHook',
+ 'CyclicMomentumUpdaterHook', 'OneCycleMomentumUpdaterHook',
+ 'SyncBuffersHook', 'EMAHook', 'EvalHook', 'DistEvalHook', 'ProfilerHook',
+ 'GradientCumulativeOptimizerHook', 'GradientCumulativeFp16OptimizerHook',
+ 'SegmindLoggerHook', 'LinearAnnealingLrUpdaterHook',
+ 'LinearAnnealingMomentumUpdaterHook', 'CmfLoggerHook'
+]
diff --git a/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/__init__.py b/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/__init__.py
new file mode 100644
index 00000000..da3dedd1
--- /dev/null
+++ b/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import LoggerHook
+from .dvclive import DvcliveLoggerHook
+from .mlflow import MlflowLoggerHook
+from .neptune import NeptuneLoggerHook
+from .pavi import PaviLoggerHook
+from .segmind import SegmindLoggerHook
+from .tensorboard import TensorboardLoggerHook
+from .text import TextLoggerHook
+from .wandb import WandbLoggerHook
+from .cmflogger import CmfLoggerHook
+
+__all__ = [
+ 'LoggerHook', 'CmfLoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook',
+ 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook',
+ 'NeptuneLoggerHook', 'DvcliveLoggerHook', 'SegmindLoggerHook'
+]
diff --git a/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/cmflogger.py b/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/cmflogger.py
new file mode 100644
index 00000000..1c3e0cff
--- /dev/null
+++ b/examples/al_object_detection/cmf_logger_mmcv/hooks/logger/cmflogger.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+###
+# Copyright (2024) Hewlett Packard Enterprise Development LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###
+
+from mmcv.utils import TORCH_VERSION
+from ...dist_utils import master_only
+from ..hook import HOOKS
+from .base import LoggerHook
+import uuid
+import os
+import sys
+
+
+@HOOKS.register_module()
+class CmfLoggerHook(LoggerHook):
+ """Class to log metrics and (optionally) a trained model to CMF.
+
+ It requires `cmflib`_ to be installed.
+
+ Args:
+ mlmd_file_path (str): Path to mlmd file.
+        exp_name (str, optional): Name of the experiment to be used.
+            Default None. If not None, set the active experiment.
+            If exp_name is not provided, an experiment name
+            will be created using a uuid.
+ tags (Dict[str], optional): Tags for the current run.
+ Default None. If not None, set tags for the current run.
+ interval (int): Logging interval (every k iterations). Default: 10.
+ ignore_last (bool): Ignore the log of last iterations in each epoch
+ if less than `interval`. Default: True.
+ reset_flag (bool): Whether to clear the output buffer after logging.
+ Default: False.
+ by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
+ """
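+    # Class-level counters shared across instances; used below to build unique
+    # commit names for the training and validation metric logs.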
+ Training_metric_count = 1
+ Validation_metric_count = 1
+
+ @master_only
+ def __init__(self,
+ mlmd_file_path = os.path.join('/'+os.environ['DVC_ROOT']) + '/mlmd',
+ exp_name=None,
+ tags=None,
+ interval=10, #needed by super class
+ ignore_last=True, #needed by super class
+ reset_flag=False, #needed by super class
+ by_epoch=True): #needed by super class
+ super(CmfLoggerHook, self).__init__(interval, ignore_last,
+ reset_flag, by_epoch)
+ self.mlmdFilePath = mlmd_file_path
+ self.exp_name = exp_name if exp_name else str(uuid.uuid4())
+ self.tags = tags
+
+ try:
+ from cmflib import cmf
+ from cmflib import cmfquery
+ except ImportError:
+ raise ImportError(
+ 'Please run "pip install cmflib" to install cmflib')
+        self.cmf_logger = cmf.Cmf(filename=self.mlmdFilePath, pipeline_name=self.exp_name, graph=True)
+ self.context = self.cmf_logger.create_context(os.environ['stage_name'])
+
+ cmd = str(' '.join(sys.argv))
+ self.execution = self.cmf_logger.create_execution(
+ str(os.environ['stage_name'])+'_'+str(os.environ['execution_name']),
+ {},
+ cmd = str(cmd),
+ create_new_execution=False
+ )
+
+ self.prefix = 'Training_Metrics_' + str(os.environ['stage_name'])+'_'+str(os.environ['execution_name'])
+
+ @master_only
+ def log(self, runner):
+ tags = self.get_loggable_tags(runner)
+ mode = self.get_mode(runner)
+ self.mode = mode
+ if tags:
+ if mode == 'train':
+ self.commit_name = self.prefix + '_' + str(CmfLoggerHook.Training_metric_count)
+ self.cmf_logger.log_metric(self.commit_name, tags)
+
+ else:
+ prefix = 'Validation_Metrics_' + str(os.environ['stage_name'])+'_'+str(os.environ['execution_name'])
+ commit_name = prefix + '_' + str(CmfLoggerHook.Validation_metric_count)
+ self.cmf_logger.log_execution_metrics(commit_name, tags)
+ CmfLoggerHook.Validation_metric_count+=1
+
+ @master_only
+ def after_run(self, runner):
+ self.cmf_logger.commit_metrics(self.commit_name)
+ CmfLoggerHook.Training_metric_count+=1
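+
+
+# Usage sketch (illustrative, not executed by this module): the hook is registered through
+# the mmcv log_config, as configs/_base_/default_runtime.py does in this change:
+#
+#   log_config = dict(
+#       interval=1,
+#       hooks=[dict(type='TensorboardLoggerHook'),
+#              dict(type='CmfLoggerHook', exp_name='ExperimentActiveLearning')])
+#
+# Note that DVC_ROOT, stage_name and execution_name must be present in the environment
+# before the hook is constructed, since __init__ reads them via os.environ.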
diff --git a/examples/al_object_detection/configs/MIAOD-GRAY.py b/examples/al_object_detection/configs/MIAOD-GRAY.py
index 4ca6556f..efa9faa0 100644
--- a/examples/al_object_detection/configs/MIAOD-GRAY.py
+++ b/examples/al_object_detection/configs/MIAOD-GRAY.py
@@ -1,5 +1,6 @@
# Please change the dataset directory to your actual directory
-data_root = '/mnt/beegfs/PAMS/data/tomography_data/tiled_annotations/'
+#data_root = '/lustre/data/hdcdatasets/'
+data_root = '/mnt/beegfs/HDC/data/tomography_data/tiled_annotations'
_base_ = [
'./_base_/retinanet_r50_fpn.py', './_base_/hdc.py',
@@ -9,16 +10,17 @@
data = dict(
test=dict(
ann_file=[
- data_root + 'train.txt'#TRAINING SET INDEXES?
+ data_root + '/train.txt',
],
- img_prefix=[data_root])
+ img_prefix=[data_root ])
)
-model = dict(bbox_head=dict(C=2))
+model = dict(bbox_head=dict(C=1))
# The initial learning rate, momentum, weight decay can be changed here.
-optimizer = dict(type='SGD', lr=1e-6, momentum=0.9, weight_decay=0.0001)#changed lr from 1e-3
-#optimizer = dict(type='Adam', lr=1e-6, momentum=0.9, weight_decay=0.0001)#changed lr from 1e-3
+optimizer = dict(type='Adam', lr=12e-6, weight_decay=0.0001)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+#optimizer_config = dict(grad_clip=None)
-optimizer_config = dict(grad_clip=None)
# The moment when the learning rate drops can be changed here.
lr_config = dict(policy='step', step=[2])
# The frequency of saving models can be changed here.
@@ -31,7 +33,7 @@
# The frequency of evaluating the model can be changed here.
evaluation = dict(interval=epoch_ratio[0], metric='mAP')
# The number of outer loops (i.e., all 3 training steps except the first Label Set Training step) can be changed here.
-epoch = 1
+epoch = 7
# The repeat time for the labeled sets and unlabeled sets can be changed here.
# The number of repeat times can be equivalent to the number of actual training epochs.
X_L_repeat = 2
@@ -41,10 +43,16 @@
k = 10000
# The size of the initial labeled set and the newly selected sets after each cycle can be set here.
# Note that there are 16551 images in the PASCAL VOC 2007+2012 trainval sets.
-X_S_size = 12150#9964//40 #should be percentage of all available images (set this to max)
-X_L_0_size = 12150#9964//20#(set to zero)?
+def make_even(x):
+ return (x//2)*2
+X_S_size = 1000#make_even(12150//40)#9964//40
+X_L_0_size = make_even(12150)#make_even(12150//10)#9964//20
# The active learning cycles can be changed here.
cycles = [0, 1, 2, 3, 4, 5, 6]
# The work directory for saving logs and files can be changed here. Please refer to README.md for more information.
work_directory = './work_dirs/MI-AOD'
-
+# Output bounding boxes for most uncertain image regions to help guide labeling
+plot_nuboxes = 5
+nubox_colors = [(0,0,255), (0,255,255), (0,255,0), (255,255,0), (255,0,0)]
+check_nuboxes = 1000
+guide_image_dir = data_root + '/hdc/GuideImages'
diff --git a/examples/al_object_detection/configs/MIAOD.py b/examples/al_object_detection/configs/MIAOD.py
index f53cb78b..57198af9 100644
--- a/examples/al_object_detection/configs/MIAOD.py
+++ b/examples/al_object_detection/configs/MIAOD.py
@@ -1,6 +1,6 @@
# Please change the dataset directory to your actual directory
#data_root = '/home/mfoltin2/AI/al/object_detection/data/VOCdevkit/'
-data_root = '/mnt/beegfs/PAMS/data/tomography_data/tiled_annotations/'
+data_root = '/mnt/beegfs/HDC/data/tomography_data/tiled_annotations/'
_base_ = [
'./_base_/retinanet_r50_fpn.py', './_base_/voc0712.py',
diff --git a/examples/al_object_detection/configs/_base_/.hdc.py.swp b/examples/al_object_detection/configs/_base_/.hdc.py.swp
new file mode 100644
index 00000000..fd87f2ff
Binary files /dev/null and b/examples/al_object_detection/configs/_base_/.hdc.py.swp differ
diff --git a/examples/al_object_detection/configs/_base_/default_runtime.py b/examples/al_object_detection/configs/_base_/default_runtime.py
index 2908c54a..eb1d4a0b 100644
--- a/examples/al_object_detection/configs/_base_/default_runtime.py
+++ b/examples/al_object_detection/configs/_base_/default_runtime.py
@@ -3,8 +3,8 @@
log_config = dict(
interval=1,
hooks=[
- dict(type='TextLoggerHook'),
- # dict(type='TensorboardLoggerHook')
+ dict(type='TensorboardLoggerHook'),
+ dict(type = 'CmfLoggerHook', exp_name = 'ExperimentActiveLearning')
])
# yapf:enable
dist_params = dict(backend='nccl')
diff --git a/examples/al_object_detection/configs/_base_/hdc.py b/examples/al_object_detection/configs/_base_/hdc.py
index 68414141..b63ff182 100644
--- a/examples/al_object_detection/configs/_base_/hdc.py
+++ b/examples/al_object_detection/configs/_base_/hdc.py
@@ -1,5 +1,6 @@
# Please change the dataset directory to your actual directory
-data_root = '/mnt/beegfs/PAMS/data/tomography_data/tiled_annotations/'
+#data_root = '/lustre/data/hdcdatasets/'
+data_root = '/mnt/beegfs/HDC/data/tomography_data/tiled_annotations'
# dataset settings
dataset_type = 'HDCDataset'
@@ -10,7 +11,9 @@
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(960, 960), keep_ratio=True),
- dict(type='RandomFlip', flip_ratio=0.66),#,direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5),#,direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5,direction='vertical'),
+
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
@@ -26,7 +29,8 @@
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
- dict(type='RandomFlip', flip_ratio=0.66),#, direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5),#,direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5,direction='vertical'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
@@ -34,26 +38,26 @@
])
]
data = dict(
- samples_per_gpu=2,
- workers_per_gpu=2,
+ samples_per_gpu=8,
+ workers_per_gpu=8,
train=dict(
type='RepeatDataset',
- times=3,
+ times=3,#was 3
dataset=dict(
type=dataset_type,
ann_file=[
- data_root + 'train.txt'#training set indexes
+ data_root + '/train.txt'#training set indexes
],
img_prefix=[data_root ],
pipeline=train_pipeline)),
val=dict(
type=dataset_type,
- ann_file=data_root + 'train_val.txt',#test set indexes
+ ann_file=data_root + '/train_val.txt',#test set indexes
img_prefix=data_root,
pipeline=test_pipeline),
test=dict(
type=dataset_type,
- ann_file=data_root + 'train.txt',#should be same as training
+ ann_file=data_root + '/train.txt',#should be same as training
img_prefix=data_root ,
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='mAP')
diff --git a/examples/al_object_detection/configs/_base_/hdc_copy.py b/examples/al_object_detection/configs/_base_/hdc_copy.py
new file mode 100644
index 00000000..af7344d6
--- /dev/null
+++ b/examples/al_object_detection/configs/_base_/hdc_copy.py
@@ -0,0 +1,86 @@
+# Please change the dataset directory to your actual directory
+#data_root = '/lustre/data/hdcdatasets/'
+data_root = '/mnt/beegfs/HDC/data/tomography_data/tiled_annotations/'
+
+# dataset settings
+dataset_type = 'HDCDataset'
+img_norm_cfg = dict(
+ mean=[123.675, 123.675, 123.675], std=[58.395, 58.395, 58.395], to_rgb=True)
+train_pipeline = [
+ #dict(type='LoadImageFromFile', color_type='grayscale'),
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='Resize', img_scale=(960, 960), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),#,direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5,direction='vertical'),
+
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+    # The Albu ShiftScaleRotate transform below is disabled; a raw string inside the
+    # pipeline list would break mmcv's Compose, so it is kept as comments for reference.
+    # dict(type='Albu',
+    #     transforms=[dict(
+    #         type='ShiftScaleRotate',
+    #         shift_limit=0.0,
+    #         scale_limit=0.0,
+    #         rotate_limit=[90,90],
+    #         interpolation=1,
+    #         p=0.5),
+    #     ],
+    #     bbox_params=dict(
+    #         type='BboxParams',
+    #         format='pascal_voc',
+    #         label_fields=['gt_labels'],
+    #         min_visibility=0.0,
+    #         filter_lost_elements=True),
+    #     keymap={
+    #         'img': 'image',
+    #         'gt_masks': 'masks',
+    #         'gt_bboxes': 'bboxes'
+    #     },
+    #     update_pad_shape=False,
+    #     skip_img_without_anno=False
+    #     ),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+
+ #dict(type='LoadImageFromFile', color_type='grayscale'),
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(960, 960),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),#,direction=['horizontal','vertical','diagonal']),
+ dict(type='RandomFlip', flip_ratio=0.5,direction='vertical'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=8,
+ train=dict(
+ type='RepeatDataset',
+ times=3,#was 3
+ dataset=dict(
+ type=dataset_type,
+ ann_file=[
+ data_root + 'train.txt'#training set indexes
+ ],
+ img_prefix=[data_root ],
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'train_val.txt',#test set indexes
+ img_prefix=data_root,
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=data_root + 'train.txt',#should be same as training
+ img_prefix=data_root ,
+ pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='mAP')
diff --git a/examples/al_object_detection/configs/model.py b/examples/al_object_detection/configs/model.py
new file mode 100644
index 00000000..d66aa47d
--- /dev/null
+++ b/examples/al_object_detection/configs/model.py
@@ -0,0 +1,371 @@
+import torch.nn as nn
+import torch
+import math
+import torch.utils.model_zoo as model_zoo
+from torchvision.ops import nms
+from retinanet.utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes
+from retinanet.anchors import Anchors
+from retinanet import losses
+
+model_urls = {
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+
+class PyramidFeatures(nn.Module):
+ def __init__(self, C3_size, C4_size, C5_size, feature_size=256):
+ super(PyramidFeatures, self).__init__()
+
+ # upsample C5 to get P5 from the FPN paper
+ self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
+ self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
+ self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
+
+ # add P5 elementwise to C4
+ self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
+ self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
+ self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
+
+ # add P4 elementwise to C3
+ self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
+ self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
+
+ # "P6 is obtained via a 3x3 stride-2 conv on C5"
+ self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)
+
+ # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
+ self.P7_1 = nn.ReLU()
+ self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)
+
+ def forward(self, inputs):
+ C3, C4, C5 = inputs
+
+ P5_x = self.P5_1(C5)
+ P5_upsampled_x = self.P5_upsampled(P5_x)
+ P5_x = self.P5_2(P5_x)
+
+ P4_x = self.P4_1(C4)
+ P4_x = P5_upsampled_x + P4_x
+ P4_upsampled_x = self.P4_upsampled(P4_x)
+ P4_x = self.P4_2(P4_x)
+
+ P3_x = self.P3_1(C3)
+ P3_x = P3_x + P4_upsampled_x
+ P3_x = self.P3_2(P3_x)
+
+ P6_x = self.P6(C5)
+
+ P7_x = self.P7_1(P6_x)
+ P7_x = self.P7_2(P7_x)
+
+ return [P3_x, P4_x, P5_x, P6_x, P7_x]
+
+
+class RegressionModel(nn.Module):
+ def __init__(self, num_features_in, num_anchors=9, feature_size=256):
+ super(RegressionModel, self).__init__()
+
+ self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
+ self.act1 = nn.ReLU()
+
+ self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act2 = nn.ReLU()
+
+ self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act3 = nn.ReLU()
+
+ self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act4 = nn.ReLU()
+
+ self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1)
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = self.act1(out)
+
+ out = self.conv2(out)
+ out = self.act2(out)
+
+ out = self.conv3(out)
+ out = self.act3(out)
+
+ out = self.conv4(out)
+ out = self.act4(out)
+
+ out = self.output(out)
+
+ # out is B x C x W x H, with C = 4*num_anchors
+ out = out.permute(0, 2, 3, 1)
+
+ return out.contiguous().view(out.shape[0], -1, 4)
+
+
+class ClassificationModel(nn.Module):
+ def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
+ super(ClassificationModel, self).__init__()
+
+ self.num_classes = num_classes
+ self.num_anchors = num_anchors
+
+ self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
+ self.act1 = nn.ReLU()
+
+ self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act2 = nn.ReLU()
+
+ self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act3 = nn.ReLU()
+
+ self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
+ self.act4 = nn.ReLU()
+
+ self.output = nn.Conv2d(feature_size, num_anchors * num_classes, kernel_size=3, padding=1)
+ self.output_act = nn.Sigmoid()
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = self.act1(out)
+
+ out = self.conv2(out)
+ out = self.act2(out)
+
+ out = self.conv3(out)
+ out = self.act3(out)
+
+ out = self.conv4(out)
+ out = self.act4(out)
+
+ out = self.output(out)
+ out = self.output_act(out)
+
+        # out is B x C x W x H, with C = num_anchors * num_classes
+ out1 = out.permute(0, 2, 3, 1)
+
+ batch_size, width, height, channels = out1.shape
+
+ out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes)
+
+ return out2.contiguous().view(x.shape[0], -1, self.num_classes)
+
+
+class ResNet(nn.Module):
+
+ def __init__(self, num_classes, block, layers):
+ self.inplanes = 64
+ super(ResNet, self).__init__()
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+ if block == BasicBlock:
+ fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels,
+ self.layer4[layers[3] - 1].conv2.out_channels]
+ elif block == Bottleneck:
+ fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels,
+ self.layer4[layers[3] - 1].conv3.out_channels]
+ else:
+ raise ValueError(f"Block type {block} not understood")
+
+ self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2])
+
+ self.regressionModel = RegressionModel(256)
+ self.classificationModel = ClassificationModel(256, num_classes=num_classes)
+
+ self.anchors = Anchors()
+
+ self.regressBoxes = BBoxTransform()
+
+ self.clipBoxes = ClipBoxes()
+
+ self.focalLoss = losses.FocalLoss()
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ m.weight.data.normal_(0, math.sqrt(2. / n))
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1)
+ m.bias.data.zero_()
+
+ prior = 0.01
+
+ self.classificationModel.output.weight.data.fill_(0)
+ self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior))
+
+ self.regressionModel.output.weight.data.fill_(0)
+ self.regressionModel.output.bias.data.fill_(0)
+
+ self.freeze_bn()
+
+ def _make_layer(self, block, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion,
+ kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = [block(self.inplanes, planes, stride, downsample)]
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ def freeze_bn(self):
+ '''Freeze BatchNorm layers.'''
+ for layer in self.modules():
+ if isinstance(layer, nn.BatchNorm2d):
+ layer.eval()
+
+ def forward(self, *inputs, **kw):
+ if 'device' in kw:
+ device = kw['device']
+ else:
+ device = None
+ if 'gamma' in kw:
+ gamma = kw['gamma']
+ if 'alpha' in kw:
+ alpha = kw['alpha']
+
+        img_batch = inputs[0]  # at test time there are no annotations
+ x = self.conv1(img_batch)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x1 = self.layer1(x)
+ x2 = self.layer2(x1)
+ x3 = self.layer3(x2)
+ x4 = self.layer4(x3)
+
+ features = self.fpn([x2, x3, x4])
+
+ regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)
+
+ classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)
+ #classification is (batch, NUM_FEATURES, NUM_CLASSES)
+ anchors = self.anchors(img_batch)
+ if self.training:
+ return classification, regression, anchors
+ #return self.focalLoss(classification, regression, anchors, annotations)#, alpha, gamma)
+ else:
+ transformed_anchors = self.regressBoxes(anchors, regression)
+ transformed_anchors = self.clipBoxes(transformed_anchors, img_batch)
+ BATCH = classification.shape[0]
+ NUM_CLASSES = classification.shape[2]
+ allScores = []
+ allClasses = []
+ allAnchors = []
+ for batch_ind in range(BATCH):
+ finalResult = [[], [], []]
+
+ finalScores = torch.Tensor([])
+ finalAnchorBoxesIndexes = torch.Tensor([]).long()
+ finalAnchorBoxesCoordinates = torch.Tensor([])
+ if device is not None:
+ finalScores = finalScores.to(device)
+ finalAnchorBoxesIndexes = finalAnchorBoxesIndexes.to(device)
+ finalAnchorBoxesCoordinates = finalAnchorBoxesCoordinates.to(device)
+ elif torch.cuda.is_available():
+ finalScores = finalScores.cuda()
+ finalAnchorBoxesIndexes = finalAnchorBoxesIndexes.cuda()
+ finalAnchorBoxesCoordinates = finalAnchorBoxesCoordinates.cuda()
+
+ for i in range(NUM_CLASSES):#for every class
+ #print('classif',classification.shape)
+ scores = torch.squeeze(classification[batch_ind, :, i])#classification[:, :, i]
+ scores_over_thresh = (scores > 0.05)
+ if scores_over_thresh.sum() == 0:
+ # no boxes to NMS, just continue
+ continue
+
+ scores = scores[scores_over_thresh]
+ #print('tbox',transformed_anchors.shape)
+ anchorBoxes = torch.squeeze(transformed_anchors[batch_ind,:,:])
+ #print('abox',anchorBoxes.shape)
+ anchorBoxes = anchorBoxes[scores_over_thresh]
+ #print('abox after',anchorBoxes.shape)
+                    anchors_nms_idx = nms(anchorBoxes, scores, 0.5)  # NMS with IoU threshold 0.5
+ finalResult[0].extend(scores[anchors_nms_idx])
+ finalResult[1].extend(torch.tensor([i] * anchors_nms_idx.shape[0]))
+ finalResult[2].extend(anchorBoxes[anchors_nms_idx])
+
+ finalScores = torch.cat((finalScores, scores[anchors_nms_idx]))
+ finalAnchorBoxesIndexesValue = torch.tensor([i] * anchors_nms_idx.shape[0])
+ if torch.cuda.is_available():
+ finalAnchorBoxesIndexesValue = finalAnchorBoxesIndexesValue.cuda()
+
+ finalAnchorBoxesIndexes = torch.cat((finalAnchorBoxesIndexes, finalAnchorBoxesIndexesValue))
+ finalAnchorBoxesCoordinates = torch.cat((finalAnchorBoxesCoordinates, anchorBoxes[anchors_nms_idx]))
+
+ #results += [finalScores, finalAnchorBoxesIndexes, finalAnchorBoxesCoordinates]
+ allScores.append(finalScores)
+ allClasses.append(finalAnchorBoxesIndexes)
+ allAnchors.append(finalAnchorBoxesCoordinates)
+ return allScores, allClasses , allAnchors
+
+
+def resnet18(num_classes, pretrained=False, **kwargs):
+ """Constructs a ResNet-18 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs)
+ if pretrained:
+ model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='./Resnet'), strict=False)
+ return model
+
+
+def resnet34(num_classes, pretrained=False, **kwargs):
+ """Constructs a ResNet-34 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs)
+ if pretrained:
+ model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='./ResNet'), strict=False)
+ return model
+
+
+def resnet50(num_classes, pretrained=False, **kwargs):
+ """Constructs a ResNet-50 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs)
+ if pretrained:
+ model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='./ResNet'), strict=False)
+ return model
+
+
+def resnet101(num_classes, pretrained=False, **kwargs):
+ """Constructs a ResNet-101 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs)
+ if pretrained:
+ model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='./ResNet'), strict=False)
+ return model
+
+
+def resnet152(num_classes, pretrained=False, **kwargs):
+ """Constructs a ResNet-152 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs)
+ if pretrained:
+ model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='./ResNet'), strict=False)
+ return model
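+
+
+# Illustrative usage (assumption: the retinanet package imported above is on the path):
+#   model = resnet50(num_classes=1, pretrained=True)
+#   model.train()   # forward() then returns (classification, regression, anchors)
+#   model.eval()    # forward() then returns per-image scores, classes and boxes after NMS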
diff --git a/examples/al_object_detection/cycle.yaml b/examples/al_object_detection/cycle.yaml
new file mode 100644
index 00000000..03a7f564
--- /dev/null
+++ b/examples/al_object_detection/cycle.yaml
@@ -0,0 +1,76 @@
+vars:
+ - configs/MIAOD-GRAY.py:data_root
+ - al_cycle.json:al_cycle,next_cycle,al_seed
+
+stages:
+ input_sample_lists:
+ cmd:
+ python input_lists.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --labeled ${data_root}/training.txt --unlabeled ${data_root}/unlabeled.txt --seed ${al_seed} --train ${data_root}/train.txt --selected work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy
+ deps:
+ - ${data_root}/training.txt
+ - ${data_root}/unlabeled.txt
+ outs:
+ - ${data_root}/train.txt
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_U_${al_cycle}.npy
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - al_seed
+
+ active_learning_training:
+ cmd:
+ #single gpu
+ python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --labeled work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy --seed ${al_seed} --model work_dirs/test/cycle.pth
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=6 --master_port=39025 cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --labeled work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy --seed ${al_seed} --model work_dirs/test/cycle.pth --gpus 6 --launcher pytorch
+ deps:
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_U_${al_cycle}.npy
+ outs:
+ - work_dirs/test/cycle.pth
+ params:
+ - al_cycle.json:
+ - al_cycle
+ - al_seed
+
+ active_learning_selection:
+ cmd:
+ #single gpu
+ python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --model work_dirs/test/cycle.pth --labeled work_dirs/test/X_L_${al_cycle}.npy --seed ${al_seed} --labeled_next work_dirs/test/X_L_${next_cycle}.npy --unselected work_dirs/test/X_U_${next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${next_cycle}.txt
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=4 --master_port=39016 cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --model work_dirs/test/cycle.pth --labeled work_dirs/test/X_L_${al_cycle}.npy --seed ${al_seed} --labeled_next work_dirs/test/X_L_${next_cycle}.npy --unselected work_dirs/test/X_U_${next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${next_cycle}.txt --gpus 4 --launcher pytorch
+ deps:
+ - work_dirs/test/cycle.pth
+ - work_dirs/test/X_L_${al_cycle}.npy
+ outs:
+ - work_dirs/test/X_L_${next_cycle}.npy
+ - work_dirs/test/X_U_${next_cycle}.npy
+ params:
+ - al_cycle.json:
+ - al_cycle
+ - next_cycle
+ - al_seed
+
+ output_sample_lists:
+ cmd:
+ python output_lists.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --selected work_dirs/test/X_L_${al_cycle}.npy --selected_next work_dirs/test/X_L_${next_cycle}.npy --train ${data_root}/train.txt --map ${data_root}/map.txt --seed ${al_seed} --label_next ${data_root}/label_next.txt --labeled ${data_root}/training_next.txt --unlabeled ${data_root}/unlabeled_next.txt --cycle_config al_cycle.json --cycle_config_next al_cycle_next.json
+ deps:
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_L_${next_cycle}.npy
+ - ${data_root}/train.txt
+ - ${data_root}/map.txt
+ outs:
+ - ${data_root}/label_next.txt
+ - ${data_root}/training_next.txt
+ - ${data_root}/unlabeled_next.txt
+ - al_cycle_next.json
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - next_cycle
+ - al_seed
diff --git a/examples/al_object_detection/cycle_select.py b/examples/al_object_detection/cycle_select.py
index 1c758f2f..a962a85c 100644
--- a/examples/al_object_detection/cycle_select.py
+++ b/examples/al_object_detection/cycle_select.py
@@ -2,6 +2,7 @@
import os
import os.path as osp
import time
+import json
import mmcv
import torch
@@ -20,6 +21,7 @@
from mmdet.utils import collect_env, get_root_logger
from mmdet.utils.active_datasets import *
+import cv2
def parse_args():
parser = argparse.ArgumentParser(description='Select informative images')
@@ -30,6 +32,7 @@ def parse_args():
parser.add_argument('--labeled_next',
help='next cycle labeled samples list file')
parser.add_argument('--unselected', help='unselected samples list file')
+ parser.add_argument('--bbox_output', help='labeling guidance file')
parser.add_argument('--work_directory',
help='the dir to save logs and model checkpoints')
parser.add_argument(
@@ -51,6 +54,11 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none', help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
+
+ parser.add_argument('--strategy', choices=['active_learning', 'random'],
+ default='active_learning')
+ parser.add_argument('--stage_name', help='Name of the current stage')
+ parser.add_argument('--execution_name', help='Name for current execution')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
@@ -71,6 +79,13 @@ def main():
if args.options is not None:
cfg.merge_from_dict(args.options)
+ # if cfg.plot_nuboxes is set, check that bbox_output file name is given
+ assert (not cfg.get('plot_nuboxes') or args.bbox_output), \
+ ('When output of labeling guides is turned on in the config file '
+ '(cfg.plot_nuboxes > 0), command line argument --bbox_output '
+ 'needs to be used to specify output file for bounding boxes '
+ 'from most uncertain predictions')
+
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
@@ -107,6 +122,12 @@ def main():
# create work_directory
mmcv.mkdir_or_exist(osp.abspath(cfg.work_directory))
+ stage_name = args.stage_name
+ os.environ['stage_name'] = stage_name
+
+ execution_name = args.execution_name
+ os.environ['execution_name'] = execution_name
+
# dump config
cfg.dump(osp.join(cfg.work_directory,
f'cycle_select{args.cycle}_' + osp.basename(args.config)))
@@ -136,7 +157,7 @@ def main():
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, samples_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
- dist=False, shuffle=False)
+ dist=distributed, shuffle=False)
# build the model and load checkpoint
model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
@@ -148,9 +169,17 @@ def main():
model = fuse_module(model)
# calculate uncertainty
- uncertainty = calculate_uncertainty(cfg, model, data_loader,
- return_box=False)
-
+ if cfg.get('plot_nuboxes') and (cfg.plot_nuboxes > 0):
+ plot_nuboxes = cfg.plot_nuboxes
+ uncertainty, udets = calculate_uncertainty(cfg, model, data_loader,
+ plot_nuboxes=plot_nuboxes,
+ return_box=False)
+ return_X_S = True
+ else:
+ uncertainty = calculate_uncertainty(cfg, model, data_loader,
+ plot_nuboxes=0, return_box=False)
+ return_X_S = False
+
# update labeled set
all_anns = load_ann_list(cfg.data.train.dataset.ann_file)
if len(all_anns[0]) == 1:
@@ -161,12 +190,77 @@ def main():
j += len(all_anns[i])
X_all = np.arange(j)
X_L = np.load(args.labeled)
- X_L_next, X_U = update_X_L(uncertainty, X_all, X_L, cfg.X_S_size)
+ if args.strategy == 'active_learning':
+ if return_X_S:
+ X_L_next, X_U, X_S = update_X_L(uncertainty, X_all, X_L,
+ cfg.X_S_size, return_X_S=True)
+ else:
+ X_L_next, X_U = update_X_L(uncertainty, X_all, X_L, cfg.X_S_size,
+ return_X_S=False)
+ else:
+ if return_X_S:
+ X_L_next, X_U, X_S = update_X_L_random( X_all, X_L, cfg.X_S_size,
+ return_X_S=True )
+ else:
+ X_L_next, X_U = update_X_L_random( X_all, X_L, cfg.X_S_size,
+ return_X_S=False )
    # save next cycle labeled and unselected lists
np.save(args.labeled_next, X_L_next)
np.save(args.unselected, X_U)
+ # output bounding box hints for highest uncertainty areas in selected images
+ if return_X_S:
+ with open(args.bbox_output, 'w') as f:
+ entries = []
+ for i in np.flip(X_S):
+ if len(all_anns[0]) == 1:
+ idx = all_anns[i]
+ else:
+ j = 0
+ for k in range(len(all_anns)):
+ j += len(all_anns[k])
+ if j > i:
+ idx = all_anns[k-1][i-(j-len(all_anns[k]))]
+ break
+ nested_data = {}
+ #print("Image %s, mean uncertainty %f" % (idx, uncertainty[i]),
+ # file=f)
+ print("Image %s, mean uncertainty %f" % (idx, uncertainty[i]))
+ nested_data['image'] = str(idx)
+ nested_data['mean_uncertainty'] = str(float(uncertainty[i]))
+ nested_data['bboxes']=[]
+
+ for udet in udets[i]:
+ inner_entry = {
+ 'x1':str(int(udet[0])),
+ 'y1':str(int(udet[1])),
+ 'x2':str(int(udet[2])),
+ 'y2':str(int(udet[3])),
+ 'uncertainty':str(float(udet[4]))
+ }
+ nested_data['bboxes'].append(inner_entry)
+ #print("bbox (%d, %d) (%d, %d), uncertainty %f" %
+ # (udet[0], udet[1], udet[2], udet[3], udet[4]), file=f)
+ print("bbox (%d, %d) (%d, %d), uncertainty %f" %
+ (udet[0], udet[1], udet[2], udet[3], udet[4]))
+ entries.append(nested_data)
+ #print("", file=f)
+
+ image_read_path = osp.join(cfg.data.test.img_prefix[0],
+ 'JPEGImages', '{}.jpg'.format(idx))
+ image = cv2.imread(image_read_path)
+ for udet, nubox_color in zip(udets[i], cfg.nubox_colors):
+ image = cv2.rectangle(image,
+ (int(udet[0].item()), int(udet[1].item())),
+ (int(udet[2].item()), int(udet[3].item())),
+ color=nubox_color, thickness=2)
+ image_write_path = osp.join(cfg.guide_image_dir,
+ '{}.jpg'.format(idx))
+ cv2.imwrite(image_write_path, image)
+ json.dump(entries,f, indent=4)
+ #f.write(entries)
+
if __name__ == '__main__':
main()
diff --git a/examples/al_object_detection/cycle_train.py b/examples/al_object_detection/cycle_train.py
index 2ecf0d60..187c452d 100644
--- a/examples/al_object_detection/cycle_train.py
+++ b/examples/al_object_detection/cycle_train.py
@@ -52,6 +52,8 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none', help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
+ parser.add_argument('--stage_name', help='Name of the current stage')
+ parser.add_argument('--execution_name', help='Name for current execution')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
@@ -99,6 +101,11 @@ def main():
# create work_directory
mmcv.mkdir_or_exist(osp.abspath(cfg.work_directory))
+ stage_name = args.stage_name
+ os.environ['stage_name'] = stage_name
+
+ execution_name = args.execution_name
+ os.environ['execution_name'] = execution_name
# dump config
cfg.dump(osp.join(cfg.work_directory,
f'cycle_train{args.cycle}_' + osp.basename(args.config)))
diff --git a/examples/al_object_detection/dvc.lock b/examples/al_object_detection/dvc.lock
index 41be89a6..e92f56c3 100644
--- a/examples/al_object_detection/dvc.lock
+++ b/examples/al_object_detection/dvc.lock
@@ -1,346 +1,50 @@
schema: '2.0'
stages:
- initial_selection:
- cmd: python initial_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --labeled_next work_dirs/test/X_L_0.npy --unselected work_dirs/test/X_U_0.npy
- deps:
- - path: configs/MIAOD-GRAY.py
- md5: e5731db6529ec8aef0f54e4ca3456ddd
- size: 2449
- outs:
- - path: work_dirs/test/X_L_0.npy
- md5: 1bdb5995455bac6d5bad54740d197876
- size: 97328
- - path: work_dirs/test/X_U_0.npy
- md5: e44f4ad4c5da07e70744ad8e6b1eb2b7
- size: 97328
- active_learning_training@0:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 0 --labeled work_dirs/test/X_L_0.npy --unselected work_dirs/test/X_U_0.npy
- --model work_dirs/test/cycle_0.pth
- deps:
- - path: work_dirs/test/X_L_0.npy
- md5: 1bdb5995455bac6d5bad54740d197876
- size: 97328
- - path: work_dirs/test/X_U_0.npy
- md5: e44f4ad4c5da07e70744ad8e6b1eb2b7
- size: 97328
- outs:
- - path: work_dirs/test/cycle_0.pth
- md5: a25e9e2f2642dfd5fc45b15a880ff140
- size: 164970901
- active_learning_selection@0:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 0 --model work_dirs/test/cycle_0.pth --labeled work_dirs/test/X_L_0.npy
- --labeled_next work_dirs/test/X_L_1.npy --unselected work_dirs/test/X_U_1.npy
- deps:
- - path: work_dirs/test/X_L_0.npy
- md5: 088666589d3625036b70773f975c4a95
- size: 4112
- - path: work_dirs/test/cycle_0.pth
- md5: 2433fe09dc611c64a1cb1fb62cc34344
- size: 164970901
- outs:
+ input_sample_lists:
+ cmd: python input_lists.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
+ --labeled /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/training.txt
+ --unlabeled /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/unlabeled.txt
+ --seed 123 --train /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/train.txt
+ --selected work_dirs/test/X_L_1.npy --unselected work_dirs/test/X_U_1.npy
+ deps:
+ - path: /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/training.txt
+ md5: 465b0575ca40f63bb9a6fdb32a3797da
+ size: 122740
+ - path: /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/unlabeled.txt
+ md5: d31e0407a305a2dcb22646ff95570101
+ size: 55985000
+ params:
+ al_cycle.json:
+ al_cycle: 1
+ al_seed: 123
+ configs/MIAOD-GRAY.py:
+ data_root: /mnt/beegfs/HDC/data/tomography_data/tiled_annotations
+ outs:
+ - path: /mnt/beegfs/HDC/data/tomography_data/tiled_annotations/train.txt
+ md5: de261a7327a8227586e1eac3a5a9dec4
+ size: 56107740
- path: work_dirs/test/X_L_1.npy
- md5: 3cb2464f4e42302a34971c590d62129b
- size: 6104
+ md5: 10a7bdd4723df0e040a26af145ac8db4
+ size: 98320
- path: work_dirs/test/X_U_1.npy
- md5: 2a484e1d7b79b0b0d199e19e74d758ea
- size: 6104
- active_learning_training@1:
+ md5: e5610b59611259f004316ea4732c9fcf
+ size: 98320
+ active_learning_training:
cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
--cycle 1 --labeled work_dirs/test/X_L_1.npy --unselected work_dirs/test/X_U_1.npy
- --model work_dirs/test/cycle_1.pth
+ --seed 123 --model work_dirs/test/cycle.pth
deps:
- path: work_dirs/test/X_L_1.npy
- md5: 3cb2464f4e42302a34971c590d62129b
- size: 6104
+ md5: 10a7bdd4723df0e040a26af145ac8db4
+ size: 98320
- path: work_dirs/test/X_U_1.npy
- md5: 2a484e1d7b79b0b0d199e19e74d758ea
- size: 6104
- outs:
- - path: work_dirs/test/cycle_1.pth
- md5: d23e6f1f0d0dda2fd0d2b4d03e282293
- size: 164970901
- active_learning_selection@1:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 1 --model work_dirs/test/cycle_1.pth --labeled work_dirs/test/X_L_1.npy
- --labeled_next work_dirs/test/X_L_2.npy --unselected work_dirs/test/X_U_2.npy
- deps:
- - path: work_dirs/test/X_L_1.npy
- md5: 3cb2464f4e42302a34971c590d62129b
- size: 6104
- - path: work_dirs/test/cycle_1.pth
- md5: d23e6f1f0d0dda2fd0d2b4d03e282293
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_2.npy
- md5: ac4d646bad5fc3f9451f27992a9b1348
- size: 8096
- - path: work_dirs/test/X_U_2.npy
- md5: 20b9825f2f3a54e536b5bfe9f338e989
- size: 8096
- active_learning_training@2:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 2 --labeled work_dirs/test/X_L_2.npy --unselected work_dirs/test/X_U_2.npy
- --model work_dirs/test/cycle_2.pth
- deps:
- - path: work_dirs/test/X_L_2.npy
- md5: ac4d646bad5fc3f9451f27992a9b1348
- size: 8096
- - path: work_dirs/test/X_U_2.npy
- md5: 20b9825f2f3a54e536b5bfe9f338e989
- size: 8096
- outs:
- - path: work_dirs/test/cycle_2.pth
- md5: 772f14bad2f49dae4b404e65fed22220
- size: 164970901
- active_learning_selection@2:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 2 --model work_dirs/test/cycle_2.pth --labeled work_dirs/test/X_L_2.npy
- --labeled_next work_dirs/test/X_L_3.npy --unselected work_dirs/test/X_U_3.npy
- deps:
- - path: work_dirs/test/X_L_2.npy
- md5: ac4d646bad5fc3f9451f27992a9b1348
- size: 8096
- - path: work_dirs/test/cycle_2.pth
- md5: 772f14bad2f49dae4b404e65fed22220
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_3.npy
- md5: b2371377136bccfb36094fe9c3814559
- size: 10088
- - path: work_dirs/test/X_U_3.npy
- md5: 5577af0806a86a4f82097ea7e5898e5d
- size: 10088
- active_learning_training@3:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 3 --labeled work_dirs/test/X_L_3.npy --unselected work_dirs/test/X_U_3.npy
- --model work_dirs/test/cycle_3.pth
- deps:
- - path: work_dirs/test/X_L_3.npy
- md5: b2371377136bccfb36094fe9c3814559
- size: 10088
- - path: work_dirs/test/X_U_3.npy
- md5: 5577af0806a86a4f82097ea7e5898e5d
- size: 10088
- outs:
- - path: work_dirs/test/cycle_3.pth
- md5: 94cda304046023f80348ed1329e571c8
- size: 164970901
- active_learning_selection@3:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 3 --model work_dirs/test/cycle_3.pth --labeled work_dirs/test/X_L_3.npy
- --labeled_next work_dirs/test/X_L_4.npy --unselected work_dirs/test/X_U_4.npy
- deps:
- - path: work_dirs/test/X_L_3.npy
- md5: b2371377136bccfb36094fe9c3814559
- size: 10088
- - path: work_dirs/test/cycle_3.pth
- md5: 94cda304046023f80348ed1329e571c8
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_4.npy
- md5: 0e915a08f913e79112d7304a828980bd
- size: 12080
- - path: work_dirs/test/X_U_4.npy
- md5: 60081608cee2956535d146c97b194cf6
- size: 12080
- active_learning_training@4:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 4 --labeled work_dirs/test/X_L_4.npy --unselected work_dirs/test/X_U_4.npy
- --model work_dirs/test/cycle_4.pth
- deps:
- - path: work_dirs/test/X_L_4.npy
- md5: 0e915a08f913e79112d7304a828980bd
- size: 12080
- - path: work_dirs/test/X_U_4.npy
- md5: 60081608cee2956535d146c97b194cf6
- size: 12080
- outs:
- - path: work_dirs/test/cycle_4.pth
- md5: 0ac0eb19c87b4e4b07818b94b6369fce
- size: 164970901
- active_learning_selection@4:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 4 --model work_dirs/test/cycle_4.pth --labeled work_dirs/test/X_L_4.npy
- --labeled_next work_dirs/test/X_L_5.npy --unselected work_dirs/test/X_U_5.npy
- deps:
- - path: work_dirs/test/X_L_4.npy
- md5: 0e915a08f913e79112d7304a828980bd
- size: 12080
- - path: work_dirs/test/cycle_4.pth
- md5: 0ac0eb19c87b4e4b07818b94b6369fce
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_5.npy
- md5: 5eadc04f2561404ba2028361015327bb
- size: 14072
- - path: work_dirs/test/X_U_5.npy
- md5: 30e0df67683dd525d8c81e47cd1470f2
- size: 14072
- active_learning_training@5:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 5 --labeled work_dirs/test/X_L_5.npy --unselected work_dirs/test/X_U_5.npy
- --model work_dirs/test/cycle_5.pth
- deps:
- - path: work_dirs/test/X_L_5.npy
- md5: 5eadc04f2561404ba2028361015327bb
- size: 14072
- - path: work_dirs/test/X_U_5.npy
- md5: 30e0df67683dd525d8c81e47cd1470f2
- size: 14072
- outs:
- - path: work_dirs/test/cycle_5.pth
- md5: bd8401ee5d06c3a46a599e445accd7b6
- size: 164970901
- active_learning_selection@5:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 5 --model work_dirs/test/cycle_5.pth --labeled work_dirs/test/X_L_5.npy
- --labeled_next work_dirs/test/X_L_6.npy --unselected work_dirs/test/X_U_6.npy
- deps:
- - path: work_dirs/test/X_L_5.npy
- md5: 5eadc04f2561404ba2028361015327bb
- size: 14072
- - path: work_dirs/test/cycle_5.pth
- md5: bd8401ee5d06c3a46a599e445accd7b6
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_6.npy
- md5: 0e7c4857df9766920e3f659621af5f67
- size: 16064
- - path: work_dirs/test/X_U_6.npy
- md5: d72b3abdc4c57c9cec2257d99742c24b
- size: 16064
- active_learning_training@6:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 6 --labeled work_dirs/test/X_L_6.npy --unselected work_dirs/test/X_U_6.npy
- --model work_dirs/test/cycle_6.pth
- deps:
- - path: work_dirs/test/X_L_6.npy
- md5: 0e7c4857df9766920e3f659621af5f67
- size: 16064
- - path: work_dirs/test/X_U_6.npy
- md5: d72b3abdc4c57c9cec2257d99742c24b
- size: 16064
- outs:
- - path: work_dirs/test/cycle_6.pth
- md5: 5f6c7ab04bca0158e8a03e4e18e956ec
- size: 164970901
- active_learning_selection@6:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 6 --model work_dirs/test/cycle_6.pth --labeled work_dirs/test/X_L_6.npy
- --labeled_next work_dirs/test/X_L_7.npy --unselected work_dirs/test/X_U_7.npy
- deps:
- - path: work_dirs/test/X_L_6.npy
- md5: 0e7c4857df9766920e3f659621af5f67
- size: 16064
- - path: work_dirs/test/cycle_6.pth
- md5: 5f6c7ab04bca0158e8a03e4e18e956ec
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_7.npy
- md5: 8915f632cc59ecf060a24352fffeea42
- size: 18056
- - path: work_dirs/test/X_U_7.npy
- md5: 18458838e244ab34e049265f7f0edcce
- size: 18056
- active_learning_training@7:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 7 --labeled work_dirs/test/X_L_7.npy --unselected work_dirs/test/X_U_7.npy
- --model work_dirs/test/cycle_7.pth
- deps:
- - path: work_dirs/test/X_L_7.npy
- md5: 8915f632cc59ecf060a24352fffeea42
- size: 18056
- - path: work_dirs/test/X_U_7.npy
- md5: 18458838e244ab34e049265f7f0edcce
- size: 18056
- outs:
- - path: work_dirs/test/cycle_7.pth
- md5: 7a7c7e5972250f71b7a751cde8efce72
- size: 164970901
- active_learning_selection@7:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 7 --model work_dirs/test/cycle_7.pth --labeled work_dirs/test/X_L_7.npy
- --labeled_next work_dirs/test/X_L_8.npy --unselected work_dirs/test/X_U_8.npy
- deps:
- - path: work_dirs/test/X_L_7.npy
- md5: 8915f632cc59ecf060a24352fffeea42
- size: 18056
- - path: work_dirs/test/cycle_7.pth
- md5: 7a7c7e5972250f71b7a751cde8efce72
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_8.npy
- md5: 9b20cfe86ff6ccda22a802044ca80e17
- size: 20048
- - path: work_dirs/test/X_U_8.npy
- md5: b70bddf07f0a225b314f4269eb4b4888
- size: 20048
- active_learning_training@8:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 8 --labeled work_dirs/test/X_L_8.npy --unselected work_dirs/test/X_U_8.npy
- --model work_dirs/test/cycle_8.pth
- deps:
- - path: work_dirs/test/X_L_8.npy
- md5: 9b20cfe86ff6ccda22a802044ca80e17
- size: 20048
- - path: work_dirs/test/X_U_8.npy
- md5: b70bddf07f0a225b314f4269eb4b4888
- size: 20048
- outs:
- - path: work_dirs/test/cycle_8.pth
- md5: c5e1476ba4b34c980583ec704b4813f1
- size: 164970901
- active_learning_selection@8:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 8 --model work_dirs/test/cycle_8.pth --labeled work_dirs/test/X_L_8.npy
- --labeled_next work_dirs/test/X_L_9.npy --unselected work_dirs/test/X_U_9.npy
- deps:
- - path: work_dirs/test/X_L_8.npy
- md5: 9b20cfe86ff6ccda22a802044ca80e17
- size: 20048
- - path: work_dirs/test/cycle_8.pth
- md5: c5e1476ba4b34c980583ec704b4813f1
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_9.npy
- md5: eec14a87bfb759f0a59b3c1747d28a38
- size: 22040
- - path: work_dirs/test/X_U_9.npy
- md5: c4beb0c042e48aab8a49caa06c18872d
- size: 22040
- active_learning_training@9:
- cmd: python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 9 --labeled work_dirs/test/X_L_9.npy --unselected work_dirs/test/X_U_9.npy
- --model work_dirs/test/cycle_9.pth
- deps:
- - path: work_dirs/test/X_L_9.npy
- md5: eec14a87bfb759f0a59b3c1747d28a38
- size: 22040
- - path: work_dirs/test/X_U_9.npy
- md5: c4beb0c042e48aab8a49caa06c18872d
- size: 22040
- outs:
- - path: work_dirs/test/cycle_9.pth
- md5: 457d66a14da3af27ff669dda207593db
- size: 164970901
- active_learning_selection@9:
- cmd: python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test
- --cycle 9 --model work_dirs/test/cycle_9.pth --labeled work_dirs/test/X_L_9.npy
- --labeled_next work_dirs/test/X_L_10.npy --unselected work_dirs/test/X_U_10.npy
- deps:
- - path: work_dirs/test/X_L_9.npy
- md5: eec14a87bfb759f0a59b3c1747d28a38
- size: 22040
- - path: work_dirs/test/cycle_9.pth
- md5: 457d66a14da3af27ff669dda207593db
- size: 164970901
- outs:
- - path: work_dirs/test/X_L_10.npy
- md5: 629870932a4c1d6158b54e358cd779dc
- size: 24032
- - path: work_dirs/test/X_U_10.npy
- md5: 7cf1ff9f74656b9413ab2d5fe210fbfd
- size: 24032
+ md5: e5610b59611259f004316ea4732c9fcf
+ size: 98320
+ params:
+ al_cycle.json:
+ al_cycle: 1
+ al_seed: 123
+ outs:
+ - path: work_dirs/test/cycle.pth
+ md5: 76f3213b889f71415fe0f85eeeae8737
+ size: 164713749
diff --git a/examples/al_object_detection/dvc.yaml b/examples/al_object_detection/dvc.yaml
index dfbe624c..3dbe479d 100644
--- a/examples/al_object_detection/dvc.yaml
+++ b/examples/al_object_detection/dvc.yaml
@@ -1,70 +1,84 @@
+vars:
+ - configs/MIAOD-GRAY.py:data_root
+ - al_cycle.json:al_cycle,next_cycle,al_seed
+ - uuid.json:uuid_var
+
stages:
- initial_selection:
- cmd:
- python initial_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --labeled_next work_dirs/test/X_L_0.npy --unselected work_dirs/test/X_U_0.npy
- deps:
- - configs/MIAOD-GRAY.py
- outs:
- - work_dirs/test/X_L_0.npy
- - work_dirs/test/X_U_0.npy
+ input_sample_lists:
+ cmd:
+ python input_lists.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --labeled ${data_root}/training.txt --unlabeled ${data_root}/unlabeled.txt --seed ${al_seed} --train ${data_root}/train.txt --selected work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy --stage_name input_sample_lists --execution_name ${uuid_var}
+ deps:
+ - ${data_root}/training.txt
+ - ${data_root}/unlabeled.txt
+ outs:
+ - ${data_root}/train.txt
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_U_${al_cycle}.npy
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - al_seed
active_learning_training:
- foreach:
- - cycle: 0
-# - cycle: 1
-# - cycle: 2
-# - cycle: 3
-# - cycle: 4
-# - cycle: 5
-# - cycle: 6
-# - cycle: 7
-# - cycle: 8
-# - cycle: 9
-
- do:
- cmd:
- #single gpu
- python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth #--model_prev work_dirs/test/latest.pth
- #distributed mode
- #python -m torch.distributed.launch --nproc_per_node=6 --master_port=39025 cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth --gpus 6 --launcher pytorch
-
- deps:
- - work_dirs/test/X_L_${item.cycle}.npy
- - work_dirs/test/X_U_${item.cycle}.npy
- outs:
- - work_dirs/test/cycle_${item.cycle}.pth
-
-# active_learning_selection:
-# foreach:
-# - cycle: 0
-# next_cycle: 1
-# - cycle: 1
-# next_cycle: 2
-# - cycle: 2
-# next_cycle: 3
-# - cycle: 3
-# next_cycle: 4
-# - cycle: 4
-# next_cycle: 5
-# - cycle: 5
-# next_cycle: 6
-# - cycle: 6
-# next_cycle: 7
-# - cycle: 7
-# next_cycle: 8
-# - cycle: 8
-# next_cycle: 9
-# - cycle: 9
-# next_cycle: 10
-# do:
-# cmd:
+ cmd:
+ #single gpu
+ python cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --labeled work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy --seed ${al_seed} --model work_dirs/test/cycle.pth --stage_name active_learning_training --execution_name ${uuid_var}
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=2 --master_port=39025 cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --labeled work_dirs/test/X_L_${al_cycle}.npy --unselected work_dirs/test/X_U_${al_cycle}.npy --seed ${al_seed} --model work_dirs/test/cycle.pth --gpus 2 --launcher pytorch --stage_name active_learning_training --execution_name ${uuid_var}
+ deps:
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_U_${al_cycle}.npy
+ - uuid.json
+ outs:
+ - work_dirs/test/cycle.pth
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - al_seed
- # python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --model work_dirs/test/cycle_${item.cycle}.pth --labeled work_dirs/test/X_L_${item.cycle}.npy --labeled_next work_dirs/test/X_L_${item.next_cycle}.npy --unselected work_dirs/test/X_U_${item.next_cycle}.npy
- #python -m torch.distributed.launch --nproc_per_node=4 --master_port=39016 cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --model work_dirs/test/cycle_${item.cycle}.pth --labeled work_dirs/test/X_L_${item.cycle}.npy --labeled_next work_dirs/test/X_L_${item.next_cycle}.npy --unselected work_dirs/test/X_U_${item.next_cycle}.npy --gpus 4 --launcher pytorch
- # deps:
- # - work_dirs/test/cycle_${item.cycle}.pth
- # - work_dirs/test/X_L_${item.cycle}.npy
- # outs:
- # - work_dirs/test/X_L_${item.next_cycle}.npy
- # - work_dirs/test/X_U_${item.next_cycle}.npy
+ active_learning_selection:
+ cmd:
+ #single gpu
+ python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --model work_dirs/test/cycle.pth --labeled work_dirs/test/X_L_${al_cycle}.npy --seed ${al_seed} --labeled_next work_dirs/test/X_L_${next_cycle}.npy --unselected work_dirs/test/X_U_${next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${next_cycle}.txt --stage_name active_learning_selection --execution_name ${uuid_var}
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=2 --master_port=39016 cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${al_cycle} --model work_dirs/test/cycle.pth --labeled work_dirs/test/X_L_${al_cycle}.npy --seed ${al_seed} --labeled_next work_dirs/test/X_L_${next_cycle}.npy --unselected work_dirs/test/X_U_${next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${next_cycle}.txt --gpus 2 --launcher pytorch --stage_name active_learning_selection --execution_name ${uuid_var}
+ deps:
+ - work_dirs/test/cycle.pth
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - uuid.json
+ outs:
+ - work_dirs/test/X_L_${next_cycle}.npy
+ - work_dirs/test/X_U_${next_cycle}.npy
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - next_cycle
+ - al_seed
+
+ output_sample_lists:
+ cmd:
+ python output_lists.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --selected work_dirs/test/X_L_${al_cycle}.npy --selected_next work_dirs/test/X_L_${next_cycle}.npy --train ${data_root}/train.txt --map ${data_root}/map.txt --seed ${al_seed} --label_next ${data_root}/label_next.txt --labeled ${data_root}/training_next.txt --unlabeled ${data_root}/unlabeled_next.txt --cycle_config al_cycle.json --cycle_config_next al_cycle_next.json --stage_name output_sample_lists --execution_name ${uuid_var}
+ deps:
+ - work_dirs/test/X_L_${al_cycle}.npy
+ - work_dirs/test/X_L_${next_cycle}.npy
+ - ${data_root}/train.txt
+ - ${data_root}/map.txt
+ outs:
+ - ${data_root}/label_next.txt
+ - ${data_root}/training_next.txt
+ - ${data_root}/unlabeled_next.txt
+ - al_cycle_next.json
+ params:
+ - configs/MIAOD-GRAY.py:
+ - data_root
+ - al_cycle.json:
+ - al_cycle
+ - next_cycle
+ - al_seed
diff --git a/examples/al_object_detection/dvc_full_train.yaml b/examples/al_object_detection/dvc_full_train.yaml
new file mode 100644
index 00000000..cc454e9e
--- /dev/null
+++ b/examples/al_object_detection/dvc_full_train.yaml
@@ -0,0 +1,27 @@
+stages:
+ initial_selection:
+ cmd:
+ python initial_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --labeled_next work_dirs/test/X_L_0.npy --unselected work_dirs/test/X_U_0.npy
+ deps:
+ - configs/MIAOD-GRAY.py
+ outs:
+ - work_dirs/test/X_L_0.npy
+ - work_dirs/test/X_U_0.npy
+
+ active_learning_training:
+ foreach:
+ - cycle: 0
+
+ do:
+ cmd:
+ #single gpu
+ python train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=6 --master_port=39025 cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth --gpus 6 --launcher pytorch
+
+ deps:
+ - work_dirs/test/X_L_${item.cycle}.npy
+ - work_dirs/test/X_U_${item.cycle}.npy
+ outs:
+ - work_dirs/test/cycle_${item.cycle}.pth
+
diff --git a/examples/al_object_detection/dvc_original.yaml b/examples/al_object_detection/dvc_original.yaml
new file mode 100644
index 00000000..f647f0a6
--- /dev/null
+++ b/examples/al_object_detection/dvc_original.yaml
@@ -0,0 +1,69 @@
+stages:
+ initial_selection:
+ cmd:
+ python initial_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --labeled_next work_dirs/test/X_L_0.npy --unselected work_dirs/test/X_U_0.npy
+ deps:
+ - configs/MIAOD-GRAY.py
+ outs:
+ - work_dirs/test/X_L_0.npy
+ - work_dirs/test/X_U_0.npy
+
+ active_learning_training:
+ foreach:
+ - cycle: 0
+# - cycle: 1
+# - cycle: 2
+# - cycle: 3
+# - cycle: 4
+# - cycle: 5
+# - cycle: 6
+# - cycle: 7
+# - cycle: 8
+# - cycle: 9
+
+ do:
+ cmd:
+ #single gpu
+ python train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth
+ #distributed mode
+ #python -m torch.distributed.launch --nproc_per_node=6 --master_port=39025 cycle_train.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --labeled work_dirs/test/X_L_${item.cycle}.npy --unselected work_dirs/test/X_U_${item.cycle}.npy --model work_dirs/test/cycle_${item.cycle}.pth --gpus 6 --launcher pytorch
+
+ deps:
+ - work_dirs/test/X_L_${item.cycle}.npy
+ - work_dirs/test/X_U_${item.cycle}.npy
+ outs:
+ - work_dirs/test/cycle_${item.cycle}.pth
+
+ active_learning_selection:
+ foreach:
+ - cycle: 0
+ next_cycle: 1
+# - cycle: 1
+# next_cycle: 2
+# - cycle: 2
+# next_cycle: 3
+# - cycle: 3
+# next_cycle: 4
+# - cycle: 4
+# next_cycle: 5
+# - cycle: 5
+# next_cycle: 6
+# - cycle: 6
+# next_cycle: 7
+# - cycle: 7
+# next_cycle: 8
+# - cycle: 8
+# next_cycle: 9
+# - cycle: 9
+# next_cycle: 10
+ do:
+ cmd:
+
+ python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --model work_dirs/test/cycle_${item.cycle}.pth --labeled work_dirs/test/X_L_${item.cycle}.npy --labeled_next work_dirs/test/X_L_${item.next_cycle}.npy --unselected work_dirs/test/X_U_${item.next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${item.next_cycle}.txt
+ #python -m torch.distributed.launch --nproc_per_node=4 --master_port=39016 cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle ${item.cycle} --model work_dirs/test/cycle_${item.cycle}.pth --labeled work_dirs/test/X_L_${item.cycle}.npy --labeled_next work_dirs/test/X_L_${item.next_cycle}.npy --unselected work_dirs/test/X_U_${item.next_cycle}.npy --bbox_output work_dirs/test/labeling_hints_${item.next_cycle}.txt --gpus 4 --launcher pytorch
+ deps:
+ - work_dirs/test/cycle_${item.cycle}.pth
+ - work_dirs/test/X_L_${item.cycle}.npy
+ outs:
+ - work_dirs/test/X_L_${item.next_cycle}.npy
+ - work_dirs/test/X_U_${item.next_cycle}.npy
diff --git a/examples/al_object_detection/epoch_based_runner.py b/examples/al_object_detection/epoch_based_runner.py
index eb1db00d..f147937b 100644
--- a/examples/al_object_detection/epoch_based_runner.py
+++ b/examples/al_object_detection/epoch_based_runner.py
@@ -27,6 +27,7 @@ def train(self, data_loader, **kwargs):
X_L.update({'x': X_L.pop('img')})
X_L.update({'y_loc_img': X_L.pop('gt_bboxes')})
X_L.update({'y_cls_img': X_L.pop('gt_labels')})
+ self._add_dataset_flag(X_L, is_unlabeled=False)
self._inner_iter = i
self.call_hook('before_train_iter')
if self.batch_processor is None:
@@ -50,10 +51,17 @@ def train(self, data_loader, **kwargs):
self.call_hook('before_train_epoch')
time.sleep(2) # Prevent possible deadlock during epoch transition
unlabeled_data_iter = iter(data_loader[1])
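+ # count the unlabelled batches once, then re-create the iterator; the labeled loop below stops after the same number of batches so the two data loaders stay in step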
+ sum_unlabelled = sum(1 for _ in unlabeled_data_iter)
+ self.logger.info('number of unlabelled batches: %d', sum_unlabelled)
+ unlabeled_data_iter = iter(data_loader[1])
for i, X_L in enumerate(data_loader[0]):
+ if i == sum_unlabelled:
+ break
+
X_L.update({'x': X_L.pop('img')})
X_L.update({'y_loc_img': X_L.pop('gt_bboxes')})
X_L.update({'y_cls_img': X_L.pop('gt_labels')})
+ self._add_dataset_flag(X_L, is_unlabeled=False)
self._inner_iter = i
self.call_hook('before_train_iter')
if self.batch_processor is None:
@@ -71,6 +79,7 @@ def train(self, data_loader, **kwargs):
X_U.update({'x': X_U.pop('img')})
X_U.update({'y_loc_img': X_U.pop('gt_bboxes')})
X_U.update({'y_cls_img': X_U.pop('gt_labels')})
+ self._add_dataset_flag(X_U, is_unlabeled=True)
X_U = self.clear_gt_label(X_U)
self._inner_iter = i
self.call_hook('before_train_iter')
@@ -88,6 +97,11 @@ def train(self, data_loader, **kwargs):
self._iter += 1
self.call_hook('after_train_epoch')
self._epoch += 1
+
+ def _add_dataset_flag(self, X, is_unlabeled):
+ BatchSize = len(X['img_metas'].data[0])
+ for i in range(BatchSize):
+ X['img_metas'].data[0][i].update({'is_unlabeled': is_unlabeled})
def clear_gt_label(self, X_U):
BatchSize = len(X_U['y_cls_img'].data[0])
diff --git a/examples/al_object_detection/generate_uuid.py b/examples/al_object_detection/generate_uuid.py
new file mode 100644
index 00000000..618a0f5f
--- /dev/null
+++ b/examples/al_object_detection/generate_uuid.py
@@ -0,0 +1,6 @@
+import uuid
+import json
+my_uuid = str(uuid.uuid4())
+dict_ = {'uuid_var': my_uuid}
+with open('uuid.json','w') as f:
+ json.dump(dict_, f)
\ No newline at end of file
diff --git a/examples/al_object_detection/input_lists.py b/examples/al_object_detection/input_lists.py
new file mode 100644
index 00000000..d62497d9
--- /dev/null
+++ b/examples/al_object_detection/input_lists.py
@@ -0,0 +1,120 @@
+import argparse
+import os
+import os.path as osp
+import uuid
+import json
+
+from mmcv import Config
+
+import mmcv
+import numpy as np
+from mmdet.apis import set_random_seed
+from mmdet.utils import get_root_logger
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Input lists of labeled and unlabeled images')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--labeled',
+ help='file with list of labeled training samples')
+ parser.add_argument('--unlabeled',
+ help='file with list of unlabeled samples to consider for training')
+ parser.add_argument('--train',
+ help='file with list of all training samples')
+ parser.add_argument('--selected',
+ help='numpy file with indexes of selected samples from training list')
+ parser.add_argument('--unselected',
+ help='numpy file with indexes of unselected samples from training list')
+ parser.add_argument('--work_directory',
+ help='the dir to save logs and model checkpoints')
+ parser.add_argument('--seed', type=int, default=666, help='random seed')
+ parser.add_argument('--deterministic', action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+
+ parser.add_argument('--stage_name', help='Name for current execution')
+
+ parser.add_argument('--execution_name', help='Name for current execution')
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+
+
+ assert (args.labeled and args.unlabeled and
+ args.train and args.selected and args.unselected), \
+ ('Please specify file names of labeled and unlabeled '
+ 'image list text files with arguments "--labeled" and "--unlabeled". '
+ 'Also specify file name of the list of all samples created by this '
+ 'code with argument "--train" and file names for indexes of labeled '
+ 'and unselected samples in this file with arguments "--selected" and '
+ '"--unselected".')
+
+ cfg = Config.fromfile(args.config)
+
+ # work_directory is determined in this priority: CLI > config > default
+ if args.work_directory is not None:
+ # update work_directory from CLI args if args.work_directory is not None
+ cfg.work_directory = args.work_directory
+ elif cfg.get('work_directory', None) is None:
+ # derive work_directory from config name if cfg.work_directory is None
+ cfg.work_directory = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+
+ stage_name = args.stage_name
+ os.environ['stage_name'] = stage_name
+
+ execution_name = args.execution_name
+ os.environ['execution_name'] = execution_name
+
+ # create work_directory
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_directory))
+
+ # init the logger before other steps
+ log_file = osp.join(cfg.work_directory, 'input_sample_lists.log')
+ logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+ # set random seed
+ if args.seed is not None:
+ logger.info(
+ f'Set random seed to {args.seed}, deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+
+ # create and save sample lists
+ with open(args.labeled) as f:
+ line = f.readline().strip()
+ num_digits = len(line)
+ #labeled = np.loadtxt(args.labeled, dtype=np.uintc)
+ #unlabeled = np.loadtxt(args.unlabeled, dtype=np.uintc)
+ labeled = np.loadtxt(args.labeled, dtype=str)
+ unlabeled = np.loadtxt(args.unlabeled, dtype=str)
+ num_labeled = len(labeled)
+ all = np.concatenate((labeled, unlabeled))
+ all_sorted_indexes = np.argsort(all)
+ all = all[all_sorted_indexes]
+ labeled_indexes = np.nonzero(all_sorted_indexes < num_labeled)[0]
+ unlabeled_indexes = np.nonzero(all_sorted_indexes >= num_labeled)[0]
+ num_unlabeled = len(unlabeled_indexes)
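+ # keep the labeled and unlabeled index pools the same size: subsample the unlabeled indexes if there are more of them, otherwise pad them with shuffled labeled indexes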
+ if num_unlabeled >= num_labeled :
+ np.random.shuffle(unlabeled_indexes)
+ unlabeled_indexes = unlabeled_indexes[:num_labeled]
+ else :
+ np.random.shuffle(labeled_indexes)
+ unlabeled_indexes = np.concatenate(
+ (unlabeled_indexes, labeled_indexes[:num_labeled - num_unlabeled]))
+ labeled_indexes.sort()
+ unlabeled_indexes.sort()
+ #np.savetxt(args.train, all, fmt='%0'+str(num_digits)+'u')
+ np.savetxt(args.train, all, fmt='%s')
+ np.save(args.selected, labeled_indexes)
+ np.save(args.unselected, unlabeled_indexes)
+
+ my_uuid = str(uuid.uuid4())
+ dict_ = {'uuid_var': my_uuid}
+ with open('uuid.json','w') as f:
+ json.dump(dict_, f)
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/al_object_detection/mmdet.egg-info/PKG-INFO b/examples/al_object_detection/mmdet.egg-info/PKG-INFO
index 7ec5a6b6..d07a9f7b 100644
--- a/examples/al_object_detection/mmdet.egg-info/PKG-INFO
+++ b/examples/al_object_detection/mmdet.egg-info/PKG-INFO
@@ -21,6 +21,312 @@ Provides-Extra: build
Provides-Extra: optional
License-File: LICENSE
+# MI-AOD integration with CMF
+
+
+
+- [Overview](#overview)
+- [Running on Pascal VOC example](#running-on-pascal-voc-example)
+- [Installation](#installation)
+
+
+
+
+## Overview
+
+We divide the MI-AOD pipeline into three stages:
+- random selection of samples for initial training cycle: [initial_select.py](<./initial_select.py>)
+- model training in single active learning cycle: [cycle_train.py](<./cycle_train.py>)
+- selection of most informative samples for the next cycle: [cycle_select.py](<./cycle_select.py>)
+
+These stages are linked through their dependencies in the [dvc.yaml](<./dvc.yaml>) file.
+
+## Running on Pascal VOC example
+
+### Configure the pipeline
+
+Optionally, start the Neo4J server if it is not already running, to help visualize
+the lineage recorded by CMF. The steps below assume that the Neo4J local directory is
+"~/.neo4j", but a different directory can be used depending on how Neo4J
+is configured:
+
+```bash
+export NEO4J_HOME=~/.neo4j
+export NEO4J_CONF=~/.neo4j/neo4j.conf
+neo4j start
+```
+
+After installing the code (see [Installation](#installation)), copy the code
+from "<cmf_root>/examples/al_object_detection/" to a run directory ("<run_dir>" below).
+This is to avoid conflicts between the git repos for the master source code and for
+artifacts, metadata, and run code.
+
+```bash
+mkdir <run_dir>
+cp -pr <cmf_root>/examples/al_object_detection/* <run_dir>
+```
+
+Configure the pipeline. Everything is pre-configured for this example, except
+for the location of the input data. Modify the "data_root" parameter in the
+[configs/MIAOD.py](<./configs/MIAOD.py>) and [configs/_base_/voc0712.py](<./configs/_base_/voc0712.py>) configuration files
+to point to the location of the Pascal VOC2007 and VOC2012 datasets
+(the directory containing "VOCdevkit/" - see [Installation](#installation)
+for installing these datasets).
+
+Activate the Conda virtual environment containing the CMF, PyTorch, MMCV, and MI-AOD
+dependencies installed as per [Installation](#installation).
+
+```bash
+conda activate <env_name>
+```
+
+Initialize CMF. First edit the "<run_dir>/sample_env" file to customize
+your git, DVC and Neo4J configuration. Note that the Neo4J URI is typically
+"bolt://IP_ADDRESS:7687" or "bolt://localhost:7687". Neo4J is not needed by CMF
+to capture the lineage; it is only needed to visualize the lineage.
+Then, source this file and run "./initialize.sh" to initialize the
+DVC and git repos. This will print "NEXT STEPS". Ignore the recommendation
+to run "test_script.sh".
+
+```bash
+cd <run_dir>
+# first, edit the sample_env file to customize the git, DVC and Neo4J environments
+source sample_env
+./initialize.sh
+```
+
+### Run the pipeline
+
+If not already active (for example, when not running in the same terminal
+window that was used to [Configure](#configure-the-pipeline) the pipeline), activate the Conda virtual environment
+containing the CMF, PyTorch, MMCV, and MI-AOD dependencies installed as per
+[Installation](#installation):
+
+```bash
+conda activate <env_name>
+```
+
+If not already set (for example, when not running in the same terminal
+window that was used to [Configure](#configure-the-pipeline) the pipeline), source the "<run_dir>/sample_env"
+file to customize your Neo4J configuration:
+
+```bash
+cd <run_dir>
+source sample_env
+```
+
+Visualize the pipeline execution dependencies:
+
+```bash
+cd <run_dir>
+dvc dag
+```
+
+Use dvc to run the pipeline:
+
+```bash
+dvc repro -f
+```
+
+Optional - To capture the lineage in Neo4J, change this line (line number 126) in dvc_cmf_ingest.py and set graph=True:
+```
+metawriter = cmf.Cmf(filename="mlmd", pipeline_name=pipeline_name, graph=True)
+```
+After running the pipeline, build the CMF lineage. This is done separately because
+stage inputs and outputs are locked by DVC during the dvc run:
+
+```bash
+python dvc_cmf_ingest.py
+```
+
+
+Visualize the CMF lineage in Neo4J. TODO: Note that mAP or other metrics recorded by CMF
+during pipeline execution are shown as outputs from the [cycle_select.py](<./cycle_select.py>) stage:
+
+```
+Point your browser to the Neo4J server, e.g.:
+<server_address>:7474/browser/ or
+localhost:7474/browser/
+
+Visualize CMF lineage with Neo4J command:
+MATCH (a:Execution)-[r]-(b) WHERE (b:Dataset or b:Model or b:Metrics) RETURN a,r,b
+```
+
+To delete old lineages from the Neo4J database:
+
+```
+MATCH(n) DETACH DELETE n
+```
+
+## Installation
+
+These installation instructions supersede installation instructions for
+the original MI-AOD package.
+
+Create and activate new Conda virtual environment:
+
+```bash
+conda create -n <env_name> python=3.8
+conda activate <env_name>
+```
+
+Install CMF.
+
+As of today, clone the CMF library from the al_objectdetection branch for this example.
+In the future, the code will be merged into the master branch:
+
+```bash
+cd <parent_dir>   # the directory under which CMF will be cloned
+git clone -b al_objectdetection https://github.com/HewlettPackard/cmf.git
+cd cmf   # this is the <cmf_root> directory referred to elsewhere in this document
+python setup.py bdist_wheel
+cd dist
+pip install cmflib-0.0.1-py3-none-any.whl
+```
+
+Optionally, install Neo4J to view pipeline lineages recorded by CMF. Note
+that this is not needed by CMF to capture the lineage, it is only needed
+to visualize the lineage.
+
+Neo4J installation instructions are described at:
+https://neo4j.com/docs/operations-manual/current/installation/linux/ .
+Before proceeding with the install, verify that Java OpenJDK is installed as per those
+instructions. For Ubuntu, the Java prerequisites are explained here:
+https://neo4j.com/docs/operations-manual/current/installation/linux/debian/ .
+We used these installation steps:
+
+```bash
+wget -O - https://debian.neo4j.com/neotechnology.gpg.key | sudo apt-key add -
+echo 'deb https://debian.neo4j.com stable latest' | sudo tee /etc/apt/sources.list.d/neo4j.list
+sudo apt-get update
+sudo add-apt-repository universe
+sudo apt-get install neo4j
+```
+
+Configure Neo4J (if installed) as follows, then edit ~/.neo4j/neo4j.conf
+to point to the local directories (data, run, logs, etc.) created below:
+
+```bash
+mkdir ~/.neo4j
+cp -p /etc/neo4j/neo4j.conf ~/.neo4j
+mkdir ~/.neo4j/data
+mkdir ~/.neo4j/run
+mkdir ~/.neo4j/logs
+mkdir ~/.neo4j/metrics
+mkdir ~/.neo4j/data/transactions
+mkdir ~/.neo4j/data/dumps
+# Edit ~/.neo4j/neo4j.conf to point to the above directories
+```
+
+Install PyTorch and MMCV libraries
+
+It is important to choose matching CUDA, PyTorch and MMCV versions. First, source
+the NVIDIA environment and identify the CUDA version by running "nvcc --version".
+Then, identify the corresponding PyTorch version (along with the TorchVision and
+TorchAudio versions) at https://pytorch.org/get-started/previous-versions/ .
+Note that multiple PyTorch versions listed there will support your CUDA version.
+From the supported list, choose a CUDA and PyTorch combination that is supported
+by the MMCV libraries as per
+https://mmcv.readthedocs.io/en/latest/get_started/installation.html .
+On this site, scroll down to the "Install with pip" section and enter the CUDA
+and PyTorch combination in the pull-down boxes. The MMCV pull-down box will then
+show the MMCV versions that support this CUDA and PyTorch combination. Choose
+one of these versions, which will then show the "pip install" command for this MMCV
+version. We tested the flow with MMCV 1.5.0 (note that this is newer than the
+MMCV 1.0.5 mentioned in the original MI-AOD documentation, but we found that the
+MMDetection and MI-AOD code works with this MMCV library version under
+Python 3.8 and various CUDA and PyTorch versions, e.g., CUDA 11.3 and
+PyTorch 1.11.0). Here are the commands to install the PyTorch and MMCV libraries:
+
+```bash
+conda install pytorch==<version> torchvision==<version> torchaudio==<version> cudatoolkit=<version> -c pytorch
+pip install mmcv-full==<version> -f https://download.openmmlab.com/mmcv/dist/cu<cuda_version>/torch<torch_version>/index.html
+
+# for example:
+# conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
+# pip install mmcv-full==1.5.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html
+# where "https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html"
+# is from "https://mmcv.readthedocs.io/en/latest/get_started/installation.html"
+# Web site as per the description above
+```
+
+Install the MI-AOD code
+
+Note that <conda_dir> is the conda local directory and <env_name>
+is the virtual environment name:
+
+```bash
+cd <cmf_root>
+cd examples/al_object_detection
+pip install -r requirements/build.txt
+pip install -v -e . # or "python setup.py develop"
+pip install "git+https://github.com/open-mmlab/cocoapi.git#subdirectory=pycocotools"
+pip install "git+https://github.com/open-mmlab/cocoapi.git#subdirectory=lvis"
+cp -v epoch_based_runner.py <conda_dir>/envs/<env_name>/lib/python3.8/site-packages/mmcv/runner/
+```
+
+Install the Pascal VOC2007 and VOC2012 datasets
+
+```bash
+cd <datasets_dir>   # the directory that will contain VOCdevkit/
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+tar -xf VOCtrainval_06-Nov-2007.tar
+tar -xf VOCtest_06-Nov-2007.tar
+tar -xf VOCtrainval_11-May-2012.tar
+```
+
+Notice that the dataset annotation XML files are in the Annotations subdirectory, while
+the corresponding input images are in the JPEGImages subdirectory.
+
+## Running on a Custom Dataset
+
+Please update the following files:
+
+1. Prepare the dataset.
+ The dataset folder should have the following structure:
+ ```
+ BaseFolder
+ |
+ |___ Dataset
+ | |
+ | |---Annotations
+ | | |---xml file
+ | |---ImageSets
+ | | |
+ | | |---Main
+ | | | |---trainval.txt
+ | | | |---test.txt
+ | | |
+ | |---JPEGImages
+ | | | |---Image files in jpg format
+ ```
+ 2. Prepare the dataset file.
+ Create a file inside the folder mmdet/datasets.
+ You can take voc.py in the same folder as a template and make the necessary modifications.
+ The newly added file hdc.py was created for the custom dataset.
+ Please modify the class names in this file appropriately (see the sketch after this list).
+
+ 3. Prepare a config file for the new dataset in the folder configs/_base_.
+ hdc.py was created for the custom dataset.
+ Please modify it appropriately.
+
+ 4. Config changes (see the sketch after this list).
+ 1. Dataset path
+ - Modify *configs/MIAOD-GRAY.py* to reflect the dataset path
+ (sections: data_root, data).
+ - Modify *configs/_base_/hdc.py* to reflect the dataset path
+ (sections: data_root, data).
+
+ 2. Number of classes
+ Modify the model section in *configs/MIAOD-GRAY.py*:
+ C=2 for a two-class dataset.
+
+ 3. Number of images to train
+ Modify the X_S_size and X_L_0_size settings in *configs/MIAOD-GRAY.py*.
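+
+Below is a minimal sketch of the two edits above (the custom dataset class and the config
+values). The dataset class mirrors the hdc.py file added in this change; the import lines and
+the config field layout are assumptions based on the surrounding mmdet code, the comments are
+our reading of the MI-AOD settings, and the numeric values and the data_root path are
+placeholders rather than recommended values - check the original config files for the exact layout.
+
+```python
+# mmdet/datasets/hdc.py -- custom dataset class, modeled on the existing voc.py / hdc.py
+from .builder import DATASETS
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class HDCDataset(XMLDataset):
+    # list the class names of your custom dataset here
+    CLASSES = ['Defect']
+
+
+# configs/MIAOD-GRAY.py -- settings to adjust for the custom dataset (illustrative values)
+data_root = '/path/to/BaseFolder/Dataset/'  # dataset location; also update the data section
+C = 2             # number of classes, e.g. 2 for a two-class dataset
+X_S_size = 100    # number of images to select/train per cycle (illustrative)
+X_L_0_size = 100  # initial labeled set size (illustrative)
+```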
+
+
# MI-AOD
Language: [简体中文](README_cn.md) | English
diff --git a/examples/al_object_detection/mmdet.egg-info/SOURCES.txt b/examples/al_object_detection/mmdet.egg-info/SOURCES.txt
index fdc59982..d715ea51 100644
--- a/examples/al_object_detection/mmdet.egg-info/SOURCES.txt
+++ b/examples/al_object_detection/mmdet.egg-info/SOURCES.txt
@@ -78,6 +78,8 @@ mmdet/datasets/coco.py
mmdet/datasets/custom.py
mmdet/datasets/dataset_wrappers.py
mmdet/datasets/deepfashion.py
+mmdet/datasets/gray.py
+mmdet/datasets/hdc.py
mmdet/datasets/lvis.py
mmdet/datasets/voc.py
mmdet/datasets/wider_face.py
@@ -213,4 +215,13 @@ mmdet/utils/collect_env.py
mmdet/utils/contextmanagers.py
mmdet/utils/logger.py
mmdet/utils/profiling.py
-mmdet/utils/util_mixins.py
\ No newline at end of file
+mmdet/utils/util_mixins.py
+mmdet/utils/hdc/__init__.py
+mmdet/utils/hdc/active_datasets.py
+tests/test_anchor.py
+tests/test_assigner.py
+tests/test_async.py
+tests/test_config.py
+tests/test_fp16.py
+tests/test_masks.py
+tests/test_version.py
\ No newline at end of file
diff --git a/examples/al_object_detection/mmdet/apis/test.py b/examples/al_object_detection/mmdet/apis/test.py
index b710c5de..a9307e5f 100644
--- a/examples/al_object_detection/mmdet/apis/test.py
+++ b/examples/al_object_detection/mmdet/apis/test.py
@@ -10,19 +10,30 @@
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results, tensor2imgs
from mmdet.models.detectors.base import *
+from mmcv.ops.nms import batched_nms
-def calculate_uncertainty(cfg, model, data_loader, return_box=False):
+def calculate_uncertainty(cfg, model, data_loader, plot_nuboxes=0, return_box=False):
model.eval()
model.cuda()
dataset = data_loader.dataset
print('>>> Computing Instance Uncertainty...')
uncertainty = torch.zeros(len(dataset)).cuda(torch.cuda.current_device())
+ if plot_nuboxes > 0:
+ udets = torch.zeros(len(dataset), plot_nuboxes, 5).cuda(torch.cuda.current_device())
+ if cfg.get('check_nuboxes') and (cfg.check_nuboxes > 0):
+ check_nuboxes = cfg.check_nuboxes
+ else:
+ check_nuboxes = cfg.k
+ zero_offsets = torch.zeros(check_nuboxes).cuda(torch.cuda.current_device())
for i, data in enumerate(data_loader):
with torch.no_grad():
data['img'][0] = data['img'][0].cuda()
data.update({'x': data.pop('img')})
- y_head_f_1, y_head_f_2, y_head_cls = model(return_loss=False, rescale=True, return_box=return_box, **data)
+ if plot_nuboxes > 0:
+ y_head_f_1, y_head_f_2, y_f_r, y_head_cls = model(return_loss=False, rescale=True, return_box=return_box, uncertain_box=True, **data)
+ else:
+ y_head_f_1, y_head_f_2, y_head_cls = model(return_loss=False, rescale=True, return_box=return_box, uncertain_box=False, **data)
y_head_f_1 = torch.cat(y_head_f_1, 0)
y_head_f_2 = torch.cat(y_head_f_2, 0)
y_head_f_1 = nn.Sigmoid()(y_head_f_1)
@@ -30,11 +41,23 @@ def calculate_uncertainty(cfg, model, data_loader, return_box=False):
loss_l2_p = (y_head_f_1 - y_head_f_2).pow(2)
uncertainty_all_N = loss_l2_p.mean(dim=1)
arg = uncertainty_all_N.argsort()
- uncertainty_single = uncertainty_all_N[arg[-cfg.k:]].mean()
- uncertainty[i] = uncertainty_single
+ if plot_nuboxes > 0:
+ img_shape = data['img_metas'][0].data[0][0]['img_shape']
+ scale_factor = data['img_metas'][0].data[0][0]['scale_factor']
+ ubboxes_single = model.bbox_head.get_uncertain_bboxes(arg,
+ y_f_r, img_shape, scale_factor, check_nuboxes, cfg,
+ rescale=True)
+ uncertainty_single = uncertainty_all_N[arg[-check_nuboxes:]]
+ udets_single, _ = batched_nms(ubboxes_single,
+ uncertainty_single, zero_offsets, cfg.test_cfg.nms)
+ udets[i] = udets_single[:plot_nuboxes]
+ uncertainty[i] = uncertainty_all_N[arg[-cfg.k:]].mean()
if i % 1000 == 0:
print('>>> ', i, '/', len(dataset))
- return uncertainty.cpu()
+ if plot_nuboxes > 0:
+ return uncertainty.cpu(), udets.cpu()
+ else:
+ return uncertainty.cpu()
def single_gpu_test(model, data_loader, show=False):
diff --git a/examples/al_object_detection/mmdet/core/evaluation/bbox_overlaps.py b/examples/al_object_detection/mmdet/core/evaluation/bbox_overlaps.py
index 93559ea0..cbacc61c 100644
--- a/examples/al_object_detection/mmdet/core/evaluation/bbox_overlaps.py
+++ b/examples/al_object_detection/mmdet/core/evaluation/bbox_overlaps.py
@@ -1,20 +1,20 @@
import numpy as np
-def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
+def bbox_overlaps(bboxes1, bboxes2, mode='iog', eps=1e-6):
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
Args:
bboxes1(ndarray): shape (n, 4)
bboxes2(ndarray): shape (k, 4)
mode(str): iou (intersection over union) or iof (intersection
- over foreground)
+ over foreground) or iog (intersection over ground truth)
Returns:
ious(ndarray): shape (n, k)
"""
- assert mode in ['iou', 'iof']
+ assert mode in ['iou', 'iof', 'iog']
bboxes1 = bboxes1.astype(np.float32)
bboxes2 = bboxes2.astype(np.float32)
@@ -39,8 +39,10 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
y_end - y_start, 0)
if mode == 'iou':
union = area1[i] + area2 - overlap
- else:
+ elif mode == 'iof':
union = area1[i] if not exchange else area2
+ else:
+ union = area1[i] if exchange else area2
union = np.maximum(union, eps)
ious[i, :] = overlap / union
if exchange:
diff --git a/examples/al_object_detection/mmdet/core/evaluation/mean_ap.py b/examples/al_object_detection/mmdet/core/evaluation/mean_ap.py
index 9611c814..d45870f5 100644
--- a/examples/al_object_detection/mmdet/core/evaluation/mean_ap.py
+++ b/examples/al_object_detection/mmdet/core/evaluation/mean_ap.py
@@ -80,8 +80,8 @@ def tpfp_imagenet(det_bboxes,
"""
# an indicator of ignored gts
gt_ignore_inds = np.concatenate(
- (np.zeros(gt_bboxes.shape[0], dtype=np.bool),
- np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool)))
+ (np.zeros(gt_bboxes.shape[0], dtype=bool),
+ np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
# stack gt_bboxes and gt_bboxes_ignore for convenience
gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
@@ -173,8 +173,8 @@ def tpfp_default(det_bboxes,
"""
# an indicator of ignored gts
gt_ignore_inds = np.concatenate(
- (np.zeros(gt_bboxes.shape[0], dtype=np.bool),
- np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool)))
+ (np.zeros(gt_bboxes.shape[0], dtype=bool),
+ np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
# stack gt_bboxes and gt_bboxes_ignore for convenience
gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
@@ -248,7 +248,6 @@ def get_cls_results(det_results, annotations, class_id):
Returns:
tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes
"""
-
cls_dets = [img_res[class_id] for img_res in det_results]
cls_gts = []
cls_gts_ignore = []
@@ -261,7 +260,6 @@ def get_cls_results(det_results, annotations, class_id):
cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :])
else:
cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32))
-
return cls_dets, cls_gts, cls_gts_ignore
@@ -316,6 +314,7 @@ def eval_map(det_results,
# get gt and det bboxes of this class
cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
det_results, annotations, i)
+
# choose proper function according to datasets to compute tp and fp
if dataset in ['det', 'vid']:
tpfp_func = tpfp_imagenet
@@ -341,7 +340,9 @@ def eval_map(det_results,
num_gts[k] += np.sum((gt_areas >= min_area)
& (gt_areas < max_area))
# sort all det bboxes by score, also sort tp and fp
+
cls_dets = np.vstack(cls_dets)
+
num_dets = cls_dets.shape[0]
sort_inds = np.argsort(-cls_dets[:, -1])
tp = np.hstack(tp)[:, sort_inds]
@@ -357,7 +358,7 @@ def eval_map(det_results,
recalls = recalls[0, :]
precisions = precisions[0, :]
num_gts = num_gts.item()
- mode = 'area' if dataset != 'voc07' or 'hdc' else '11points'
+ mode = 'area' if dataset not in ('voc07', 'HDCDataset', 'hdc') else '11points'
ap = average_precision(recalls, precisions, mode)
eval_results.append({
'num_gts': num_gts,
@@ -420,6 +421,7 @@ def print_map_summary(mean_ap,
if scale_ranges is not None:
assert len(scale_ranges) == num_scales
+
num_classes = len(results)
recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
aps = np.zeros((num_scales, num_classes), dtype=np.float32)
diff --git a/examples/al_object_detection/mmdet/core/mask/structures.py b/examples/al_object_detection/mmdet/core/mask/structures.py
index 8bbbfe47..093fc4b4 100644
--- a/examples/al_object_detection/mmdet/core/mask/structures.py
+++ b/examples/al_object_detection/mmdet/core/mask/structures.py
@@ -571,5 +571,5 @@ def polygon_to_bitmap(polygons, height, width):
"""
rles = maskUtils.frPyObjects(polygons, height, width)
rle = maskUtils.merge(rles)
- bitmap_mask = maskUtils.decode(rle).astype(np.bool)
+ bitmap_mask = maskUtils.decode(rle).astype(bool)
return bitmap_mask
diff --git a/examples/al_object_detection/mmdet/datasets/hdc.py b/examples/al_object_detection/mmdet/datasets/hdc.py
index ebf2a3e9..233407a0 100644
--- a/examples/al_object_detection/mmdet/datasets/hdc.py
+++ b/examples/al_object_detection/mmdet/datasets/hdc.py
@@ -6,7 +6,7 @@
@DATASETS.register_module()
class HDCDataset(XMLDataset):
- CLASSES = ['Defect', 'Background']
+ CLASSES = ['Defect']
def __init__(self, **kwargs):
#print("came in VOCDataset")
diff --git a/examples/al_object_detection/mmdet/datasets/xml_style.py b/examples/al_object_detection/mmdet/datasets/xml_style.py
index e5b7cae6..c92c64ea 100644
--- a/examples/al_object_detection/mmdet/datasets/xml_style.py
+++ b/examples/al_object_detection/mmdet/datasets/xml_style.py
@@ -8,6 +8,7 @@
from .builder import DATASETS
from .custom import CustomDataset
+ANNOTATIONS_FOLDER = 'Annotations_no_dummy'
@DATASETS.register_module()
class XMLDataset(CustomDataset):
@@ -38,11 +39,13 @@ def load_annotations(self, ann_file):
img_ids = mmcv.list_from_file(ann_file)
for img_id in img_ids:
filename = f'JPEGImages/{img_id}.jpg'
- xml_path = osp.join(self.img_prefix, 'Annotations',
+ xml_path = osp.join(self.img_prefix, ANNOTATIONS_FOLDER,
f'{img_id}.xml')
- tree = ET.parse(xml_path)
- root = tree.getroot()
- size = root.find('size')
+ size = None
+ if osp.exists(xml_path):
+ tree = ET.parse(xml_path)
+ root = tree.getroot()
+ size = root.find('size')
width = 0
height = 0
if size is not None:
@@ -63,15 +66,16 @@ def get_subset_by_classes(self):
subset_data_infos = []
for data_info in self.data_infos:
img_id = data_info['id']
- xml_path = osp.join(self.img_prefix, 'Annotations',
+ xml_path = osp.join(self.img_prefix, ANNOTATIONS_FOLDER,
f'{img_id}.xml')
- tree = ET.parse(xml_path)
- root = tree.getroot()
- for obj in root.findall('object'):
- name = obj.find('name').text
- if name in self.CLASSES:
- subset_data_infos.append(data_info)
- break
+ if osp.exists(xml_path):
+ tree = ET.parse(xml_path)
+ root = tree.getroot()
+ for obj in root.findall('object'):
+ name = obj.find('name').text
+ if name in self.CLASSES:
+ subset_data_infos.append(data_info)
+ break
return subset_data_infos
@@ -86,50 +90,55 @@ def get_ann_info(self, idx):
"""
img_id = self.data_infos[idx]['id']
- xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml')
- tree = ET.parse(xml_path)
- root = tree.getroot()
+ xml_path = osp.join(self.img_prefix, ANNOTATIONS_FOLDER, f'{img_id}.xml')
bboxes = []
labels = []
bboxes_ignore = []
labels_ignore = []
- for obj in root.findall('object'):
- name = obj.find('name').text
- if name not in self.CLASSES:
- continue
- label = self.cat2label[name]
- difficult = int(obj.find('difficult').text)
- bnd_box = obj.find('bndbox')
- # TODO: check whether it is necessary to use int
- # Coordinates may be float type
- bbox = [
- int(float(bnd_box.find('xmin').text)),
- int(float(bnd_box.find('ymin').text)),
- int(float(bnd_box.find('xmax').text)),
- int(float(bnd_box.find('ymax').text))
- ]
- ignore = False
- if self.min_size:
- assert not self.test_mode
- w = bbox[2] - bbox[0]
- h = bbox[3] - bbox[1]
- if w < self.min_size or h < self.min_size:
- ignore = True
- if difficult or ignore:
- bboxes_ignore.append(bbox)
- labels_ignore.append(label)
- else:
- bboxes.append(bbox)
- labels.append(label)
+ if osp.exists(xml_path):
+ tree = ET.parse(xml_path)
+ root = tree.getroot()
+ for obj in root.findall('object'):
+ name = obj.find('name').text
+ if name not in self.CLASSES:
+ continue
+ label = self.cat2label[name]
+ difficult = int(obj.find('difficult').text)
+ bnd_box = obj.find('bndbox')
+ # TODO: check whether it is necessary to use int
+ # Coordinates may be float type
+ bbox = [
+ int(float(bnd_box.find('xmin').text)),
+ int(float(bnd_box.find('ymin').text)),
+ int(float(bnd_box.find('xmax').text)),
+ int(float(bnd_box.find('ymax').text))
+ ]
+ ignore = False
+ if self.min_size:
+ assert not self.test_mode
+ w = bbox[2] - bbox[0]
+ h = bbox[3] - bbox[1]
+ if w < self.min_size or h < self.min_size:
+ ignore = True
+ if difficult or ignore:
+ bboxes_ignore.append(bbox)
+ labels_ignore.append(label)
+ else:
+ bboxes.append(bbox)
+ labels.append(label)
if not bboxes:
- bboxes = np.zeros((0, 4))
- labels = np.zeros((0, ))
+ #bboxes = np.zeros((0, 4))
+ #labels = np.zeros((0, ))
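+ # fall back to a single dummy box (coordinates offset to -1) rather than an empty array when an image has no usable annotations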
+ bboxes = np.array([[0, 0, 10, 10]], ndmin=2) -1
+ labels = np.zeros((1, ))
else:
bboxes = np.array(bboxes, ndmin=2) - 1
labels = np.array(labels)
if not bboxes_ignore:
- bboxes_ignore = np.zeros((0, 4))
- labels_ignore = np.zeros((0, ))
+ #bboxes_ignore = np.zeros((0, 4))
+ #labels_ignore = np.zeros((0, ))
+ bboxes_ignore = np.array([[0, 0, 10, 10]], ndmin=2) -1
+ labels_ignore = np.zeros((1, ))
else:
bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
labels_ignore = np.array(labels_ignore)
@@ -152,14 +161,15 @@ def get_cat_ids(self, idx):
cat_ids = []
img_id = self.data_infos[idx]['id']
- xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml')
- tree = ET.parse(xml_path)
- root = tree.getroot()
- for obj in root.findall('object'):
- name = obj.find('name').text
- if name not in self.CLASSES:
- continue
- label = self.cat2label[name]
- cat_ids.append(label)
+ xml_path = osp.join(self.img_prefix, ANNOTATIONS_FOLDER, f'{img_id}.xml')
+ if osp.exists(xml_path):
+ tree = ET.parse(xml_path)
+ root = tree.getroot()
+ for obj in root.findall('object'):
+ name = obj.find('name').text
+ if name not in self.CLASSES:
+ continue
+ label = self.cat2label[name]
+ cat_ids.append(label)
return cat_ids
diff --git a/examples/al_object_detection/mmdet/models/dense_heads/MIAOD_head.py b/examples/al_object_detection/mmdet/models/dense_heads/MIAOD_head.py
index 7c86dedf..8a494fab 100644
--- a/examples/al_object_detection/mmdet/models/dense_heads/MIAOD_head.py
+++ b/examples/al_object_detection/mmdet/models/dense_heads/MIAOD_head.py
@@ -369,6 +369,9 @@ def l_det(self, y_head_f_single, y_head_f_r_single, x_i_single, y_cls_single, la
x_i_single = x_i_single.reshape(-1, 4)
y_head_f_r_single = self.bbox_coder.decode(x_i_single, y_head_f_r_single)
l_det_loc = self.SmoothL1(y_head_f_r_single, y_loc_single, bbox_weights, avg_factor=num_total_samples)
+
+ l_det_loc = l_det_loc.clamp(None, 10)
+
return l_det_cls, l_det_loc
# Label Set Training
@@ -477,7 +480,7 @@ def L_wave_min(self, y_f, y_f_r, y_head_cls, y_loc_img, y_cls_img, img_metas, y_
l_det_cls2, l_det_loc2 = multi_apply(self.l_det, y_f[1], y_f_r, all_x_i,
y_cls, label_weights_list, y_loc, bbox_weights_list,
num_total_samples=num_total_samples)
- if y_loc_img[0][0][0] < 0:
+ if img_metas[0]['is_unlabeled'] :
l_det_cls = list(map(lambda m, n: (m + n) * 0, l_det_cls1, l_det_cls2))
l_det_loc = list(map(lambda m, n: (m + n) * 0, l_det_loc1, l_det_loc2))
for (i, value) in enumerate(l_det_loc):
@@ -562,8 +565,8 @@ def L_wave_max(self, y_f, y_f_r, y_head_cls, y_loc_img, y_cls_img, img_metas, y_
num_total_samples=num_total_samples)
l_det_cls2, l_det_loc2 = multi_apply(self.l_det, y_f[1], y_f_r, all_x_i,
y_cls, label_weights_list, y_loc, bbox_weights_list,
- num_total_samples=num_total_samples)
- if y_loc_img[0][0][0] < 0:
+ num_total_samples=num_total_samples)
+ if img_metas[0]['is_unlabeled']:
l_det_cls = list(map(lambda m, n: (m + n) * 0, l_det_cls1, l_det_cls2))
l_det_loc = list(map(lambda m, n: (m + n) * 0, l_det_loc1, l_det_loc2))
for (i, value) in enumerate(l_det_loc):
@@ -712,3 +715,25 @@ def _get_bboxes_single(self, y_head_f_single_list, y_head_f_r_single_list,
def loss(self, **kwargs):
# This function is to avoid the TypeError caused by the abstract method defined in "base_dense_head.py".
return
+
+ @force_fp32(apply_to=('y_f_r', ))
+ def get_uncertain_bboxes(self, arg, y_f_r, img_shape, scale_factor, nuboxes, cfg=None, rescale=False):
+ cfg = self.test_cfg if cfg is None else cfg
+ num_levels = len(y_f_r)
+ assert y_f_r[0].size(0) == 1
+ device = y_f_r[0].device
+ featmap_sizes = [y_f_r[i].shape[-2:] for i in range(num_levels)]
+ mlvl_anchors = self.anchor_generator.grid_anchors(featmap_sizes, device=
+device)
+ y_head_f_r = []
+ for y_head_f_r_single in y_f_r:
+ y_head_f_r.append(y_head_f_r_single.permute(0,2,3,1).reshape(-1, 4))
+ y_head_f_r = torch.cat(y_head_f_r, 0)
+ mlvl_anchors = torch.cat(mlvl_anchors, 0)
+ assert y_head_f_r.size(0) == mlvl_anchors.size(0)
+ y_head_f_r = y_head_f_r[arg[-nuboxes:]]
+ mlvl_anchors = mlvl_anchors[arg[-nuboxes:]]
+ bboxes = self.bbox_coder.decode(mlvl_anchors, y_head_f_r, max_shape=img_shape)
+ if rescale:
+ bboxes /= bboxes.new_tensor(scale_factor)
+ return bboxes
diff --git a/examples/al_object_detection/mmdet/models/detectors/single_stage.py b/examples/al_object_detection/mmdet/models/detectors/single_stage.py
index b52c2edd..a541c680 100644
--- a/examples/al_object_detection/mmdet/models/detectors/single_stage.py
+++ b/examples/al_object_detection/mmdet/models/detectors/single_stage.py
@@ -83,7 +83,7 @@ def forward_train(self, x, img_metas, y_loc_img, y_cls_img, y_loc_img_ignore=Non
losses = self.bbox_head.forward_train(x, img_metas, y_loc_img, y_cls_img, y_loc_img_ignore)
return losses
- def simple_test(self, x, img_metas, return_box=True, rescale=False):
+ def simple_test(self, x, img_metas, return_box=True, rescale=False, uncertain_box=False):
"""Test function without test time augmentation.
Args:
@@ -104,7 +104,10 @@ def simple_test(self, x, img_metas, return_box=True, rescale=False):
y_head_f_1_1level.append(y_head_f_i_single.permute(0,2,3,1).reshape(-1, self.bbox_head.cls_out_channels))
for y_head_f_i_single in y_head_f_2:
y_head_f_2_1level.append(y_head_f_i_single.permute(0,2,3,1).reshape(-1, self.bbox_head.cls_out_channels))
- return y_head_f_1_1level, y_head_f_2_1level, y_head_cls
+ if uncertain_box:
+ return y_head_f_1_1level, y_head_f_2_1level, y_head_f_r, y_head_cls
+ else:
+ return y_head_f_1_1level, y_head_f_2_1level, y_head_cls
outs = (y_head_f_1, y_head_f_r)
y_head_loc_cls = self.bbox_head.get_bboxes(*outs, img_metas, rescale=rescale)
# skip post-processing when exporting to ONNX
diff --git a/examples/al_object_detection/mmdet/utils/active_datasets.py b/examples/al_object_detection/mmdet/utils/active_datasets.py
index db5dbf3e..d56587c5 100644
--- a/examples/al_object_detection/mmdet/utils/active_datasets.py
+++ b/examples/al_object_detection/mmdet/utils/active_datasets.py
@@ -57,7 +57,7 @@ def load_ann_list(paths):
return anns
-def update_X_L(uncertainty, X_all, X_L, X_S_size):
+def update_X_L(uncertainty, X_all, X_L, X_S_size, return_X_S=False):
uncertainty = uncertainty.cpu().numpy()
all_X_U = np.array(list(set(X_all) - set(X_L)))
uncertainty_X_U = uncertainty[all_X_U]
@@ -72,4 +72,25 @@ def update_X_L(uncertainty, X_all, X_L, X_S_size):
X_U_next = np.concatenate((X_U_next, X_L_next[:X_L_next.shape[0] - X_U_next.shape[0]]))
X_L_next.sort()
X_U_next.sort()
- return X_L_next, X_U_next
+ if return_X_S:
+ return X_L_next, X_U_next, X_S
+ else:
+ return X_L_next, X_U_next
+
+def update_X_L_random(X_all, X_L, X_S_size, return_X_S=False):
+ all_X_U = np.array(list(set(X_all) - set(X_L)))
+ np.random.shuffle(all_X_U)
+ X_S = all_X_U[:X_S_size]
+ X_L_next = np.concatenate((X_L, X_S))
+ all_X_U_next = np.array(list(set(X_all) - set(X_L_next)))
+ np.random.shuffle(all_X_U_next)
+ X_U_next = all_X_U_next[:X_L_next.shape[0]]
+ if X_L_next.shape[0] > X_U_next.shape[0]:
+ np.random.shuffle(X_L_next)
+ X_U_next = np.concatenate((X_U_next, X_L_next[:X_L_next.shape[0] - X_U_next.shape[0]]))
+ X_L_next.sort()
+ X_U_next.sort()
+ if return_X_S:
+ return X_L_next, X_U_next, X_S
+ else:
+ return X_L_next, X_U_next
diff --git a/examples/al_object_detection/mmdet/utils/hdc/active_datasets.py b/examples/al_object_detection/mmdet/utils/hdc/active_datasets.py
index baa5d010..4135656f 100644
--- a/examples/al_object_detection/mmdet/utils/hdc/active_datasets.py
+++ b/examples/al_object_detection/mmdet/utils/hdc/active_datasets.py
@@ -58,7 +58,7 @@ def load_ann_list(paths):
return anns
-def update_X_L(uncertainty, X_all, X_L, X_S_size):
+def update_X_L(uncertainty, X_all, X_L, X_S_size, return_X_S=False):
uncertainty = uncertainty.cpu().numpy()
all_X_U = np.array(list(set(X_all) - set(X_L)))
uncertainty_X_U = uncertainty[all_X_U]
@@ -73,4 +73,7 @@ def update_X_L(uncertainty, X_all, X_L, X_S_size):
X_U_next = np.concatenate((X_U_next, X_L_next[:X_L_next.shape[0] - X_U_next.shape[0]]))
X_L_next.sort()
X_U_next.sort()
- return X_L_next, X_U_next
+ if return_X_S:
+ return X_L_next, X_U_next, X_S
+ else:
+ return X_L_next, X_U_next
diff --git a/examples/al_object_detection/output_lists.py b/examples/al_object_detection/output_lists.py
new file mode 100644
index 00000000..24b24d05
--- /dev/null
+++ b/examples/al_object_detection/output_lists.py
@@ -0,0 +1,144 @@
+import argparse
+import os
+import os.path as osp
+import json
+
+from mmcv import Config
+
+import mmcv
+import numpy as np
+from mmdet.apis import set_random_seed
+from mmdet.utils import get_root_logger
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Output list of images to be labeled')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--selected',
+ help='numpy file with indexes of samples labeled before this cycle')
+ parser.add_argument('--selected_next',
+ help='numpy file with indexes of samples to be labeled in this cycle')
+ parser.add_argument('--train',
+ help='file with list of all training samples')
+ parser.add_argument('--map',
+ help='file with mapping between image number names and full names')
+ parser.add_argument('--label_next',
+ help='text file with list of samples to be labeled in this cycle')
+ parser.add_argument('--labeled',
+ help='text file with list of all samples labeled after this cycle')
+ parser.add_argument('--unlabeled',
+ help='text file with list of all samples unlabeled after this cycle')
+ parser.add_argument('--cycle_config',
+ help='config file with active learning cycle and seed')
+ parser.add_argument('--cycle_config_next',
+ help='config file with next active learning cycle and seed')
+ parser.add_argument('--work_directory',
+ help='the dir to save logs and model checkpoints')
+ parser.add_argument('--seed', type=int, default=666, help='random seed')
+ parser.add_argument('--deterministic', action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument('--stage_name', help='Name for current execution')
+ parser.add_argument('--execution_name', help='Name for current execution')
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+
+ assert (args.selected and args.selected_next and args.train and
+ args.map and args.label_next and args.labeled and
+ args.unlabeled and args.cycle_config and args.cycle_config_next), \
+ ('Please specify file names of numpy lists of images labeled before '
+ 'this cycle and in this cycle with arguments "--selected" and '
+ '"--selected_next". Also specify the file name with the list of all '
+ 'images that the lists of selected and unselected samples point to, '
+ 'with argument "--train". Then specify file name for the map between '
+ 'image number names and full image names with argument "--map". '
+ 'Finally, specify names for text files output by this code with lists '
+ 'of images to be labeled in this cycle, all labeled after this cycle '
+ 'and remaining unlabeled, with arguments "--label_next", "--labeled" '
+ 'and "--unlabeled". Also specify config file with active learning '
+ 'cycle number and random number generator seed with argument '
+ '"--cycle_config" and the name of the next cycle config file output '
+ 'by this code with argument "--cycle_config_next"')
+
+ cfg = Config.fromfile(args.config)
+
+ # work_directory is determined in this priority: CLI > config > default
+ if args.work_directory is not None:
+ # update work_directory from CLI args if args.work_directory is not None
+ cfg.work_directory = args.work_directory
+ elif cfg.get('work_directory', None) is None:
+ # derive work_directory from config name if cfg.work_directory is None
+ cfg.work_directory = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+
+ # create work_directory
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_directory))
+
+ stage_name = args.stage_name
+ os.environ['stage_name'] = stage_name
+
+ execution_name = args.execution_name
+ os.environ['execution_name'] = execution_name
+
+ # init the logger before other steps
+ log_file = osp.join(cfg.work_directory, 'output_sample_lists.log')
+ logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+ # set random seed
+ if args.seed is not None:
+ logger.info(
+ f'Set random seed to {args.seed}, deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+
+ # load the map of image number to image name
+ map = {}
+ with open(args.map) as f:
+ for line in f:
+ tokens = line.split(",")
+ assert (len(tokens) == 2), \
+ (f'Invalid format of file {args.map} : Expected 2 tokens '
+ f'per line, received {len(tokens)} : {line}')
+ map[int(tokens[0])] = tokens[1].strip()
+
+ # get the lists of images to label, all labeled and unlabeled
+ labeled_indexes = np.load(args.selected)
+ labeled_next_indexes = np.load(args.selected_next)
+ to_label_indexes = np.array(list(set(labeled_next_indexes) -
+ set(labeled_indexes)))
+ with open(args.train) as f:
+ line = f.readline().strip()
+ num_digits = len(line)
+ all = np.loadtxt(args.train, dtype=np.uintc)
+ to_label = all[to_label_indexes]
+ to_label.sort()
+ labeled = all[labeled_next_indexes]
+ labeled.sort()
+ unlabeled = np.delete(all, labeled_next_indexes)
+ unlabeled.sort()
+
+ # save the lists after converting from file numbers to file names in
+ # the list of images to label
+ with open(args.label_next, 'w') as f:
+ for image_number in to_label:
+ image_name = map[image_number]
+ f.write(image_name + "\n")
+ np.savetxt(args.labeled, labeled, fmt='%0'+str(num_digits)+'u')
+ np.savetxt(args.unlabeled, unlabeled, fmt='%0'+str(num_digits)+'u')
+
+ # Update active learning config file with new cycle and random number
+ # generator seed
+ cycle_config = {}
+ with open(args.cycle_config) as f:
+ cycle_config = json.load(f)
+ cycle_config['al_cycle'] += 1
+ cycle_config['next_cycle'] += 1
+ rng = np.random.default_rng(args.seed)
+ cycle_config['al_seed'] = int(rng.integers(1000))
+ with open(args.cycle_config_next, "w") as f:
+ json.dump(cycle_config, f, indent = 4)
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/al_object_detection/test_hints.sh b/examples/al_object_detection/test_hints.sh
new file mode 100755
index 00000000..84e6a145
--- /dev/null
+++ b/examples/al_object_detection/test_hints.sh
@@ -0,0 +1 @@
+python cycle_select.py configs/MIAOD-GRAY.py --work_directory work_dirs/test --cycle 0 --model work_dirs/test/cycle.pth --labeled work_dirs/test/X_L_0.npy --seed 666 --labeled_next work_dirs/test/X_L_1.npy --unselected work_dirs/test/X_U_1.npy --bbox_output work_dirs/test/labeling_hints_1.txt
diff --git a/examples/al_object_detection/train.py b/examples/al_object_detection/train.py
new file mode 100644
index 00000000..e4aefecb
--- /dev/null
+++ b/examples/al_object_detection/train.py
@@ -0,0 +1,216 @@
+import argparse
+import copy
+import os
+import os.path as osp
+import time
+
+import mmcv
+import torch
+import random
+import numpy as np
+from mmcv import Config, DictAction
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import init_dist, load_checkpoint, save_checkpoint
+from mmcv.utils import get_git_hash
+
+from mmdet import __version__
+from mmdet.apis import set_random_seed, train_detector
+from mmdet.apis import single_gpu_test, calculate_uncertainty
+from mmdet.datasets import build_dataloader, build_dataset
+from mmdet.models import build_detector
+from mmdet.utils import collect_env, get_root_logger
+#from mmdet.utils.active_datasets import *
+from mmdet.utils.hdc.active_datasets import *
+from tools.utils import losstype
+from torch import distributed as dist
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--cycle', help='active learning iteration >= 0')
+ parser.add_argument('--labeled', help='labeled samples list file')
+ parser.add_argument('--unselected', help='unselected samples list file')
+ parser.add_argument('--model_prev',
+ help='previous cycle model checkpoint file')
+ parser.add_argument('--model', help='model checkpoint file')
+ parser.add_argument('--work_directory',
+ help='the dir to save logs and model checkpoints')
+ parser.add_argument('--no-validate', action='store_false',
+ help='whether not to evaluate the checkpoint during training')
+ group_gpus = parser.add_mutually_exclusive_group()
+ group_gpus.add_argument('--gpus', type=int,
+ help='number of gpus to use (only applicable to non-distributed run)')
+ group_gpus.add_argument('--gpu_ids', type=int, nargs='+',
+ help='ids of gpus to use (only applicable to non-distributed run)')
+ parser.add_argument('--seed', type=int, default=666, help='random seed')
+ parser.add_argument('--deterministic', action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument('--options', nargs='+', action=DictAction,
+ help='arguments in dict')
+ parser.add_argument('--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none', help='job launcher')
+ parser.add_argument('--local_rank', type=int, default=0)
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+ return args
+
+
+def main():
+ args = parse_args()
+
+ assert (args.cycle and args.labeled and args.unselected and args.model), \
+ ('Please specify active learning cycle, file names of labeled '
+ 'and unselected image list files, and model checkpoint file '
+ 'with arguments "--cycle", "--labeled", "--unselected" and "--model"')
+
+ cfg = Config.fromfile(args.config)
+ if args.options is not None:
+ cfg.merge_from_dict(args.options)
+
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+
+ # work_directory is determined in this priority: CLI > config > default
+ if args.work_directory is not None:
+ # update work_directory from CLI args if args.work_directory is not None
+ cfg.work_directory = args.work_directory
+ elif cfg.get('work_directory', None) is None:
+ # derive work_directory from config name if cfg.work_directory is None
+ cfg.work_directory = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+
+ # TO DO: placeholder for distributed processing. Will require code changes
+ if args.gpu_ids is not None:
+ cfg.gpu_ids = args.gpu_ids
+ else:
+ cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+
+ # create work_directory
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_directory))
+
+ # dump config
+ cfg.dump(osp.join(cfg.work_directory,
+ f'cycle_train{args.cycle}_' + osp.basename(args.config)))
+
+ # init the logger before other steps
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ log_file = osp.join(cfg.work_directory, f'cycle_train{args.cycle}.log')
+ logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+ # init the meta dict to record some important information
+ # such as environment info and seed, which will be logged
+ meta = dict()
+
+ # log env info
+ env_info_dict = collect_env()
+ env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line)
+ meta['env_info'] = env_info
+
+ # log some basic info
+ logger.info(f'Distributed training: {distributed}')
+ logger.info(f'Config:\n{cfg.pretty_text}')
+
+ # set random seeds
+ if args.seed is not None:
+ logger.info(
+ f'Set random seed to {args.seed}, deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+ meta['seed'] = args.seed
+
+ # load lists of all, labeled and unselected images
+ all_anns = load_ann_list(cfg.data.train.dataset.ann_file)
+
+ if len(all_anns[0]) == 1:
+ X_all = np.arange(len(all_anns))
+ else:
+ j = 0
+ for i in range(len(all_anns)):
+ j += len(all_anns[i])
+ X_all = np.arange(j)
+ X_L = np.load(args.labeled)
+ X_U = np.load(args.unselected)
+ cycle = args.cycle
+ cfg.cycles = [args.cycle]
+ initial_step = cfg.lr_config.step
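+ # Save the configured LR schedule step: it is restored only for the final epoch of the
+ # loop below, while earlier epochs use a far-off step ([1000]) so the LR stays constant.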
+
+ # get the config of the labeled dataset
+ cfg = create_X_L_file(cfg, X_L, all_anns, cycle)
+
+ # build the model and load checkpoint, if specified
+ if args.model_prev is not None:
+ cfg.model.pretrained = None
+ if cfg.model.get('neck'):
+ if cfg.model.neck.get('rfp_backbone'):
+ if cfg.model.neck.rfp_backbone.get('pretrained'):
+ cfg.model.neck.rfp_backbone.pretrained = None
+ model = build_detector(cfg.model, train_cfg=cfg.train_cfg,
+ test_cfg=cfg.test_cfg)
+ if args.model_prev is not None:
+ checkpoint = load_checkpoint(model, args.model_prev)
+
+ # load dataset
+ datasets = [build_dataset(cfg.data.train)]
+ if len(cfg.workflow) == 2:
+
+ val_dataset = copy.deepcopy(cfg.data.val)
+ val_dataset.pipeline = cfg.data.train.dataset.pipeline
+ datasets.append(build_dataset(val_dataset))
+ model.CLASSES = datasets[0].CLASSES
+
+ # save mmdet version, config file content and class names in
+ # checkpoints as meta data
+ if cfg.checkpoint_config is None:
+ cfg.checkpoint_config = dict()
+ cfg.checkpoint_config.meta = dict(
+ mmdet_version=__version__ + get_git_hash()[:7],
+ config=cfg.pretty_text, CLASSES=datasets[0].CLASSES)
+
+ for epoch in range(cfg.epoch):
+ # Only in the last epoch does the learning rate need to be reduced
+ # and the model need to be evaluated.
+ if epoch == cfg.epoch - 1:
+ cfg.lr_config.step = initial_step
+ cfg.evaluation.interval = cfg.epoch_ratio[0]
+ else:
+ cfg.lr_config.step = [1000]
+ #cfg.evaluation.interval = 100
+
+ cfg = create_X_L_file(cfg, X_L, all_anns, cycle)
+
+ if dist.is_initialized():
+ torch.distributed.barrier()
+ datasets = [build_dataset(cfg.data.train)]
+ losstype.update_vars(0)
+ cfg.total_epochs = cfg.epoch_ratio[0]
+ cfg_bak = cfg.deepcopy()
+ for name, value in model.named_parameters():
+ value.requires_grad = True
+ time.sleep(2)
+ train_detector(model, datasets, cfg,
+ distributed=distributed,
+ validate=args.no_validate,
+ timestamp=timestamp, meta=meta)
+ cfg = cfg_bak
+
+ # save the model to checkpoint
+ save_checkpoint(model, args.model, meta=cfg.checkpoint_config.meta)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/al_object_detection/uuid.json b/examples/al_object_detection/uuid.json
new file mode 100644
index 00000000..5b05a272
--- /dev/null
+++ b/examples/al_object_detection/uuid.json
@@ -0,0 +1 @@
+{"uuid_var": "b4d14eb9-2c07-48fa-bcd6-63be0c8df8f4"}
\ No newline at end of file
diff --git a/examples/example-get-started/Query_Tester-base_mlmd.ipynb b/examples/example-get-started/Query_Tester-base_mlmd.ipynb
index a8588ea0..69ec3afa 100644
--- a/examples/example-get-started/Query_Tester-base_mlmd.ipynb
+++ b/examples/example-get-started/Query_Tester-base_mlmd.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 40,
"id": "d8cecabd",
"metadata": {},
"outputs": [],
@@ -24,25 +24,36 @@
"id": "9ae54008",
"metadata": {},
"source": [
- "##### Initialize the library and get all the stages in the pipeline\n",
+ "### Initialize the library and get all the stages in the pipeline\n",
"Point the library to the metadata file.
\n",
"The `get_pipeline_stages` call point to the different stages in the pipeline."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 53,
"id": "8b735117",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Prepare', 'Featurize', 'Train', 'Evaluate']"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"query = cmfquery.CmfQuery(\"./mlmd\")\n",
- "stages = query.get_pipeline_stages(\"Test-env\")\n",
- "print(stages)"
+ "pipelines = query.get_pipeline_names()\n",
+ "stages = query.get_pipeline_stages(pipelines[0])\n",
+ "display(stages)"
]
},
{
@@ -50,95 +61,340 @@
"id": "5ac1591f",
"metadata": {},
"source": [
- "##### Query the Executions in each stage"
+ "### Query the Executions in each stage"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 54,
"id": "da0ee66d",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
- "source": [
- "executions = query.get_all_executions_in_stage('Prepare')\n",
- "print(executions)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cbc05fd9",
- "metadata": {},
- "source": [
- "##### Query the Executions in each stage\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cd81efe0",
- "metadata": {
- "pycharm": {
- "is_executing": true
- }
- },
- "outputs": [],
- "source": [
- "executions = query.get_all_executions_in_stage('Featurize')\n",
- "print(executions)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd5c7d70",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "##### Query the Executions in each stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "32c8fd0c",
- "metadata": {
- "pycharm": {
- "is_executing": true
- }
- },
- "outputs": [],
- "source": [
- "executions = query.get_all_executions_in_stage('Train')\n",
- "print(executions)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0342196a",
- "metadata": {},
- "source": [
- "##### Query the Executions in each stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e1bcfcd0",
- "metadata": {
- "pycharm": {
- "is_executing": true
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Prepare\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 2 Prepare \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/parse.py', 'artifacts/data.xml.gz', 'artifacts/parsed'] \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 1 \n",
+ "\n",
+ " seed split \n",
+ "0 20170428 0.2 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Featurize\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 3 Featurize-execution \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/featurize.py', 'artifacts/parsed', 'artifacts/features'] \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 2 \n",
+ "\n",
+ " max_features ngrams \n",
+ "0 3000 2 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Train\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 4 Train-execution \n",
+ "\n",
+ " Execution Git_End_Commit \\\n",
+ "0 ['src/train.py', 'artifacts/features', 'artifacts/model'] \n",
+ "\n",
+ " Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \\\n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 3 \n",
+ "\n",
+ " min_split n_est seed \n",
+ "0 64 100 20170428 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying execution for stage Evaluate\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Context_ID Context_Type \\\n",
+ "0 5 Evaluate-execution \n",
+ "\n",
+ " Execution \\\n",
+ "0 ['src/test.py', 'artifacts/model', 'artifacts/features', 'artifacts/tes... \n",
+ "\n",
+ " Git_End_Commit Git_Repo \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote \n",
+ "\n",
+ " Git_Start_Commit Pipeline_Type Pipeline_id id \n",
+ "0 8158283953c04affb8fe5ea6710656564ede7d3a Test-env 1 4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- },
- "outputs": [],
+ ],
"source": [
- "executions = query.get_all_executions_in_stage('Evaluate')\n",
- "print(executions)"
+ "for stage in stages:\n",
+ " executions = query.get_all_executions_in_stage(stage)\n",
+ " print(f\"Displaying execution for stage {stage}\")\n",
+ " display(executions)"
]
},
{
@@ -146,44 +402,146 @@
"id": "5a3599af",
"metadata": {},
"source": [
- "##### Get all the artifacts of execution. \n",
+ "### Get all artifacts of an execution. \n",
"input parameter - execution_id
\n",
"output parameter - artifacts
\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 56,
"id": "6fa93876",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
- "source": [
- "artifacts = query.get_all_artifacts_for_execution(1)\n",
- "print(artifacts)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2e82cf28",
- "metadata": {
- "pycharm": {
- "is_executing": true
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Displaying the artifacts for execution with id 1 belonging to Prepare\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "1 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c 1667255774532 \n",
+ "2 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 1667255776391 \n",
+ "\n",
+ " event git_repo id \\\n",
+ "0 INPUT /tmp/cmf/example_get_started/git_remote 1 \n",
+ "1 OUTPUT /tmp/cmf/example_get_started/git_remote 2 \n",
+ "2 OUTPUT /tmp/cmf/example_get_started/git_remote 3 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255778222 \n",
+ "1 1667255774532 \n",
+ "2 1667255776391 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "1 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 Dataset \n",
+ "2 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value \n",
+ "1 22ec7737f442cfc81e8c701fb58d1007 NaN NaN \n",
+ "2 03e3627bda150c8cf51a55ef96ab3ede NaN NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- },
- "outputs": [],
+ ],
"source": [
- "# print(pd.options.display.max_colwidth)\n",
- "pd.options.display.max_colwidth = 75\n",
- "artifacts = query.get_all_artifacts_for_execution(2)\n",
"\n",
- "# print(artifacts)\n",
- "print(artifacts[\"name\"])\n",
- "print(artifacts[\"event\"])"
+ "executions = query.get_all_executions_in_stage(stages[0])\n",
+ "print(f\"Displaying the artifacts for execution with id {executions.iloc[0]['id']} belonging to {stages[0]}\")\n",
+ "artifacts = query.get_all_artifacts_for_execution(executions.iloc[0][\"id\"])\n",
+ "display(artifacts)"
]
},
{
@@ -191,22 +549,78 @@
"id": "d15b7386",
"metadata": {},
"source": [
- "#### get all executions for an artifact(pass the artifact full name as the input parameter)"
+ "### Get all executions for an artifact (pass the artifact full name as the input parameter)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 57,
"id": "f1632d60",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Type execution_id execution_name pipeline stage\n",
+ "0 INPUT 1 Test-env Prepare"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "linked = query.get_all_executions_for_artifact(\"artifacts/data.xml.gz:a304afb96060aad90176268345e10355\")\n",
- "print(linked)"
+ "#Provide the artifact in name:hash format\n",
+ "artifacts = query.get_all_artifacts_for_execution(executions.iloc[0]['id'])\n",
+ "for index, art in artifacts.iterrows():\n",
+ " if art[\"event\"] == \"INPUT\":\n",
+ " artifact_name = art[\"name\"]\n",
+ " break\n",
+ "linked = query.get_all_executions_for_artifact(artifact_name)\n",
+ "display(linked)"
]
},
{
@@ -214,22 +628,138 @@
"id": "7ad864e3",
"metadata": {},
"source": [
- "#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain"
+ "### Get all the parent artifacts of an artifact. Provides the artifact lineage chain"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 58,
"id": "09652709",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parent artifact of artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c 1667255774532 \n",
+ "1 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 1667255776391 \n",
+ "2 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "\n",
+ " git_repo id last_update_time_since_epoch \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote 2 1667255774532 \n",
+ "1 /tmp/cmf/example_get_started/git_remote 3 1667255776391 \n",
+ "2 /tmp/cmf/example_get_started/git_remote 1 1667255778222 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 Dataset \n",
+ "1 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede Dataset \n",
+ "2 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 22ec7737f442cfc81e8c701fb58d1007 NaN NaN \n",
+ "1 03e3627bda150c8cf51a55ef96ab3ede NaN NaN \n",
+ "2 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "linked = query.get_all_parent_artifacts(\"artifacts/features/test.pkl\")\n",
- "print(linked)"
+ "artifacts = query.get_all_artifacts_for_execution(2)\n",
+ "for index, art in artifacts.iterrows():\n",
+ " if art[\"event\"] == \"OUTPUT\":\n",
+ " artifact_name = art[\"name\"]\n",
+ " break\n",
+ "print(f\"Parent artifact of {artifact_name}\")\n",
+ "linked = query.get_all_parent_artifacts(artifact_name)\n",
+ "display(linked)"
]
},
{
@@ -237,25 +767,133 @@
"id": "63b615f1",
"metadata": {},
"source": [
- "#### Get all the child artifacts of an artifact. Provides the lineage chain in the downstream direction"
+ "### Get all child artifacts of an artifact. It provides the lineage chain of successors"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 46,
"id": "57b85ea6",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Child artifact of artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "1 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch id last_update_time_since_epoch \\\n",
+ "0 1667255811813 6 1667255811813 \n",
+ "1 1667255818388 7 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN SKlearn \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "1 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "1 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "linked = query.get_all_child_artifacts(\"artifacts/features/train.pkl\")\n",
- "\n",
- "print(\"Name : \" + linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"Type : \" + linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI : \" + linked[\"uri\"].to_string(index=False, header=False))"
+ "print(f\"Child artifact of {artifact_name}\")\n",
+ "linked = query.get_all_child_artifacts(artifact_name)\n",
+ "display(linked)\n"
]
},
{
@@ -267,64 +905,313 @@
}
},
"source": [
- "#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain"
+ "### Get all the parent artifacts of an artifact. Provides the artifact lineage chain of predecessors"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 59,
"id": "493bd571",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
- "source": [
- "linked = query.get_all_parent_artifacts(\"artifacts/model/model.pkl\")\n",
- "print(\"NAME\")\n",
- "print(linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"TYPE\")\n",
- "print(linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI\")\n",
- "print(linked[\"uri\"].to_string(index=False, header=False))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a4c338be",
- "metadata": {
- "pycharm": {
- "is_executing": true
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 03c25dfdb6c188b7b04f7e675dec072de192b851 1667255770792 \n",
+ "\n",
+ " git_repo id last_update_time_since_epoch \\\n",
+ "0 /tmp/cmf/example_get_started/git_remote 1 1667255778222 \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/data.xml.gz:236d9502e0283d91f689d7038b8508a2 Dataset \n",
+ "\n",
+ " uri user-metadata1 user-metadata2 \n",
+ "0 236d9502e0283d91f689d7038b8508a2 metadata_value metadata_value "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- },
- "outputs": [],
+ ],
"source": [
- "linked = query.get_all_parent_artifacts(\"artifacts/parsed/test.tsv\")\n",
- "print(\"Name : \" + linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"Type : \" + linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI : \" + linked[\"uri\"].to_string(index=False, header=False))"
+ "\n",
+ "linked = query.get_all_parent_artifacts(linked.iloc[0][\"name\"])\n",
+ "display(linked)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 60,
"id": "4eb35ec5",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 4fba7197919fb85dd1a0899d2cf5c5c690ee607c NaN \n",
+ "1 commit 5dfd3ac63c950f6394e5b7cebd55343402c7fdb6 NaN \n",
+ "2 commit 4546b0679bcae18bd85893c69581db91da40495c NaN \n",
+ "3 commit d67bedaa20e64e45fe9f553935d9ff0726f19b59 NaN \n",
+ "4 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "5 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch git_repo id \\\n",
+ "0 1667255774532 /tmp/cmf/example_get_started/git_remote 2 \n",
+ "1 1667255776391 /tmp/cmf/example_get_started/git_remote 3 \n",
+ "2 1667255800206 /tmp/cmf/example_get_started/git_remote 4 \n",
+ "3 1667255802382 /tmp/cmf/example_get_started/git_remote 5 \n",
+ "4 1667255811813 NaN 6 \n",
+ "5 1667255818388 NaN 7 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255774532 \n",
+ "1 1667255776391 \n",
+ "2 1667255800206 \n",
+ "3 1667255802382 \n",
+ "4 1667255811813 \n",
+ "5 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN SKlearn \n",
+ "5 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 RandomForestClassifier:default RandomForestClassifier \n",
+ "5 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/parsed/train.tsv:22ec7737f442cfc81e8c701fb58d1007 NaN \n",
+ "1 artifacts/parsed/test.tsv:03e3627bda150c8cf51a55ef96ab3ede NaN \n",
+ "2 artifacts/features/train.pkl:5de5e987eadb4b86fc47604b59cb3725 NaN \n",
+ "3 artifacts/features/test.pkl:b1f98b4ebd09a0bdc72f1a8c102065dd NaN \n",
+ "4 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "5 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Dataset 22ec7737f442cfc81e8c701fb58d1007 \n",
+ "1 Dataset 03e3627bda150c8cf51a55ef96ab3ede \n",
+ "2 Dataset 5de5e987eadb4b86fc47604b59cb3725 \n",
+ "3 Dataset b1f98b4ebd09a0bdc72f1a8c102065dd \n",
+ "4 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "5 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "linked = query.get_all_child_artifacts(\"artifacts/parsed/test.tsv\")\n",
- "print(\"NAME\")\n",
- "print(linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"TYPE\")\n",
- "print(linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI\")\n",
- "print(linked[\"uri\"].to_string(index=False, header=False))"
+ "linked = query.get_all_child_artifacts(linked.iloc[0][\"name\"])\n",
+ "display(linked)"
]
},
{
@@ -332,53 +1219,225 @@
"id": "29060ba9",
"metadata": {},
"source": [
- "#### Get immediate child artifacts of an artifact. "
+ "### Get immediate child artifacts of an artifact. "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 61,
"id": "93bd401d",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit create_time_since_epoch \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a 1667255811813 \n",
+ "\n",
+ " id last_update_time_since_epoch model_framework \\\n",
+ "0 6 1667255811813 SKlearn \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "\n",
+ " name type \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 Model \n",
+ "\n",
+ " uri \n",
+ "0 5f6e4aa57cce9e3a0b2f12e5766d19be "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "linked = query.get_one_hop_child_artifacts(artifact_name)\n",
+ "display(linked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "529817a7",
+ "metadata": {},
"source": [
- "linked = query.get_one_hop_child_artifacts(\"artifacts/data.xml.gz\")\n",
- "print(\"NAME\")\n",
- "print(linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"TYPE\")\n",
- "print(linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI\")\n",
- "print(linked[\"uri\"].to_string(index=False, header=False))"
+ "### Get all child artifacts "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 62,
"id": "496ee2bc",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " Commit avg_prec \\\n",
+ "0 commit 8158283953c04affb8fe5ea6710656564ede7d3a NaN \n",
+ "1 NaN 0.526754 \n",
+ "\n",
+ " create_time_since_epoch id last_update_time_since_epoch \\\n",
+ "0 1667255811813 6 1667255811813 \n",
+ "1 1667255818388 7 1667255818388 \n",
+ "\n",
+ " metrics_name model_framework \\\n",
+ "0 NaN SKlearn \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 NaN \n",
+ "\n",
+ " model_name model_type \\\n",
+ "0 RandomForestClassifier:default RandomForestClassifier \n",
+ "1 NaN NaN \n",
+ "\n",
+ " name roc_auc \\\n",
+ "0 artifacts/model/model.pkl:5f6e4aa57cce9e3a0b2f12e5766d19be:3 NaN \n",
+ "1 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.959238 \n",
+ "\n",
+ " type uri \n",
+ "0 Model 5f6e4aa57cce9e3a0b2f12e5766d19be \n",
+ "1 Metrics 878d492e-596c-11ed-99a3-b47af137252e "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "linked = query.get_all_child_artifacts(\"artifacts/data.xml.gz\")\n",
- "#print(linked.sort_values('create_time_since_epoch', ascending=True))\n",
- "print(\"NAME\")\n",
- "print(linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"TYPE\")\n",
- "print(linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI\")\n",
- "print(linked[\"uri\"].to_string(index=False, header=False))"
+ "linked = query.get_all_child_artifacts(artifact_name)\n",
+ "display(linked)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 63,
"id": "1ae1d868",
"metadata": {
"pycharm": {
@@ -387,13 +1446,12 @@
},
"outputs": [],
"source": [
+ "# Provide the execution id corresponding to the Evaluate stage\n",
"linked = query.get_all_artifacts_for_execution(4)\n",
- "print(\"NAME\")\n",
- "print(linked[\"name\"].to_string(index=False, header=False))\n",
- "print(\"TYPE\")\n",
- "print(linked[\"type\"].to_string(index=False, header=False))\n",
- "print(\"URI\")\n",
- "print(linked[\"uri\"].to_string(index=False, header=False))"
+ "for index, row in linked.iterrows():\n",
+ " if row[\"type\"] == \"Metrics\":\n",
+ " break\n",
+ "\n"
]
},
{
@@ -401,44 +1459,95 @@
"id": "ae3109b7",
"metadata": {},
"source": [
- "### Change the metrics name in the get_artifact call with the metrics name from output of the previous cell"
+ "### Get artifact "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 64,
"id": "8bf8a41d",
"metadata": {
"pycharm": {
"is_executing": true
}
},
- "outputs": [],
- "source": [
- "artifacts = query.get_artifact(\"metrics:aaae534e-915d-11ec-b106-89841b9859cd:4\")\n",
- "print(artifacts)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ca26387d",
- "metadata": {
- "pycharm": {
- "name": "#%%\n",
- "is_executing": true
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table rendering of the dataframe; the same content appears in the text/plain output below]"
+ ],
+ "text/plain": [
+ " id type uri \\\n",
+ "0 7 Metrics 878d492e-596c-11ed-99a3-b47af137252e \n",
+ "\n",
+ " name create_time_since_epoch \\\n",
+ "0 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 1667255818388 \n",
+ "\n",
+ " last_update_time_since_epoch \\\n",
+ "0 1667255818388 \n",
+ "\n",
+ " metrics_name avg_prec roc_auc \n",
+ "0 metrics:878d492e-596c-11ed-99a3-b47af137252e:4 0.526754 0.959238 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- },
- "outputs": [],
+ ],
"source": [
- "new_parquet_df = pd.read_parquet(\"./slice-a\")\n",
- "print(new_parquet_df)"
+ "artifact = query.get_artifact(row[\"name\"])\n",
+ "display(artifact)\n"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -452,9 +1561,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/examples/example-get-started/README.md b/examples/example-get-started/README.md
index 71607c01..088fb07c 100644
--- a/examples/example-get-started/README.md
+++ b/examples/example-get-started/README.md
@@ -4,7 +4,7 @@
1. Copy contents of `example-get-started` directory to a separate directory outside this repository.
-2. Create python virtual environment (version >= 3.6), install git, install python dependencies
+2. Create a Python virtual environment (Python >= 3.6 and < 3.9), install Git, and install the Python dependencies
3. Modify the [sample_env](./sample_env) file with appropriate values for the exports.
diff --git a/examples/example-get-started/initialize.sh b/examples/example-get-started/initialize.sh
index 36910017..1435cdaf 100644
--- a/examples/example-get-started/initialize.sh
+++ b/examples/example-get-started/initialize.sh
@@ -8,18 +8,18 @@
echo "[1/5] [GIT/DVC INIT ] executing git init and dvc init."
git init -q
dvc init -q
-git config --global user.name "${GIT_USER_NAME}"
-git config --global user.email "${GIT_USER_EMAIL}"
+git config --global user.name "${GIT_USER_NAME:-first second}"
+git config --global user.email "${GIT_USER_EMAIL:-first.second@corp.com}"
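+# ${VAR:-default} expands to the default value when the variable is unset or empty, so the
+# script still runs even if sample_env has not been sourced.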
echo "[2/5] [INITIAL COMMIT] performing initial blank commit into main."
git checkout -b master
git commit --allow-empty -n -m "Initial code commit"
echo "[3/5] [GIT REMOTE ] setting git remote to ${GIT_REMOTE_URL}"
-git remote add origin "${GIT_REMOTE_URL}"
+git remote add origin "${GIT_REMOTE_URL:-/tmp/gitremote/url}"
echo "[4/5] [DVC REMOTE ] setting dvc remote to ${DVC_REMOTE_URL}"
-dvc remote add myremote "${DVC_REMOTE_URL}"
+dvc remote add myremote -f "${DVC_REMOTE_URL:-/tmp/dvcremote}"
dvc remote default myremote
echo "[5/5] [NEXT STEPS ]"
diff --git a/examples/example-get-started/sample_env b/examples/example-get-started/sample_env
index 789891d6..e1db4518 100644
--- a/examples/example-get-started/sample_env
+++ b/examples/example-get-started/sample_env
@@ -2,6 +2,6 @@ export DVC_REMOTE_URL=/tmp/cmf/example_get_started/dvc_remote
export GIT_USER_NAME="First Second"
export GIT_USER_EMAIL=first.second@corp.org
export GIT_REMOTE_URL=git@github.com:first-second/experiment-repo.git
-export NEO4J_USER_NAME=neo4j
-export NEO4J_PASSWD=neo4j
+export NEO4J_USER_NAME=user
+export NEO4J_PASSWD=XXXXXX
export NEO4J_URI="bolt://IP:PORT"
diff --git a/examples/example-get-started/src/featurize.py b/examples/example-get-started/src/featurize.py
index 31df2833..0cb99973 100644
--- a/examples/example-get-started/src/featurize.py
+++ b/examples/example-get-started/src/featurize.py
@@ -72,8 +72,9 @@ def featurize(input_dir: str, output_dir: str) -> None:
os.makedirs(output_dir, exist_ok=True)
output_ds = Dataset(train=os.path.join(output_dir, "train.pkl"), test=os.path.join(output_dir, "test.pkl"))
-
- metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env")
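+ # Enable graph (Neo4j) lineage logging only when the NEO4J environment variable is set to
+ # "True"/"TRUE"; metadata is still recorded in the mlmd file either way.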
+ graph_env = os.getenv("NEO4J","False")
+ graph = graph_env in ("True", "TRUE")
+ metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env", graph=graph)
_ = metawriter.create_context(pipeline_stage="Featurize")
_ = metawriter.create_execution(execution_type="Featurize-execution", custom_properties=params)
diff --git a/examples/example-get-started/src/parse.py b/examples/example-get-started/src/parse.py
index 2eaea35b..23729d46 100644
--- a/examples/example-get-started/src/parse.py
+++ b/examples/example-get-started/src/parse.py
@@ -59,8 +59,9 @@ def parse(input_file: str, output_dir: str) -> None:
"""
params = yaml.safe_load(open("params.yaml"))["parse"]
random.seed(params["seed"])
-
- metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env")
+ graph_env = os.getenv("NEO4J","False")
+ graph = graph_env in ("True", "TRUE")
+ metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env", graph=graph)
_ = metawriter.create_context(pipeline_stage="Prepare", custom_properties={"user-metadata1": "metadata_value"})
_ = metawriter.create_execution(execution_type="Prepare", custom_properties=params)
_ = metawriter.log_dataset(input_file, "input", custom_properties={"user-metadata1": "metadata_value"})
diff --git a/examples/example-get-started/src/test.py b/examples/example-get-started/src/test.py
index 065e9843..27e54bbc 100644
--- a/examples/example-get-started/src/test.py
+++ b/examples/example-get-started/src/test.py
@@ -46,8 +46,9 @@ def test(model_dir: str, dataset_dir: str, output_dir: str) -> None:
prc=os.path.join(output_dir, 'prc.json'),
roc=os.path.join(output_dir, 'roc.json')
)
-
- metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env")
+ graph_env = os.getenv("NEO4J","False")
+ graph = graph_env in ("True", "TRUE")
+ metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env", graph=graph)
_ = metawriter.create_context(pipeline_stage="Evaluate")
_ = metawriter.create_execution(execution_type="Evaluate-execution")
diff --git a/examples/example-get-started/src/train.py b/examples/example-get-started/src/train.py
index 863b9eb1..b7e1124f 100644
--- a/examples/example-get-started/src/train.py
+++ b/examples/example-get-started/src/train.py
@@ -37,8 +37,9 @@ def train(input_dir: str, output_dir: str) -> None:
Output: ${output_dir}/model.pkl
"""
params = yaml.safe_load(open("params.yaml"))["train"]
-
- metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env")
+ graph_env = os.getenv("NEO4J","False")
+ graph = graph_env in ("True", "TRUE")
+ metawriter = cmf.Cmf(filename="mlmd", pipeline_name="Test-env", graph=graph)
_ = metawriter.create_context(pipeline_stage="Train")
_ = metawriter.create_execution(execution_type="Train-execution", custom_properties=params)
diff --git a/examples/example-get-started/test-data-slice.py b/examples/example-get-started/test-data-slice.py
index 0442851f..c4f95c08 100644
--- a/examples/example-get-started/test-data-slice.py
+++ b/examples/example-get-started/test-data-slice.py
@@ -59,11 +59,11 @@ def generate_dataset():
# Creating the data slice - today we have only path and hash.
# Would need to expand to take in more metadata.
for i in range(1, 3, 1):
- dataslice: cmf.Cmf.dataslice = metawriter.create_dataslice(name="slice-" + str(i))
+ dataslice: cmf.Cmf.DataSlice = metawriter.create_dataslice(name="slice-" + str(i))
for _ in range(1, 20, 1):
j = random.randrange(1, 100)
print(folder_path + "/" + str(j) + ".txt")
- dataslice.add_data(path=folder_path + "/" + str(j) + ".txt", custom_props={"key1": "value1", "key2": "value2"})
+ dataslice.add_data(path=folder_path + "/" + str(j) + ".txt", custom_properties={"key1": "value1", "key2": "value2"})
dataslice.commit()
# Reading the files in the slice.
@@ -73,13 +73,13 @@ def generate_dataset():
for label, content in df.iterrows():
record = label
row_content = content
-
+print("Updating the value from `value1` to `1` and from `value2` to `2`")
print("Before update")
print(record)
print(row_content)
# Update the metadata for a record in the slice.
-metawriter.update_dataslice(name="slice-1", record=record, custom_props={"key1": "1", "key2": "2"})
+metawriter.update_dataslice(name="slice-1", record=record, custom_properties={"key1": "1", "key2": "2"})
df = metawriter.read_dataslice(name="slice-1")
print("After update")
diff --git a/mkdocs.yaml b/mkdocs.yaml
new file mode 100644
index 00000000..041b2aeb
--- /dev/null
+++ b/mkdocs.yaml
@@ -0,0 +1,88 @@
+###
+# Copyright (2022) Hewlett Packard Enterprise Development LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###
+
+
+site_name: CMF
+site_description: CMF - Common Metadata Framework
+site_author: AI Research & Development Lab (Hewlett-Packard Labs)
+
+
+repo_name: HewlettPackard/cmf
+repo_url: https://github.com/HewlettPackard/cmf
+edit_uri: ""
+docs_dir: docs/
+
+
+theme:
+ name: material
+ language: en
+ features:
+ - navigation.sections # Sections are included in the navigation on the left.
+ - toc.integrate # Table of contents is integrated on the left; does not appear separately on the right.
+ - header.autohide # header disappears as you scroll
+ palette:
+ primary: 'blue'
+ accent: 'blue'
+ font:
+ text: 'Circular'
+ code: 'Inconsolata'
+
+
+markdown_extensions:
+ - admonition
+ - pymdownx.arithmatex: # Render LaTeX via MathJax
+ generic: true
+ - pymdownx.tasklist
+ - pymdownx.details # Allowing hidden expandable regions denoted by ???
+ - pymdownx.superfences # Seems to enable syntax highlighting when used with the Material theme.
+ - pymdownx.tabbed:
+ alternate_style: true
+ - toc:
+ permalink: True
+ - attr_list
+ - md_in_html # Parse markdown inside HTML tags (default is not to parse).
+
+extra_css:
+ - extra.css # This defines custom properties, in particular, enforces custom HTML tables to be 100% width.
+
+
+plugins:
+ - autorefs # Need to reference mkdocstrings-generated content from documentation files ([][] thing).
+ - search
+ - mkdocstrings: # Automatically generate API documentation.
+ default_handler: python
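+ # Markdown pages under docs/ (e.g. docs/api/public/cmf.md) can then pull in API docs
+ # with a mkdocstrings directive such as "::: cmflib.cmf.Cmf".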
+
+extra:
+ social:
+ # Buttons at the bottom of every page.
+ - icon: fontawesome/brands/github
+ link: https://github.com/HewlettPackard/cmf
+ - icon: fontawesome/brands/slack
+ link: https://commonmetadata.slack.com/
+
+
+# This defines the structure of the CMF documentation portal (all must be in `docs/` directory).
+nav:
+ - index.md
+ - Examples:
+ - Getting Started: examples/getting_started.md
+ - Architecture:
+ - Overview: architecture/overview.md
+ - Components: architecture/components.md
+ - Advantages: architecture/advantages.md
+ - Public API:
+ - CMF: api/public/cmf.md
+ - DataSlice: api/public/dataslice.md
diff --git a/presentations/LFAI_v2.pdf b/presentations/LFAI_v2.pdf
new file mode 100644
index 00000000..65c61fcb
Binary files /dev/null and b/presentations/LFAI_v2.pdf differ
diff --git a/presentations/LFAI_v2.pptx b/presentations/LFAI_v2.pptx
new file mode 100644
index 00000000..0337734f
Binary files /dev/null and b/presentations/LFAI_v2.pptx differ
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..d509b501
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "cmflib"
+version = "0.0.2"
+authors = [
+ { name="Hewlett Packard Enterprise"},
+]
+description = "Track metadata for AI pipeline"
+readme = "README.md"
+requires-python = ">=3.7,<3.10"
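+# The upper bound presumably tracks the Python versions supported by the pinned ml-metadata release.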
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "Operating System :: POSIX :: Linux",
+]
+
+[project.urls]
+"Homepage" = "https://github.com/HewlettPackard/cmf"
+"Bug Tracker" = "https://github.com/HewlettPackard/cmf/issues"
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..76b73e13
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+[metadata]
+description-file=README.md
+license_files=LICENSE
diff --git a/setup.py b/setup.py
index e5d6071d..0116090a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
from setuptools import setup, find_packages
-VERSION = '0.0.1'
+VERSION = '0.0.2'
DESCRIPTION = 'Metadata Python Package'
LONG_DESCRIPTION = 'Metadata framework storing AI metadata into MLMD'
@@ -9,13 +9,13 @@
# the name must match the folder name 'verysimplemodule'
name="cmflib",
version=VERSION,
- author="Annmary Justine",
- author_email="annmary.roy@hpe.com",
+ author="Hewlett Packard Enterprise",
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
packages=find_packages(),
- install_requires=["ml-metadata==1.3.0",
- "dvc","pandas","retrying", "pyarrow", "neo4j", "scikit-learn", "tabulate", "click"], # add any additional packages that
+ install_requires=["ml-metadata==1.11.0",
+ "dvc", "pandas", "retrying", "pyarrow", "neo4j", \
+ "scikit-learn", "tabulate", "click"], # add any additional packages that
# needs to be installed along with your package. Eg: 'caer'
keywords=['python', 'first package'],
@@ -23,6 +23,6 @@
"Development Status :: 3 - Alpha",
"Intended Audience :: Education",
"Programming Language :: Python :: 3",
- "Operating System :: Linux :: Ubuntu",
+ "Operating System :: POSIX :: Linux",
]
)