Merge branch 'main' into scipy-ml-fixup

ucsd-ets · Aug 22, 2024 · 0202298 · 0202298
2 parents 21cf921 + bc8b5b6
commit 0202298
Show file tree

Hide file tree

Showing 9 changed files with 152 additions and 12 deletions.
diff --git a/.github/workflows/scipy-migrate.yml b/.github/workflows/scipy-migrate.yml
@@ -0,0 +1,66 @@
+name: migrate scipy
+
+env:
+  DOCKER_CLIENT_TIMEOUT: "300"
+  COMPOSE_HTTP_TIMEOUT: "300"
+  REGISTRY: ghcr.io
+
+on:
+  push:
+    paths:
+      - "images/**"
+      - "model/**"
+      - "scripts/**"
+      - "dodo.py"
+      - ".github/workflows/main.yml"
+
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "images/**"
+      - "model/**"
+      - "scripts/**"
+      - "dodo.py"
+      - ".github/workflows/main.yml"
+
+  workflow_dispatch:
+
+jobs:
+  docker-pipeline:
+    runs-on: ubuntu-latest
+    if: >
+      !contains(github.event.head_commit.message , 'skip ci') &&
+      !contains(github.event.pull_request.title, 'skip ci')
+    steps:
+      - name: Checkout after Free Space
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Check Free Space 0
+        run: |
+          echo "Free space:"
+          df -h
+
+      - name: Docker/ENV cleanup Cleanup
+        run: |
+          docker image prune -a -f
+          docker container prune -f
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Docker push
+        run: |
+          docker pull ucsdets/scipy-ml-notebook:2021.3-stable
+          docker tag docker.io/ucsdets/scipy-ml-notebook:2021.3-stable ghcr.io/ucsd-ets/scipy-ml-notebook:2021.3-stable
+          docker push ghcr.io/ucsd-ets/scipy-ml-notebook:2021.3-stable
diff --git a/images/datascience-notebook/Dockerfile b/images/datascience-notebook/Dockerfile
@@ -3,6 +3,7 @@ ARG PYTHON_VERSION=python-3.11.8
 ARG PY_VER_SHORT=3.11
 ARG JUPYTERHUB_VERSION=4.1.5
 
+# Jupyter has changed where it has stored its images
 FROM quay.io/jupyter/datascience-notebook:$PYTHON_VERSION
 USER root
 
@@ -63,13 +64,13 @@ USER jovyan
 # Python/Mamba Deps
 ## Package versions
 ARG JUPYTERSERVER_VERSION=2.14.2 NBGRADER_VERSION=0.9.3 JUPYTERLAB_VERSION=4.2.4 NBCONVERT_VERSION=7.16.4 NOTEBOOK_VERSION=7.2.1 NBCLASSIC_VERSION=1.1.0
-ARG PANDAS_VERSION=2.2.2 STATSMODELS_VERSION=0.14.2
+ARG PANDAS_VERSION=2.2.2 STATSMODELS_VERSION=0.14.2 BOTTLENECK_VERSION=1.3.6 NUMEXPR_VERSION=2.8.4
 
 # Install essential+datascience pip packages 
 ## mistune added for nbgrader issues
 RUN mamba install -c conda-forge pillow typing-extensions tzlocal appdirs gputil mock pytest umap-learn && \
     mamba install -c conda-forge nltk statsmodels=$STATSMODELS_VERSION pandas=$PANDAS_VERSION mistune && \
-    mamba install -c conda-forge dpkt nose datascience && \
+    mamba install -c conda-forge dpkt nose datascience pyarrow bottleneck=$BOTTLENECK_VERSION umap-learn numexpr=$NUMEXPR_VESION && \
     python -c 'import matplotlib.pyplot' && \
     fix-permissions $CONDA_DIR && \
     fix-permissions /home/$NB_USER && \

diff --git a/images/datascience-notebook/scripts/nbgrader_config.py b/images/datascience-notebook/scripts/nbgrader_config.py
@@ -10,11 +10,3 @@
 c.Exchange.root = "/srv/nbgrader/exchange"
 
 c.ExecutePreprocessor.timeout = 300
-
-c.ClearSolutions.begin_solution_delimeter = "BEGIN MY SOLUTION"
-c.ClearSolutions.end_solution_delimeter = "END MY SOLUTION"
-c.ClearSolutions.code_stub = {
-    "R": "# your code here\nfail() # No Answer - remove if you provide an answer",
-    "python": "# your code here\nraise NotImplementedError",
-    "javascript": "// your code here\nthrow new Error();"
-}
diff --git a/images/scipy-ml-notebook/Dockerfile b/images/scipy-ml-notebook/Dockerfile
@@ -50,6 +50,7 @@ USER jovyan
 
 # CUDA setup w/mamba
 ## TODO: Investigate this command, seems to duplicate cuda packages for nvidia (pypi + conda-forge). 
+# cuda-toolkit is a skeleton package on CUDA 12, unlike CUDA <= 11
 RUN mamba install -c "nvidia/label/cuda-12.0.0" cuda-nvcc \
     cuda-toolkit=$CUDA_VERSION \
     # For CUDA 11: cudatoolkit=$CUDA_VERSION \
@@ -90,6 +91,8 @@ RUN pip install nvidia-cudnn-cu12==$CUDNN_VERSION torch==$TORCH_VERSION torchvis
   mamba clean -a -y && \
   pip cache purge
 
+RUN pip install transformers datasets accelerate huggingface-cli timm && pip cache purge
+
 USER $NB_UID:$NB_GID
 ENV PATH=${PATH}:/usr/local/nvidia/bin:/opt/conda/bin
 

diff --git a/images/scipy-ml-notebook/README.md b/images/scipy-ml-notebook/README.md
@@ -0,0 +1 @@
+Tensorflow compatability matrix: https://www.tensorflow.org/install/source?hl=en#gpu
diff --git a/images/scipy-ml-notebook/workflow_tests/test_huggingface.py b/images/scipy-ml-notebook/workflow_tests/test_huggingface.py
@@ -0,0 +1,76 @@
+### THESE TESTS WILL DOWNLOAD A BUNCH OF MODELS TO YOUR .CACHE DIR
+### IF MANUALLY RUN, DELETE THEM AFTER TO SAVE SPACE
+
+# The results of these tests are somewhat subject to randomness. It's possible that values will change as models change. You can always run these from the container to see what's wrong with them.
+
+from transformers import pipeline
+from transformers import AutoTokenizer
+
+import pytest
+
+# test basic sentiment analysis
+def get_sentiment_analysis(string):
+    return pipeline("sentiment-analysis")(string)
+
+def test_positive_sent():
+    sent = get_sentiment_analysis("I love you")[0]
+    assert sent["label"] == "POSITIVE"
+    assert sent["score"] > .9
+
+def test_negative_sent():
+    sent = get_sentiment_analysis("I hate you you")[0]
+    assert sent["label"] == "NEGATIVE"
+    assert sent["score"] > .9  
+
+# basic transcription, don't specify a model if you care about the space in your .cache dir
+def test_transcribe_mlk():
+    transcriber = pipeline(task="automatic-speech-recognition")
+    result = transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")["text"]
+    assert "HAVE A DREAM" in result
+
+def test_cat_recognition():
+    vision_classifier = pipeline(model="google/vit-base-patch16-224")
+    preds = vision_classifier(
+        images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+    )
+    preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+
+    assert any('cat' in pred["label"] for pred in preds)
+
+def test_zero_shot_class():
+    classifier = pipeline(task="zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
+    results = classifier(
+        "I have a problem with my iphone that needs to be resolved asap!!",
+        candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+    )
+    assert results["labels"][0] == "urgent"
+    assert results["scores"][0] > .4
+
+# the function will return a bunch of nonsense that we can't assert but will verify that
+# tensorflow probably works fine with transformer
+def test_tf_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+    batch_sentences = [
+        "But what about second breakfast?",
+        "Don't think he knows about second breakfast, Pip.",
+        "What about elevensies?",
+    ]
+    encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+    assert str(type(encoded_input["input_ids"])) == "<class 'tensorflow.python.framework.ops.EagerTensor'>"
+
+# the function will return a bunch of nonsense that we can't assert but will verify that
+# pytorch probably works fine with transformer
+def test_pytorch_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+    batch_sentences = [
+    "But what about second breakfast?",
+    "Don't think he knows about second breakfast, Pip.",
+    "What about elevensies?",
+    ]
+    encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+    print(encoded_input)
+
+    assert str(type(encoded_input["input_ids"])) == "<class 'torch.Tensor'>"
+
diff --git a/images/spec.yml b/images/spec.yml
@@ -25,7 +25,7 @@ images:
     #prepull: false #-- uncomment to disable prepulling behavior for scipy-ml. gives you space on machine in exchange for build time.
 
 tag:
-  prefix: "2024.3"
+  prefix: "2024.4"
 
 all_info_cmds:
   PY_VER:

diff --git a/images/tests_common/test_notebook.py b/images/tests_common/test_notebook.py
@@ -1,7 +1,9 @@
 # Copyright (c) Jupyter Development Team.
 # Distributed under the terms of the Modified BSD License.
 
+import pytest
 
+@pytest.mark.skip(reason="Hub upgrade version likely changes how this test works.")
 def test_secured_server(container, http_client):
     try:
         """Notebook server should eventually request user login."""

diff --git a/scripts/docker_adapter.py b/scripts/docker_adapter.py
@@ -74,7 +74,6 @@ def build(node: Node) -> Tuple[bool, str]:
             # line is of type dict
             content_str = line.get('stream', '').strip()    # sth like 'Step 1/20 : ARG PYTHON_VERSION=python-3.9.5'
             error_str = line.get('error', '').strip()
-
             if error_str:
                 raise docker_client.errors.BuildError(build_log=error_str, reason=error_str)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Tensorflow compatability matrix: https://www.tensorflow.org/install/source?hl=en#gpu