Skip to content

Commit

Permalink
Merge branch 'main' into scipy-ml-fixup
Browse files Browse the repository at this point in the history
  • Loading branch information
dafeliton authored Aug 22, 2024
2 parents 21cf921 + bc8b5b6 commit 0202298
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 12 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/scipy-migrate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
name: migrate scipy

env:
DOCKER_CLIENT_TIMEOUT: "300"
COMPOSE_HTTP_TIMEOUT: "300"
REGISTRY: ghcr.io

on:
push:
paths:
- "images/**"
- "model/**"
- "scripts/**"
- "dodo.py"
- ".github/workflows/main.yml"

pull_request:
branches: [ main ]
paths:
- "images/**"
- "model/**"
- "scripts/**"
- "dodo.py"
- ".github/workflows/main.yml"

workflow_dispatch:

jobs:
docker-pipeline:
runs-on: ubuntu-latest
if: >
!contains(github.event.head_commit.message , 'skip ci') &&
!contains(github.event.pull_request.title, 'skip ci')
steps:
- name: Checkout after Free Space
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Check Free Space 0
run: |
echo "Free space:"
df -h
- name: Docker/ENV cleanup Cleanup
run: |
docker image prune -a -f
docker container prune -f
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Docker push
run: |
docker pull ucsdets/scipy-ml-notebook:2021.3-stable
docker tag docker.io/ucsdets/scipy-ml-notebook:2021.3-stable ghcr.io/ucsd-ets/scipy-ml-notebook:2021.3-stable
docker push ghcr.io/ucsd-ets/scipy-ml-notebook:2021.3-stable
5 changes: 3 additions & 2 deletions images/datascience-notebook/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ARG PYTHON_VERSION=python-3.11.8
ARG PY_VER_SHORT=3.11
ARG JUPYTERHUB_VERSION=4.1.5

# Jupyter has changed where it has stored its images
FROM quay.io/jupyter/datascience-notebook:$PYTHON_VERSION
USER root

Expand Down Expand Up @@ -63,13 +64,13 @@ USER jovyan
# Python/Mamba Deps
## Package versions
ARG JUPYTERSERVER_VERSION=2.14.2 NBGRADER_VERSION=0.9.3 JUPYTERLAB_VERSION=4.2.4 NBCONVERT_VERSION=7.16.4 NOTEBOOK_VERSION=7.2.1 NBCLASSIC_VERSION=1.1.0
ARG PANDAS_VERSION=2.2.2 STATSMODELS_VERSION=0.14.2
ARG PANDAS_VERSION=2.2.2 STATSMODELS_VERSION=0.14.2 BOTTLENECK_VERSION=1.3.6 NUMEXPR_VERSION=2.8.4

# Install essential+datascience pip packages
## mistune added for nbgrader issues
RUN mamba install -c conda-forge pillow typing-extensions tzlocal appdirs gputil mock pytest umap-learn && \
mamba install -c conda-forge nltk statsmodels=$STATSMODELS_VERSION pandas=$PANDAS_VERSION mistune && \
mamba install -c conda-forge dpkt nose datascience && \
mamba install -c conda-forge dpkt nose datascience pyarrow bottleneck=$BOTTLENECK_VERSION umap-learn numexpr=$NUMEXPR_VESION && \
python -c 'import matplotlib.pyplot' && \
fix-permissions $CONDA_DIR && \
fix-permissions /home/$NB_USER && \
Expand Down
8 changes: 0 additions & 8 deletions images/datascience-notebook/scripts/nbgrader_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,3 @@
c.Exchange.root = "/srv/nbgrader/exchange"

c.ExecutePreprocessor.timeout = 300

c.ClearSolutions.begin_solution_delimeter = "BEGIN MY SOLUTION"
c.ClearSolutions.end_solution_delimeter = "END MY SOLUTION"
c.ClearSolutions.code_stub = {
"R": "# your code here\nfail() # No Answer - remove if you provide an answer",
"python": "# your code here\nraise NotImplementedError",
"javascript": "// your code here\nthrow new Error();"
}
3 changes: 3 additions & 0 deletions images/scipy-ml-notebook/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ USER jovyan

# CUDA setup w/mamba
## TODO: Investigate this command, seems to duplicate cuda packages for nvidia (pypi + conda-forge).
# cuda-toolkit is a skeleton package on CUDA 12, unlike CUDA <= 11
RUN mamba install -c "nvidia/label/cuda-12.0.0" cuda-nvcc \
cuda-toolkit=$CUDA_VERSION \
# For CUDA 11: cudatoolkit=$CUDA_VERSION \
Expand Down Expand Up @@ -90,6 +91,8 @@ RUN pip install nvidia-cudnn-cu12==$CUDNN_VERSION torch==$TORCH_VERSION torchvis
mamba clean -a -y && \
pip cache purge

RUN pip install transformers datasets accelerate huggingface-cli timm && pip cache purge

USER $NB_UID:$NB_GID
ENV PATH=${PATH}:/usr/local/nvidia/bin:/opt/conda/bin

Expand Down
1 change: 1 addition & 0 deletions images/scipy-ml-notebook/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Tensorflow compatability matrix: https://www.tensorflow.org/install/source?hl=en#gpu
76 changes: 76 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
### THESE TESTS WILL DOWNLOAD A BUNCH OF MODELS TO YOUR .CACHE DIR
### IF MANUALLY RUN, DELETE THEM AFTER TO SAVE SPACE

# The results of these tests are somewhat subject to randomness. It's possible that values will change as models change. You can always run these from the container to see what's wrong with them.

from transformers import pipeline
from transformers import AutoTokenizer

import pytest

# test basic sentiment analysis
def get_sentiment_analysis(string):
return pipeline("sentiment-analysis")(string)

def test_positive_sent():
sent = get_sentiment_analysis("I love you")[0]
assert sent["label"] == "POSITIVE"
assert sent["score"] > .9

def test_negative_sent():
sent = get_sentiment_analysis("I hate you you")[0]
assert sent["label"] == "NEGATIVE"
assert sent["score"] > .9

# basic transcription, don't specify a model if you care about the space in your .cache dir
def test_transcribe_mlk():
transcriber = pipeline(task="automatic-speech-recognition")
result = transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")["text"]
assert "HAVE A DREAM" in result

def test_cat_recognition():
vision_classifier = pipeline(model="google/vit-base-patch16-224")
preds = vision_classifier(
images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
)
preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]

assert any('cat' in pred["label"] for pred in preds)

def test_zero_shot_class():
classifier = pipeline(task="zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
results = classifier(
"I have a problem with my iphone that needs to be resolved asap!!",
candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)
assert results["labels"][0] == "urgent"
assert results["scores"][0] > .4

# the function will return a bunch of nonsense that we can't assert but will verify that
# tensorflow probably works fine with transformer
def test_tf_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

batch_sentences = [
"But what about second breakfast?",
"Don't think he knows about second breakfast, Pip.",
"What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
assert str(type(encoded_input["input_ids"])) == "<class 'tensorflow.python.framework.ops.EagerTensor'>"

# the function will return a bunch of nonsense that we can't assert but will verify that
# pytorch probably works fine with transformer
def test_pytorch_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

batch_sentences = [
"But what about second breakfast?",
"Don't think he knows about second breakfast, Pip.",
"What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

assert str(type(encoded_input["input_ids"])) == "<class 'torch.Tensor'>"

2 changes: 1 addition & 1 deletion images/spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ images:
#prepull: false #-- uncomment to disable prepulling behavior for scipy-ml. gives you space on machine in exchange for build time.

tag:
prefix: "2024.3"
prefix: "2024.4"

all_info_cmds:
PY_VER:
Expand Down
2 changes: 2 additions & 0 deletions images/tests_common/test_notebook.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import pytest

@pytest.mark.skip(reason="Hub upgrade version likely changes how this test works.")
def test_secured_server(container, http_client):
try:
"""Notebook server should eventually request user login."""
Expand Down
1 change: 0 additions & 1 deletion scripts/docker_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def build(node: Node) -> Tuple[bool, str]:
# line is of type dict
content_str = line.get('stream', '').strip() # sth like 'Step 1/20 : ARG PYTHON_VERSION=python-3.9.5'
error_str = line.get('error', '').strip()

if error_str:
raise docker_client.errors.BuildError(build_log=error_str, reason=error_str)

Expand Down

0 comments on commit 0202298

Please sign in to comment.