Skip to content

Commit

Permalink
Merge pull request #86 from ucsd-ets/mamba-scipy-ml-fixup
Browse files Browse the repository at this point in the history
Add more workflow tests
  • Loading branch information
dafeliton authored Mar 28, 2024
2 parents 73e599c + abc9760 commit 430f1a3
Show file tree
Hide file tree
Showing 6 changed files with 301 additions and 0 deletions.
32 changes: 32 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_keras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
import pytest

@pytest.fixture
def simple_model():
    """Provide a small, compiled binary classifier for the Keras smoke tests.

    The network takes 5 input features, feeds them through a 10-unit ReLU
    layer and ends in a single sigmoid unit.  It is compiled with the Adam
    optimizer, binary cross-entropy loss and an accuracy metric.
    """
    network = Sequential([
        Dense(units=10, activation='relu', input_shape=(5,)),
        Dense(units=1, activation='sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

def test_model_training(simple_model):
    """Fit the model for one epoch on random data, then check the layer shapes."""
    n_samples, n_features = 100, 5
    features = np.random.random((n_samples, n_features))
    labels = np.random.randint(2, size=(n_samples, 1))

    simple_model.fit(features, labels, epochs=1, batch_size=32, verbose=0)

    first_layer = simple_model.layers[0]
    last_layer = simple_model.layers[1]
    # The leading None is the (unspecified) batch dimension.
    assert first_layer.input_shape == (None, 5)
    assert last_layer.output_shape == (None, 1)

def test_model_evaluation(simple_model):
    """Evaluate the untrained model on random data and sanity-check the metrics."""
    n_samples = 20
    features = np.random.random((n_samples, 5))
    labels = np.random.randint(2, size=(n_samples, 1))

    # evaluate() yields the loss followed by the compiled metrics (accuracy).
    metrics = simple_model.evaluate(features, labels, verbose=0)
    loss, accuracy = metrics

    assert loss >= 0
    assert 0 <= accuracy <= 1

def test_model_prediction(simple_model):
    """Predict on a single random sample and check the output shape and range.

    The model ends in a sigmoid unit, so the single predicted value must lie
    in the closed interval [0, 1].
    """
    x_new = np.random.random((1, 5))
    # verbose=0 keeps the progress bar out of the test log, as the fit() and
    # evaluate() calls in this module already do.
    prediction = simple_model.predict(x_new, verbose=0)
    assert prediction.shape == (1, 1)
    # Compare the scalar element: a chained comparison on the whole ndarray
    # only works through the implicit truth value of a size-1 array.
    assert 0 <= prediction[0, 0] <= 1
29 changes: 29 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_matplotlib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import matplotlib.pyplot as plt
import numpy as np

def create_simple_plot(x, y, title="Test Plot"):
    """Draw ``y`` against ``x`` as a line on a fresh single-axes figure.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Title placed on the axes.

    Returns:
        The ``(figure, axes)`` pair created by ``plt.subplots()``.
    """
    figure, axes = plt.subplots()
    axes.plot(x, y)
    axes.set_title(title)
    return figure, axes

def test_number_of_plots_created():
    """The helper must produce a figure that holds exactly one axes."""
    xs = np.arange(0, 10, 1)
    figure, _axes = create_simple_plot(xs, xs ** 2)
    axes_count = len(figure.axes)
    assert axes_count == 1, "There should be exactly one plot created"

def test_plot_title_is_correct():
    """The title passed to the helper must be the one set on the axes."""
    title = "Test Plot"
    xs = np.arange(0, 10, 1)
    ys = xs ** 2
    axes = create_simple_plot(xs, ys, title=title)[1]
    actual_title = axes.get_title()
    assert actual_title == title, f"The title should be '{title}'"

def test_data_matches_input():
    """The plotted line must carry exactly the x/y data handed to the helper."""
    xs = np.arange(0, 10, 1)
    ys = xs ** 2
    _figure, axes = create_simple_plot(xs, ys)

    # Only one line is drawn by the helper, so the first entry is the one we want.
    plotted = axes.lines[0]
    checks = (
        (plotted.get_xdata(), xs, "X data does not match input"),
        (plotted.get_ydata(), ys, "Y data does not match input"),
    )
    for actual, expected, message in checks:
        np.testing.assert_array_equal(actual, expected, message)
80 changes: 80 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_nltk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import nltk
import pytest

def setup_module(module):
    """Fetch the NLTK resources this module's tests rely on (pytest module hook).

    Resources are downloaded into a scratch directory, which is also added
    to NLTK's data search path so the loaders can find them.

    Args:
        module: The test module object passed in by pytest (unused).
    """
    download_dir = '/tmp/nltk_data'

    # Register the directory once, even if the hook runs more than once in
    # the same interpreter.
    if download_dir not in nltk.data.path:
        nltk.data.path.append(download_dir)

    resources = (
        'punkt',                       # sent_tokenize / word_tokenize
        'averaged_perceptron_tagger',  # pos_tag, used by the NER test
        'maxent_ne_chunker',           # ne_chunk
        'words',                       # word list used by the NE chunker
    )
    # NOTE(review): these are the resource names used by NLTK 3.8.x; newer
    # releases may look for differently named packages - confirm against the
    # NLTK version pinned in the image.
    for resource in resources:
        nltk.download(resource, download_dir=download_dir)

def test_tokenization():
    """Check NLTK's sentence splitter and word tokenizer on fixed inputs."""
    # Sentence tokenization: two sentences in, two sentences out.
    paragraph = "This is a sample sentence. It consists of two sentences."
    expected_sentences = [
        "This is a sample sentence.",
        "It consists of two sentences.",
    ]
    sentences = nltk.sent_tokenize(paragraph)
    assert len(sentences) == 2
    for position, expected in enumerate(expected_sentences):
        assert sentences[position] == expected

    # Word tokenization: the trailing period becomes its own token.
    pangram = "The quick brown fox jumps over the lazy dog."
    expected_words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "."]
    words = nltk.word_tokenize(pangram)
    assert len(words) == 10
    assert words == expected_words

def test_stemming():
    """Check the Porter and Lancaster stemmers against known word/stem pairs."""
    # Porter stemmer: word -> expected stem (dicts keep insertion order).
    porter_cases = {
        "running": "run",
        "runs": "run",
        "ran": "ran",
        "runner": "runner",
    }
    porter = nltk.PorterStemmer()
    porter_stems = [porter.stem(word) for word in porter_cases]
    assert porter_stems == list(porter_cases.values())

    # Lancaster stemmer: word -> expected stem.
    lancaster_cases = {
        "happiness": "happy",
        "happier": "happy",
        "happiest": "happiest",
        "happily": "happy",
    }
    lancaster = nltk.LancasterStemmer()
    lancaster_stems = [lancaster.stem(word) for word in lancaster_cases]
    assert lancaster_stems == list(lancaster_cases.values())

def test_named_entity_recognition():
    """Check that NLTK's NE chunker finds a PERSON and a GPE in a known sentence.

    The sentence is tokenized, POS-tagged and chunked.  Consecutive PERSON
    chunks are joined into one name, so "Barack Obama" is recognised whether
    the chunker emits it as one two-word chunk or as two adjacent one-word
    chunks.
    """
    sentence = "Barack Obama was the 44th President of the United States."
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)
    ne_chunks = nltk.ne_chunk(tags)

    person_names = []   # one entry per run of consecutive PERSON chunks
    person_buffer = []  # text of the PERSON chunks in the current run
    gpe_names = []      # text of every GPE chunk

    def flush_person_buffer():
        # Close the current run of PERSON chunks, if there is one.
        if person_buffer:
            person_names.append(" ".join(person_buffer))
            person_buffer.clear()

    for node in ne_chunks:
        # Named entities are subtrees; every other token is a plain
        # (word, tag) tuple.
        label = node.label() if isinstance(node, nltk.tree.Tree) else None
        if label == "PERSON":
            person_buffer.append(" ".join(leaf[0] for leaf in node))
            continue

        # Anything that is not a PERSON chunk ends the current run.
        flush_person_buffer()
        if label == "GPE":
            gpe_names.append(" ".join(leaf[0] for leaf in node))

    # The sentence could end while a PERSON run is still open.
    flush_person_buffer()

    # Assert the named entities were found
    assert "Barack Obama" in person_names, "Barack Obama as PERSON not found"
    assert "United States" in gpe_names, "United States as GPE not found"
61 changes: 61 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
import numpy as np
import pytest

def test_dataframe_creation():
    """Build a DataFrame from a dict of columns and check shape, columns and dtypes."""
    data = {
        'name': ['Alice', 'Bob', 'Charlie'],
        'age': [25, 30, 35],
        'city': ['New York', 'London', 'Paris'],
    }
    df = pd.DataFrame(data)

    assert df.shape == (3, 3)

    # Column order follows the insertion order of the dict.
    assert list(df.columns) == ['name', 'age', 'city']

    # Check the *kind* of each dtype rather than an exact dtype object:
    # the integer width behind ``int`` is platform dependent, and text
    # columns may be inferred as ``object`` or as a dedicated string dtype
    # depending on the pandas version/configuration.
    assert pd.api.types.is_string_dtype(df['name'])
    assert pd.api.types.is_integer_dtype(df['age'])
    assert pd.api.types.is_string_dtype(df['city'])

def test_dataframe_indexing():
    """Check label-based column access and position-based row access."""
    columns = {
        'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9],
    }
    frame = pd.DataFrame(columns)

    # Selecting a column by label returns that column's values in order.
    for label, expected_values in columns.items():
        assert frame[label].tolist() == expected_values

    # Selecting a row by position returns one value per column.
    expected_rows = [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
    for position, expected_row in enumerate(expected_rows):
        assert frame.iloc[position].tolist() == expected_row

def test_dataframe_merge():
    """Merge two frames on a shared key and check that only common keys survive."""
    left = pd.DataFrame({
        'key': ['A', 'B', 'C', 'D'],
        'value1': [1, 2, 3, 4],
    })
    right = pd.DataFrame({
        'key': ['B', 'D', 'E', 'F'],
        'value2': [5, 6, 7, 8],
    })

    joined = pd.merge(left, right, on='key')

    # Only the keys present in both frames ('B' and 'D') remain.
    assert joined.shape == (2, 3)

    expected_columns = {
        'key': ['B', 'D'],
        'value1': [2, 4],
        'value2': [5, 6],
    }
    for column, expected_values in expected_columns.items():
        assert joined[column].tolist() == expected_values

def test_dataframe_groupby():
    """Group by a category column and check the per-group sums."""
    frame = pd.DataFrame({
        'category': ['A', 'B', 'A', 'B', 'A'],
        'value': [1, 2, 3, 4, 5],
    })

    totals = frame.groupby('category').sum()

    # Two categories, one summed column.
    assert totals.shape == (2, 1)

    for category, expected_total in (('A', 9), ('B', 6)):
        assert totals.loc[category, 'value'] == expected_total
46 changes: 46 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_pytorch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import io, urllib
import torch
import torchaudio


def can_access_cuda():
Expand Down Expand Up @@ -189,6 +191,42 @@ def multiply_dataset_calculate_mean_cuda():
return ("Test passed!")
else:
raise Exception("Test failed...output was " + str(exactFloat))

def load_dummy_audio_file():
    """Download a sample WAV file and load it with torchaudio from memory.

    Returns:
        The string "Test passed!" when the decoded waveform has shape
        (2, 276858) and a sample rate of 44100.

    Raises:
        Exception: If the waveform shape or sample rate differs from the
            expected values.  Network errors raised by ``urlopen``
            propagate unchanged.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Download an example audio file
    url = "https://pytorch.org/tutorials/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav"

    # Load the audio file into memory; the timeout makes a stalled
    # connection fail the test instead of hanging the whole run.
    with urllib.request.urlopen(url, timeout=60) as response:
        data = response.read()

    # Create a BytesIO object from the audio data
    audio_file = io.BytesIO(data)

    # Load the audio file from memory
    waveform, sample_rate = torchaudio.load(audio_file)

    # Check the shape and sample rate
    if waveform.shape == (2, 276858) and sample_rate == 44100:
        return "Test passed!"
    raise Exception(f"Test failed...waveform shape: {waveform.shape}, sample rate: {sample_rate}")

def resample_dummy_audio_sample():
    """Resample a random one-channel signal from 16 kHz to 8 kHz with torchaudio.

    Returns:
        The string "Test passed!" when the resampled signal has shape (1, 8000).

    Raises:
        Exception: If the resampled signal has any other shape.
    """
    source_rate = 16000
    target_rate = 8000

    # 16000 random samples on a single channel.
    signal = torch.rand(1, 16000)

    downsample = torchaudio.transforms.Resample(source_rate, target_rate)
    resampled_waveform = downsample(signal)

    # Guard clause: anything but the expected shape is a failure.
    if not resampled_waveform.shape == (1, 8000):
        raise Exception(f"Test failed...resampled waveform shape: {resampled_waveform.shape}")
    return "Test passed!"


def test_can_access_cuda():
Expand Down Expand Up @@ -224,3 +262,11 @@ def test_mean_pixel_value_cuda():
def test_multiply_dataset_calculate_mean_cuda():
    """The helper reports success by returning the string "Test passed!"."""
    assert multiply_dataset_calculate_mean_cuda() == "Test passed!"

def test_load_dummy_audio_file():
    """The helper reports success by returning the string "Test passed!"."""
    assert load_dummy_audio_file() == "Test passed!"

def test_resample_dummy_audio_sample():
    """The helper reports success by returning the string "Test passed!"."""
    assert resample_dummy_audio_sample() == "Test passed!"
53 changes: 53 additions & 0 deletions images/scipy-ml-notebook/workflow_tests/test_statsmodels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import numpy as np
import statsmodels.api as sm
import pytest

def test_ols_simple_fit():
    """Fit OLS on seeded synthetic data and check the recovered coefficients."""
    np.random.seed(0)  # fixed seed keeps the synthetic data reproducible

    true_coefficients = [0.5, 2.0]  # intercept, slope

    # One uniform regressor plus a constant column for the intercept.
    regressor = np.random.rand(100, 1)
    design = sm.add_constant(regressor)

    # Linear response with standard-normal noise (drawn after the regressor,
    # so the random stream is consumed in the same order as before).
    noise = np.random.normal(size=100)
    response = np.dot(design, true_coefficients) + noise

    fitted = sm.OLS(response, design).fit()

    # The estimates only need to land within 0.5 of the true values.
    assert np.allclose(fitted.params, true_coefficients, atol=0.5), "The estimated coefficients are not as expected."

def test_logistic_regression_prediction():
    """Fit a Logit model on seeded synthetic data and check in-sample accuracy."""
    np.random.seed(1)

    # Two standard-normal regressors plus a constant column.
    design = sm.add_constant(np.random.randn(100, 2))
    true_coefficients = [0.1, 0.5, -0.3]

    # True probabilities via the sigmoid of the linear predictor, then
    # thresholded at 0.5 to obtain binary labels.
    # NOTE(review): the labels are a deterministic function of the linear
    # predictor, so the classes are perfectly separable; Logit may warn or
    # raise about perfect separation depending on the statsmodels version -
    # confirm this is intended.
    linear_predictor = np.dot(design, true_coefficients)
    true_probabilities = 1 / (1 + np.exp(-linear_predictor))
    labels = (true_probabilities > 0.5).astype(int)

    # disp=0 suppresses the optimizer's convergence output.
    fitted = sm.Logit(labels, design).fit(disp=0)

    predicted_labels = fitted.predict(design) > 0.5

    # Share of in-sample predictions that agree with the labels.
    accuracy = np.mean(predicted_labels == labels)
    assert accuracy > 0.75, "The prediction accuracy should be higher than 75%."

def test_ols_summary_contains_r_squared():
    """Fit a simple OLS model and check that its text summary reports R-squared."""
    np.random.seed(2)

    # Synthetic line y = 2x + 1 with Gaussian noise (scale 0.5).
    regressor = np.random.rand(50, 1)
    response = 2 * regressor.squeeze() + 1 + np.random.normal(scale=0.5, size=50)
    design = sm.add_constant(regressor)

    fitted = sm.OLS(response, design).fit()
    summary_text = str(fitted.summary())

    assert 'R-squared' in summary_text, "'R-squared' not found in the model summary."

0 comments on commit 430f1a3

Please sign in to comment.