diff --git a/Dockerfile b/Dockerfile index bfca05f..f8693f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,9 +4,11 @@ FROM quay.io/jupyter/minimal-notebook:notebook-7.0.6 # install necessary packages for analysis RUN conda install -y \ python=3.11.6 \ - altair=5.1.2 \ + altair=altair \ pandas=2.1.2 \ ipykernel=6.26.0 \ scikit-learn=1.3.2 \ requests=2.31.0 \ - notebook=6.5.4 + notebook=7.0.6 \ + pytest=7.4.3 \ + openpyxl=3.1.2 diff --git a/src/breast_cancer_predictor_report.ipynb b/src/breast_cancer_predictor_report.ipynb index 0060371..dbfaf57 100644 --- a/src/breast_cancer_predictor_report.ipynb +++ b/src/breast_cancer_predictor_report.ipynb @@ -87,8 +87,6 @@ "with open(\"../data/raw/breast+cancer+wisconsin+original.zip\", 'wb') as f:\n", " f.write(request.content)\n", "\n", - "pd.arrays\n", - "\n", "with zipfile.ZipFile(\"../data/raw/breast+cancer+wisconsin+original.zip\", 'r') as zip_ref:\n", " zip_ref.extractall(\"../data/raw\")" ] diff --git a/src/read_zip.py b/src/read_zip.py new file mode 100644 index 0000000..b4a6520 --- /dev/null +++ b/src/read_zip.py @@ -0,0 +1,15 @@ +def read_zip(url, directory): + """ + Read a zip file from the given URL and extract its contents to the specified directory. + + Parameters: + ---------- + url : str + The URL of the zip file to be read. + directory : str + The directory where the contents of the zip file will be extracted. + + Returns: + ------- + None + """ diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..6087516 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,9 @@ +## How to run the test suite + +### Preparation of test zip files +The test zip files used in `test_read_zip.py` were generated +by running the `generate_test_zip_files.py` script in the `tests` directory. +These files need to exist in the remote GitHub repository for the tests to pass. 
+If for some reason they go missing from the remote repository, +we can re-run the `generate_test_zip_files.py` script to re-generate them +and then push them to the remote repository. \ No newline at end of file diff --git a/tests/empty.zip b/tests/empty.zip new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/tests/empty.zip differ diff --git a/tests/files_txt_subdir.zip b/tests/files_txt_subdir.zip new file mode 100644 index 0000000..b6d2038 Binary files /dev/null and b/tests/files_txt_subdir.zip differ diff --git a/tests/files_txt_xlsx.zip b/tests/files_txt_xlsx.zip new file mode 100644 index 0000000..be83991 Binary files /dev/null and b/tests/files_txt_xlsx.zip differ diff --git a/tests/generate_test_zip_files.py b/tests/generate_test_zip_files.py new file mode 100644 index 0000000..2d0e6c5 --- /dev/null +++ b/tests/generate_test_zip_files.py @@ -0,0 +1,46 @@ +import pytest +import os +import openpyxl +import zipfile +import shutil + +# Create a directory named 'subdir' +os.makedirs('subdir', exist_ok=True) + +# Create 'test1.txt' and write "test data" to it +with open('test1.txt', 'w') as file: + file.write('test data') + +# Create 'test2.txt' inside 'subdir' and write "test data" to it +with open('subdir/test2.txt', 'w') as file: + file.write('test data') + +# Create 'test1.xlsx' and write "test data" to it +workbook = openpyxl.Workbook() +worksheet = workbook.active +worksheet.cell(row=1, column=1, value='test data') +workbook.save('test1.xlsx') + +# Case 1 - Create a zip file containing 'test1.txt' and 'test1.xlsx' +with zipfile.ZipFile('files_txt_xlsx.zip', 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write('test1.txt') + zipf.write('test1.xlsx') + +# Case 2 - Create a zip file containing 'test1.txt' and 'subdir/test2.txt' +with zipfile.ZipFile('files_txt_subdir.zip', 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write('test1.txt') + zipf.write('subdir/test2.txt') + +# Case 3 - Create an empty zip file +with zipfile.ZipFile('empty.zip', 
'w', zipfile.ZIP_DEFLATED) as zipf: + pass + +# Clean up the files and directories created +test_files = ['test1.txt', 'test1.xlsx'] + +for file in test_files: + if os.path.exists(file): + os.remove(file) + +if os.path.exists("subdir"): + shutil.rmtree("subdir") diff --git a/tests/test_read_zip.py b/tests/test_read_zip.py new file mode 100644 index 0000000..566efc8 --- /dev/null +++ b/tests/test_read_zip.py @@ -0,0 +1,46 @@ +import pytest +import os +import shutil + +# Test files setup +# setup empty directory for data files to be downloaded to +if not os.path.exists('test_zip_data1'): + os.makedirs('test_zip_data1') + +# setup directory that contains a file for data files to be downloaded to +if not os.path.exists('test_zip_data2'): + os.makedirs('test_zip_data2') +with open('test_zip_data2/test3.txt', 'w') as file: + pass # The 'pass' statement does nothing, creating an empty file + +# test read_zip function can download and extract a zip file containing files +# and subdirectories containing files +def test_read_zip_function(): + # add tests here + pass # placeholder body so the module parses until real tests are written + + +# test read_zip function throws an error if the zip file is empty +def test_read_zip_error_on_empty(): + # add tests here + pass # placeholder body so the module parses until real tests are written + + +# test read_zip function throws an error if the input URL is invalid +# (e.g., points to a non-existent file or a non-zip file) +def test_read_zip_error_on_invalid_url(): + # add tests here + pass # placeholder body so the module parses until real tests are written + + +# test read_zip function throws an error +# if the directory path provided does not exist +def test_read_zip_error_on_missing_dir(): + # add tests here + pass # placeholder body so the module parses until real tests are written + + +# clean up data directory +if os.path.exists("subdir"): + shutil.rmtree("subdir") +