Skip to content

Commit

Permalink
Merge pull request #8 from calacademy-research/picturae_import
Browse files Browse the repository at this point in the history
Picturae import
  • Loading branch information
foozleface authored Sep 19, 2023
2 parents e61a26f + e1ef462 commit 5c401e2
Show file tree
Hide file tree
Showing 9 changed files with 165 additions and 86 deletions.
25 changes: 15 additions & 10 deletions image_client/picturae_csv_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,17 @@


class CsvCreatePicturae(Importer):
def __init__(self, date_string, istesting=False):
def __init__(self, date_string, istest = False):
super().__init__(picturae_config, "Botany")
self.init_all_vars(date_string)

if istest is False:
self.run_all()


def init_all_vars(self, date_string):
"""init_all_vars:to use for testing and decluttering init function,
initializes all class level variables """
self.date_use = date_string
self.logger = logging.getLogger('DataOnboard')

Expand All @@ -40,8 +49,6 @@ def __init__(self, date_string, istesting=False):
for param in init_list:
setattr(self, param, None)

if istesting is False:
self.run_all()

def file_present(self):
"""file_present:
Expand Down Expand Up @@ -78,14 +85,14 @@ def file_present(self):
else:
raise ValueError(f"subdirectory for {self.date_use} not present")

def csv_read_folder(self, folder_string):
"""csv_read_folder:
def csv_read_path(self, csv_level: str):
"""csv_read_path:
reads in csv data for given date self.date_use
args:
csv_level: denotes whether specimen or folder level data with "folder" or "specimen"
"""

folder_path = 'picturae_csv/' + str(self.date_use) + '/picturae_' + str(folder_string) + '(' + \
folder_path = 'picturae_csv/' + str(self.date_use) + '/picturae_' + str(csv_level) + '(' + \
str(self.date_use) + ').csv'

folder_csv = pd.read_csv(folder_path)
Expand All @@ -99,8 +106,8 @@ def csv_merge(self):
fold_csv: folder level csv to be input as argument for merging
spec_csv: specimen level csv to be input as argument for merging
"""
fold_csv = self.csv_read_folder("folder")
spec_csv = self.csv_read_folder("specimen")
fold_csv = self.csv_read_path(csv_level="folder")
spec_csv = self.csv_read_path(csv_level="specimen")

# checking if columns to merge contain same data
if (set(fold_csv['specimen_barcode']) == set(spec_csv['specimen_barcode'])) is True:
Expand Down Expand Up @@ -412,8 +419,6 @@ def check_barcode_match(self):
def check_if_images_present(self):
"""checks that each image exists, creating boolean column for later use"""

print(os.getcwd())

self.record_full['image_valid'] = self.record_full['image_path'].apply(self.check_for_valid_image)

def write_upload_csv(self):
Expand Down
41 changes: 25 additions & 16 deletions image_client/picturae_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,32 @@ class PicturaeImporter(Importer):
along with attached images
"""

def __init__(self, paths, date_string=None, istesting=False):
def __init__(self, paths, date_string=None, istest=False):
super().__init__(picturae_config, "Botany")

self.setting_init_variables(date_string=date_string, paths=paths)

# running csv create
CsvCreatePicturae(date_string=self.date_use)

self.file_path = f"PIC_upload/PIC_record_{self.date_use}.csv"

self.record_full = pd.read_csv(self.file_path)

self.batch_size = len(self.record_full)

self.batch_md5 = generate_token(starting_time_stamp, self.file_path)

if istest is False:
self.run_all_methods()


def setting_init_variables(self, date_string, paths):
"""setting init variables:
a list of variables and data structures to be initialized at the beginning of the class.
args:
date_string: the date input received from init params
paths: the paths string received from the init params"""
self.date_use = date_string

self.logger = logging.getLogger('PicturaeImporter')
Expand All @@ -44,18 +67,6 @@ def __init__(self, paths, date_string=None, istesting=False):

self.no_match_dict = {}

# running csv create
if istesting is False:
CsvCreatePicturae(date_string=self.date_use)

self.file_path = f"PIC_upload/PIC_record_{self.date_use}.csv"

self.record_full = pd.read_csv(self.file_path)

self.batch_size = len(self.record_full)

self.batch_md5 = generate_token(starting_time_stamp, self.file_path)

# intializing parameters for database upload
init_list = ['GeographyID', 'taxon_id', 'barcode',
'verbatim_date', 'start_date', 'end_date',
Expand All @@ -73,16 +84,14 @@ def __init__(self, paths, date_string=None, istesting=False):

self.paths = paths

if istesting is False:
self.run_all_methods()


def run_timestamps(self, batch_size: int):
"""updating md5 fields for new taxon and taxon mismatch batches"""
ending_time_stamp = datetime.now()

sql = create_batch_record(start_time=starting_time_stamp, end_time=ending_time_stamp,
batch_md5=self.batch_md5, batch_size=batch_size)
batch_md5=self.batch_md5, batch_size=batch_size)

insert_table_record(connection=self.batch_db_connection, logger_int=self.logger, sql=sql)

Expand Down
57 changes: 57 additions & 0 deletions tests/test_check_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""tests to test the record_present, barcode_present and image_has_record functions."""
import unittest
import picturae_csv_create as pcc
import pandas as pd
from tests.testing_tools import TestingTools

class DatabaseChecks(unittest.TestCase, TestingTools):
    """Unit tests for the record/barcode/image checks on CsvCreatePicturae.

    Each test runs against a small three-row dummy ``record_full`` frame that
    is rebuilt in ``setUp`` so that tests cannot influence one another.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # random token used in place of a real date string for the importer
        self.md5_hash = self.generate_random_md5()

    def setUp(self):
        """creates fake dataset with dummy columns,
           that have a small subset of representative real column names,
        """
        # initializing; the constructor keyword is `istest` (not `istesting`),
        # passing the old name raises a TypeError before any test can run.
        # istest=True keeps the constructor from calling run_all().
        self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash,
                                                       istest=True)

        # creating dummy dataset, one mistake 530923 != 530924 inserted on purpose
        # the test barcode that is set to return a false is 58719322,
        # an unrealistically high barcode higher than digit limit in DB #
        data = {'CatalogNumber': ['530923', '58719322', '8708'],
                'image_path': ['picturae_img/cas0530924.jpg',
                               'picturae_img/cas58719322.jpg',
                               'picturae_img/cas0008708.jpg'],
                'folder_barcode': ['2310_2', '2310_2', '2312_2']}

        self.CsvCreatePicturae.record_full = pd.DataFrame(data)

    def test_barcode_present(self):
        """checks whether boolean column added for record present"""
        self.CsvCreatePicturae.barcode_has_record()
        # checks whether boolean column correctly added (3 dummy columns + 1)
        self.assertEqual(len(self.CsvCreatePicturae.record_full.columns), 4)
        # checks that no rows were dropped
        self.assertEqual(len(self.CsvCreatePicturae.record_full), 3)
        # checks that the correct boolean order is returned
        test_list = list(self.CsvCreatePicturae.record_full['barcode_present'])
        self.assertEqual(test_list, [True, False, True])

    def test_if_barcode_match(self):
        """tests if there is a barcode in the barcode
           column that does not match the barcode in the img file name,
           the correct boolean is returned"""
        self.CsvCreatePicturae.check_barcode_match()
        test_list = list(self.CsvCreatePicturae.record_full['is_barcode_match'])
        # first row is the deliberate 530923 vs 530924 mismatch
        self.assertEqual([False, True, True], test_list)

    def test_image_has_record(self):
        """tests if image_has_record returns true for
           one real attachment in test df"""
        self.CsvCreatePicturae.image_has_record()
        test_list = list(self.CsvCreatePicturae.record_full['image_present'])
        self.assertEqual([True, False, False], test_list)

    def tearDown(self):
        """drops the importer instance created in setUp"""
        del self.CsvCreatePicturae
9 changes: 3 additions & 6 deletions tests/test_col_clean.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,20 @@
"""tests the rename_cols function, to make sure correct column names are assigned"""
import unittest
import shutil
import os
import pandas as pd
import picturae_csv_create as pcc
from tests.testing_tools import TestingTools
import picturae_config
from tests.testing_tools import TestingTools
from datetime import date, timedelta

os.chdir("./image_client")

class ColNamesTest(unittest.TestCase, TestingTools):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.date_string = self.test_date()
self.md5_hash = self.generate_random_md5()
def setUp(self):
"""creates dummy dataset with representative column names"""
# initializing class
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.date_string, istesting = True)
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istest=True)
# creating dummy dataset
numb_range = list(range(1, 101))
column_names = ['application_batch', 'csv_batch', 'object_type', 'folder_barcode',
Expand Down
28 changes: 13 additions & 15 deletions tests/test_pic_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,32 @@
import picturae_csv_create as pcc
import picturae_config
from tests.testing_tools import TestingTools
from datetime import date, timedelta

os.chdir("./image_client")

class DirectoryTests(unittest.TestCase, TestingTools):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.date_string = self.test_date()

self.md5_hash = self.generate_random_md5()

"""WorkingDirectoryTests: a series of unit tests to verify
correct working directory, subdirectories."""
def setUp(self):
"""setUP: unittest setup function creates empty csvs,
and folders for given test path"""
# initializing
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.date_string, istesting=True)
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istest=True)

if self._testMethodName == "test_missing_folder_raise_error":
pass
else:
# create test directories

expected_folder_path = picturae_config.DATA_FOLDER + f"{self.date_string}" + picturae_config.CSV_FOLD + \
f"{self.date_string}" + ").csv"
expected_folder_path = picturae_config.DATA_FOLDER + f"{self.md5_hash}" + picturae_config.CSV_FOLD + \
f"{self.md5_hash}" + ").csv"

expected_specimen_path = picturae_config.DATA_FOLDER + f"{self.date_string}" + picturae_config.CSV_SPEC + \
f"{self.date_string}" + ").csv"
expected_specimen_path = picturae_config.DATA_FOLDER + f"{self.md5_hash}" + picturae_config.CSV_SPEC + \
f"{self.md5_hash}" + ").csv"

# making the directories
os.makedirs(os.path.dirname(expected_folder_path), exist_ok=True)
Expand All @@ -45,7 +43,7 @@ def test_missing_folder_raise_error(self):
"""checks if incorrect sub_directory raises error from file present"""
with self.assertRaises(ValueError) as cm:
self.CsvCreatePicturae.file_present()
self.assertEqual(str(cm.exception), f"subdirectory for {self.date_string} not present")
self.assertEqual(str(cm.exception), f"subdirectory for {self.md5_hash} not present")


def test_expected_path_date(self):
Expand All @@ -61,8 +59,8 @@ def test_raise_specimen(self):
"""test_raise_specimen: tests whether correct value
error is raised for missing specimen_csv"""
# removing test path specimen
os.remove('picturae_csv/' + str(self.date_string) + '/picturae_specimen(' +
str(self.date_string) + ').csv')
os.remove('picturae_csv/' + str(self.md5_hash) + '/picturae_specimen(' +
str(self.md5_hash) + ').csv')
with self.assertRaises(ValueError) as cm:
self.CsvCreatePicturae.file_present()
self.assertEqual(str(cm.exception), "Specimen csv does not exist")
Expand All @@ -71,8 +69,8 @@ def test_raise_folder(self):
"""test_raise_folder: tests whether correct value error
is raised for missing folder_csv"""
# removing test path folder
os.remove('picturae_csv/' + str(self.date_string) + '/picturae_folder(' +
str(self.date_string) + ').csv')
os.remove('picturae_csv/' + str(self.md5_hash) + '/picturae_folder(' +
str(self.md5_hash) + ').csv')
with self.assertRaises(ValueError) as cm:
self.CsvCreatePicturae.file_present()
self.assertEqual(str(cm.exception), "Folder csv does not exist")
Expand All @@ -88,8 +86,8 @@ def tearDown(self):
del self.CsvCreatePicturae
# create test directories

expected_folder_path = picturae_config.DATA_FOLDER + f"{self.date_string}" + picturae_config.CSV_FOLD + \
f"{self.date_string}" + ").csv"
expected_folder_path = picturae_config.DATA_FOLDER + f"{self.md5_hash}" + picturae_config.CSV_FOLD + \
f"{self.md5_hash}" + ").csv"
shutil.rmtree(os.path.dirname(expected_folder_path))


Expand Down
Loading

0 comments on commit 5c401e2

Please sign in to comment.