diff --git a/README.md b/README.md
index ac5e1d4..1c0dc83 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Each entry in the `metadata` tab can have the following entries:
 * `column`: The column of the spreadsheet that is being described.
 * `neotoma`: A database table and column combination from the database schema.
 * `vocab`: If there is a fixed vocabulary for the column, include the possible terms here.
-* `repeat`: [`true`, `false`] Is each entry unique and tied to the row, or is this a set of entries associated with the site?
+* `repeat`: [`true`, `false`] Does the column contain a single value for the whole site that simply repeats on every row (`true`), or is each entry unique and tied to its own row (`false`)?
 * `type`: [`integer`, `numeric`, `date`] The variable type for the field.
 * `ordered`: [`true`, `false`] Does the order of the column matter?
diff --git a/functions.py b/functions.py
deleted file mode 100644
index e8db15b..0000000
--- a/functions.py
+++ /dev/null
@@ -1,7 +0,0 @@
-def insertSampleAges(cur):
-    sampleageinsert = """
-    """
-
-#def insertSampleAnalyst(cur):
-
-#def insertData(cur):
\ No newline at end of file
diff --git a/neotomaUploader/__init__.py b/neotomaUploader/__init__.py
index 3777331..c903893 100644
--- a/neotomaUploader/__init__.py
+++ b/neotomaUploader/__init__.py
@@ -6,23 +6,36 @@
 import argparse
 import os
-from .cleanCol import cleanCol
+from .clean_column import clean_column
 from .yaml_values import yaml_values
 from .insert_site import insert_site
-from .insertAnalysisUnit import insertAnalysisUnit
-from .validAgent import validAgent
+from .insert_analysisunit import insert_analysisunit
+from .valid_agent import valid_agent
 from .valid_date import valid_date
 from .read_csv import read_csv
 from .validUnits import validUnits
 from .valid_site import valid_site
 from .valid_collectionunit import valid_collectionunit
 from .validGeoPol import validGeoPol
-from .validHorizon import validHorizon
-from .hashFile import hashFile
-from .checkFile import checkFile
-from .insertGeoPol import insertGeoPol
-from .insertCollUnit import insertCollUnit
-from .csvValidator import csvValidator
-from .csvValidator import ymlToDict
+from .valid_horizon import valid_horizon
+from .hash_file import hash_file
+from .check_file import check_file
+from .insert_geopol import insert_geopol
+from .insert_collunit import insert_collunit
+from .csv_validator import csv_validator
+from .csv_validator import yml_to_dict
 from .vocabDict import vocabDict
-from .parseArguments import parseArguments
+from .parse_arguments import parse_arguments
+from .csv_to_yaml import csv_to_yaml
+from .valid_taxa import valid_taxa
+from .insert_chronology import insert_chronology
+from .insert_chron_control import insert_chron_control
+from .insert_dataset import insert_dataset
+from .insert_dataset_pi import insert_dataset_pi
+from .insert_data_processor import insert_data_processor
+from .insert_dataset_repository import insert_dataset_repository
+from .insert_dataset_database import insert_dataset_database
+from .insert_sample import insert_sample
+from .insert_sample_analyst import insert_sample_analyst
+from .insert_data import insert_data
+from .insert_sample_age import insert_sample_age
\ No newline at end of file
diff --git a/neotomaUploader/checkFile.py b/neotomaUploader/check_file.py
similarity index 97%
rename from neotomaUploader/checkFile.py
rename to neotomaUploader/check_file.py
index 89c9448..851f86d 100644
---
a/neotomaUploader/checkFile.py +++ b/neotomaUploader/check_file.py @@ -1,7 +1,7 @@ import re import os -def checkFile(filename): +def check_file(filename): """_Validate the existence and result of a logfile._ Args: diff --git a/neotomaUploader/cleanCol.py b/neotomaUploader/clean_column.py similarity index 90% rename from neotomaUploader/cleanCol.py rename to neotomaUploader/clean_column.py index 53ba1aa..50ff997 100644 --- a/neotomaUploader/cleanCol.py +++ b/neotomaUploader/clean_column.py @@ -1,4 +1,4 @@ -def cleanCol(column, template, clean = True): +def clean_column(column, template, clean = True): """_cleanCol_ Args: diff --git a/neotomaUploader/csvValidator.py b/neotomaUploader/csvValidator.py deleted file mode 100644 index 67e28dd..0000000 --- a/neotomaUploader/csvValidator.py +++ /dev/null @@ -1,49 +0,0 @@ -import yaml -from yaml.loader import SafeLoader -from collections import defaultdict -import itertools -import pandas as pd -import os -import sys -import argparse - -""" -To run from command line use: -python csvValidator.py /path/to/directory -Example:/ -python 210Pb_Template/neotomaUploader/csvValidator.py --path=210Pb_Template/data/ --template=210Pb_Template/template.yml -""" - -def ymlToDict(yml_file): - with open(yml_file) as f: - yml_data = yaml.load(f, Loader=SafeLoader) - return yml_data - -def csvValidator(filename, yml_data): - log_file = [] - # Take directly from .yml file - col_values = [d.get('column') for d in yml_data] - - try: - # Load csv file as data frame and extract columns - df = pd.read_csv(filename) - df_columns = list(df.columns) - # Verify that all columns from the DF are in the YAML file - diff_col = sorted(set(col_values) - set(df_columns)) - - # Verify that all columns from the YAML are in the DF - diff_val = sorted(set(df_columns)-set(col_values)) - - # Report in the log - if diff_col == diff_val: - message = ["✔ The column names and flattened YAML keys match"] - log_file = log_file + message - else: - log_file = log_file + ["✗ The column names and flattened YAML keys do not match"] - log_file = log_file + [f"Columns from the YAML template are not in the data frame: '{diff_val}'"] - log_file = log_file + [f"Columns from the data frame not in the YAML template: '{diff_col}'"] - - except Exception as e: - log_file.append(f"✗ Error opening file '{filename}': {e}"+ '\n') - - return log_file diff --git a/neotomaUploader/csv_to_yaml.py b/neotomaUploader/csv_to_yaml.py new file mode 100644 index 0000000..ab50cbe --- /dev/null +++ b/neotomaUploader/csv_to_yaml.py @@ -0,0 +1,26 @@ +import pandas as pd +import yaml + +def csv_to_yaml(xl_path, yml_output= 'output_yml.yml'): + """ + _csv_to_yaml_ + + Args: + xl_path (_list_): _Excel file to be used as template_ + yml_output (_list_): _Location and file name where the yaml template will be stored_ + + Returns: + _None_: _The output file will be stored, no need to return anything here_ + """ + df = pd.read_excel(xl_path) + + # Convert DataFrame to a dictionary with list of columns + data_dict = df.to_dict(orient='records') + nested_data = [{key: value for key, value in zip(df.columns, row)} for row in data_dict] + + with open(yml_output, 'w') as yaml_file: + yaml.dump(nested_data, yaml_file, default_flow_style=False) + + print(f'YAML file stored in {yml_output} successfully.') + + return None \ No newline at end of file diff --git a/neotomaUploader/csv_validator.py b/neotomaUploader/csv_validator.py new file mode 100644 index 0000000..2ae2ae6 --- /dev/null +++ b/neotomaUploader/csv_validator.py @@ -0,0 +1,73 @@ 
+import yaml
+from yaml.loader import SafeLoader
+from collections import defaultdict
+import itertools
+import pandas as pd
+import os
+import sys
+import argparse
+
+"""
+To run from command line use:
+python csv_validator.py /path/to/directory
+Example:
+python 210Pb_Template/neotomaUploader/csv_validator.py --path=210Pb_Template/data/ --template=210Pb_Template/template.yml
+"""
+
+def yml_to_dict(yml_file):
+    """_Read in valid yaml file._
+
+    Args:
+        yml_file (_string_): _A valid filename for a yaml file._
+
+    Returns:
+        _dict_: _A dict representation of a yaml file._
+    """
+    if not os.path.isfile(yml_file):
+        raise FileNotFoundError(f"The file '{yml_file}' could not be found within the current path.")
+
+    with open(yml_file, encoding = "UTF-8") as file:
+        yml_data = yaml.load(file, Loader = SafeLoader)
+    return yml_data
+
+
+def csv_validator(filename, yml_data):
+    """_Validate csv file for use in the validator._
+
+    Args:
+        filename (_string_): _A valid csv filename._
+        yml_data (_dict_): _A dict passed from yml_to_dict()_
+
+    Returns:
+        _list_: _A list of log messages describing the validation result._
+    """
+    log_file = []
+    # Take directly from .yml file
+    col_values = [d.get('column') for d in yml_data]
+
+    if not os.path.isfile(filename):
+        raise FileNotFoundError(f"The file '{filename}' could not be found within the current path.")
+
+    try:
+        # Load csv file as data frame and extract columns
+        df = pd.read_csv(filename)
+    except pd.errors.ParserError as e:
+        log_file.append(f"✗ Error opening file '{filename}': {e}" + '\n')
+        return log_file
+
+    df_columns = list(df.columns)
+    # Verify that all columns from the DF are in the YAML file
+    diff_col = sorted(set(col_values) - set(df_columns))
+
+    # Verify that all columns from the YAML are in the DF
+    diff_val = sorted(set(df_columns)-set(col_values))
+
+    # Report in the log
+    if diff_col == diff_val:
+        message = ["✔ The column names and flattened YAML keys match"]
+        log_file = log_file + message
+    else:
+        log_file = log_file + ["✗ The column names and flattened YAML keys do not match"]
+        log_file = log_file + [f"Columns from the YAML template are not in the data frame: '{diff_val}'"]
+        log_file = log_file + [f"Columns from the data frame not in the YAML template: '{diff_col}'"]
+
+    return log_file
diff --git a/neotomaUploader/getAgent.py b/neotomaUploader/getAgent.py
deleted file mode 100644
index 7bec5dc..0000000
--- a/neotomaUploader/getAgent.py
+++ /dev/null
@@ -1,12 +0,0 @@
-def validAgent(cur, agentname):
-    """_Get user agent or contact from Neotoma_
-
-    Args:
-        cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._
-        agentname (_string_): _A user name or individual._
-    """
-    nameQuery = """
-        SELECT ct.contactid
-        FROM ndb.contacts AS ct
-        WHERE %(name)s = ct.contactname"""
-    cur.execute(nameQuery, {'name'})
diff --git a/neotomaUploader/hashFile.py b/neotomaUploader/hash_file.py
similarity index 96%
rename from neotomaUploader/hashFile.py
rename to neotomaUploader/hash_file.py
index d9c1b31..033bd08 100644
--- a/neotomaUploader/hashFile.py
+++ b/neotomaUploader/hash_file.py
@@ -1,7 +1,7 @@
 import hashlib
 import os
-def hashFile(filename):
+def hash_file(filename):
     response = {'pass': False, 'hash': None, 'message': []}
     logfile = filename + '.log'
     response['hash'] = hashlib.md5(open(filename,'rb').read()).hexdigest()
diff --git a/neotomaUploader/insertAnalysisUnit.py b/neotomaUploader/insertAnalysisUnit.py
deleted file mode 100644
index 5af9a57..0000000
--- a/neotomaUploader/insertAnalysisUnit.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from .retrieveDict import
retrieveDict -from .cleanCol import cleanCol - -def insertAnalysisUnit(cur, yml_dict, csvTemplate, uploader): - """_Inserting analysis units_ - - Args: - cur (_type_): _description_ - yml_dict (_type_): _description_ - csvTemplate (_type_): _description_ - uploader (_type_): _description_ - - Returns: - _type_: _description_ - """ - addUnit = """ - SELECT ts.insertanalysisunit(_collectionunitid := %(collunitid)s, _mixed := FALSE, _depth := %(depth)s, _thickness := %(thickness)s) - """ - - depthD = retrieveDict(yml_dict, 'ndb.analysisunits.depth') - thickD = retrieveDict(yml_dict, 'ndb.analysisunits.thickness') - - depths = cleanCol(depthD.get('column'), - csvTemplate, - clean = not depthD.get('repeat')) - - thicks = cleanCol(thickD.get('column'), - csvTemplate, - clean = not thickD.get('repeat')) - - anunits = [] - for i, value in enumerate(depths): - cur.execute(addUnit, {'collunitid': uploader['collunitid'], - 'depth': value, 'thickness': thicks[i]}) - anunits.append(cur.fetchone()[0]) - return anunits diff --git a/neotomaUploader/insertChronControl.py b/neotomaUploader/insertChronControl.py deleted file mode 100644 index 8d97d45..0000000 --- a/neotomaUploader/insertChronControl.py +++ /dev/null @@ -1,35 +0,0 @@ -import logging - -def insertChronControl(cur, chronid, annunits, dthick, - agetype): - for i in range(len(dthick)): - dthick[i]['annunit'] = annunits[i] - - addcontrol = """ - SELECT ts.insertchroncontrol(_chronologyid := %(chronid)s, - _chroncontroltypeid := 10, - _analysisunitid := %(annuid)s, - _depth := %(depth)s, - _thickness := %(thickness)s, - _agetypeid := %(agetypeid)s, - _age := %(age)s, - _agelimityounger := %(ageyoung)s, - _agelimitolder := %(ageold)s, - _notes := %(notes)s)""" - for i in dthick: - if agetype == 'cal yr BP': - agetypeid = 2 - elif agetype == 'CE/BCE': - agetypeid = 1 - else: - logging.error("The provided age type is incorrect..") - cur.execute(addcontrol, {'chronid': chronid, - 'annuid': i['annunit'], - 'depth': i['depth'], - 'thickness': i['thick'], - 'agetypeid': agetypeid, - 'age': i['age'], - 'ageyoung': i['age'] + i['error'], - 'ageold': i['age'] - i['error']}) - return None - diff --git a/neotomaUploader/insertChronology.py b/neotomaUploader/insertChronology.py deleted file mode 100644 index 0bef025..0000000 --- a/neotomaUploader/insertChronology.py +++ /dev/null @@ -1,43 +0,0 @@ -import datetime -import logging - -def insertChronology(cur, collunitid, agetype, agemodel, ages, - contactname, default = True, - chronologyname = 'Default 210Pb', - dateprepared = datetime.datetime.today().date()): - def cleanage(x): - try: - y = float(x) - except ValueError: - y = None - return y - - cleanage = list(map(lambda x: cleanage(x), ages)) - minage = min([i for i in cleanage if i is not None]) - maxage = max([i for i in cleanage if i is not None]) - - addChron = """SELECT ts.insertchronology(_collectionunitid := %(collunitid)s, - _agetypeid := %(agetype)s, - _contactid := %(contactid)s, - _isdefault := TRUE, - _chronologyname := %(chronologyname)s, - _dateprepared := %(dateprepared)s, - _agemodel := %(agemodel)s, - _ageboundyounger := %(maxage)s, - _ageboundolder := %(minage)s)""" - getCont = """SELECT contactid FROM ndb.contacts WHERE %(contactname)s %% contactname;""" - cur.execute(getCont, {'contactname': contactname[0]}) - contactid = cur.fetchone()[0] - if agetype == 'cal yr BP': - agetypeid = 2 - elif agetype == 'CE/BCE': - agetypeid = 1 - else: - logging.error("The provided age type is incorrect..") - cur.execute(addChron, {'collunitid':collunitid, 
'contactid': contactid, - 'chronologyname': chronologyname, - 'agetype': agetypeid, - 'dateprepared': dateprepared, 'agemodel': agemodel, - 'maxage': int(maxage), 'minage': int(minage)}) - chronid = cur.fetchone()[0] - return chronid diff --git a/neotomaUploader/insertCollUnit.py b/neotomaUploader/insertCollUnit.py deleted file mode 100644 index bb7c775..0000000 --- a/neotomaUploader/insertCollUnit.py +++ /dev/null @@ -1,69 +0,0 @@ -import datetime -import logging -from .retrieveDict import retrieveDict -from .cleanCol import cleanCol - -def insertCollUnit(cur, yml_dict, csvTemplate, uploader): - """_Insert a new collection unit to a site_ - - Args: - cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._ - sitename (_list_): _A list returned by the function cleanCol()_ - coords (_list_): _A list returned by the function cleanCol()_ - - Returns: - _int_: _The integer value of the newly created siteid from the Neotoma Database._ - - Returns: - _type_: _description_ - """ - - coordsD = retrieveDict(yml_dict, 'ndb.collectionunits.geom') - collnameD = retrieveDict(yml_dict, 'ndb.collectionunits.handle') - collDateD = retrieveDict(yml_dict, 'ndb.collectionunits.colldate') - collLocD = retrieveDict(yml_dict, 'ndb.collectionunits.location') - - coords = cleanCol(coordsD.get('column'), - csvTemplate, - clean = not coordsD.get('repeat')) - - colldate = cleanCol(collDateD.get('column'), - csvTemplate, - clean = not collDateD.get('repeat')) - - collunits = cleanCol(collnameD.get('column'), - csvTemplate, - clean = not collnameD.get('repeat')) - - location = cleanCol(collLocD.get('column'), - csvTemplate, - clean = not collLocD.get('repeat')) - - newdate = datetime.datetime.strptime(colldate[0], '%Y-%m-%d').date() - - handle = collunits[0].upper().replace(' ', '')[0:9] - - try: - coords = list(map(lambda x: float(x), coords[0].split(','))) - assert len(coords) == 2 - assert coords[0] >= -90 and coords[0] <= 90 - assert coords[1] >= -180 and coords[1] <= 180 - except AssertionError: - logging.error("Coordinates are improperly formatted. 
They must be in the form 'LAT, LONG' [-90 -> 90] and [-180 -> 180].") - - cur.execute(""" - SELECT ts.insertcollectionunit( - _handle := %(handle)s, - _collunitname := %(collname)s, - _siteid := %(siteid)s, - _colltypeid := 3, - _depenvtid := 19, - _colldate := %(newdate)s, - _location := %(location)s, - _gpslatitude := %(ns)s, _gpslongitude := %(ew)s)""", - {'collname': collunits[0], 'newdate': newdate, - 'siteid' : uploader.get('siteid'), - 'handle': handle, 'location': location[0], - 'ns': coords[0], 'ew': coords[1]}) - collunitid = cur.fetchone()[0] - return collunitid diff --git a/neotomaUploader/insertDataProcessor.py b/neotomaUploader/insertDataProcessor.py deleted file mode 100644 index fb2df2f..0000000 --- a/neotomaUploader/insertDataProcessor.py +++ /dev/null @@ -1,5 +0,0 @@ -def insertDataProcessor(cur, datasetid, names): - processor = """SELECT ts.insertdataprocessor(_datasetid := %(datasetid)s, - _contactid := %(contactid)s)""" - cur.execute(processor, {'datasetid': datasetid, 'contactid': names}) - return None diff --git a/neotomaUploader/insertDataset.py b/neotomaUploader/insertDataset.py deleted file mode 100644 index 3500973..0000000 --- a/neotomaUploader/insertDataset.py +++ /dev/null @@ -1,9 +0,0 @@ -def insertDataset (cur, collunitid, datasetname): - insertString = """SELECT ts.insertdataset( - __collectionunitid:= %(collunitid)s, - _datasettypeid := , - _datasetname := %(datasetname)s);""" - cur.execute(insertString, {'collunitid': collunitid, - 'datasetname': datasetname}) - datasetid = cur.fetchone()[0] - return datasetid diff --git a/neotomaUploader/insertDatasetPI.py b/neotomaUploader/insertDatasetPI.py deleted file mode 100644 index dfc3fac..0000000 --- a/neotomaUploader/insertDatasetPI.py +++ /dev/null @@ -1,14 +0,0 @@ -def insertDatasetPI(cur, datasetid, datasetpis): - result = [] - getCont = """SELECT * FROM ndb.contacts WHERE contactname %% %(name)s;""" - contids = [] - baseid = 1 - for i in datasetpis: - cur.execute(getCont, {'name': i}) - contids.append({'name': i, 'id': cur.fetchone()[0], 'order': baseid}) - baseid = baseid + 1 - for i in contids: - inserter = """SELECT ts.insertdatasetpi(_datasetid := %(datasetid)s, _contactid := %(contid)s);""" - cur.execute(inserter, {'datasetid': datasetid, 'contid': contids}) - result.append(cur.fetchone()[0]) - return result diff --git a/neotomaUploader/insertGeoPol.py b/neotomaUploader/insertGeoPol.py deleted file mode 100644 index 914005b..0000000 --- a/neotomaUploader/insertGeoPol.py +++ /dev/null @@ -1,21 +0,0 @@ -def insertGeoPol(cur, uploader): - if 'siteid' in uploader.keys(): - # First test if the site exists. 
- isAdded = """SELECT * FROM ap.sitegadm WHERE siteid = %(siteid)s""" - cur.execute(isAdded, { 'siteid': uploader['siteid'] }) - result = cur.fetchone() - if result is None: - # If the site doesn't already exist: - assignGeoPol = """ - INSERT INTO ap.sitegadm(siteid, fid) - (SELECT st.siteid, ga.fid - FROM ndb.sites AS st - JOIN ap.gadm_410 AS ga ON ST_Covers(ga.geom, st.geog) - WHERE st.siteid = %(siteid)s);""" - cur.execute(assignGeoPol, { 'siteid': uploader['siteid'] }) - result = cur.fetchone() - if result is not None: - result = result[0] - else: - result = None - return result diff --git a/neotomaUploader/insertSamples.py b/neotomaUploader/insertSamples.py deleted file mode 100644 index 2a4486a..0000000 --- a/neotomaUploader/insertSamples.py +++ /dev/null @@ -1,7 +0,0 @@ -def insertSamples(cur, datasetid, annunitss): - sampleinsert = """SELECT ts.insertsample(_analysisunitid := %(annuid)s, - _datasetid := %(datasetid)s, - _sampledate := %(sampdate)s, - _analysisdate := %(anndate)s, - _taxonid := %(taxonid)s)""" - \ No newline at end of file diff --git a/neotomaUploader/insert_analysisunit.py b/neotomaUploader/insert_analysisunit.py new file mode 100644 index 0000000..e418a89 --- /dev/null +++ b/neotomaUploader/insert_analysisunit.py @@ -0,0 +1,48 @@ +from .retrieve_dict import retrieve_dict +from .clean_column import clean_column +import logging +from .pull_params import pull_params + +def insert_analysisunit(cur, yml_dict, csv_template, uploader): + """_Inserting analysis units_ + + Args: + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma + Paleoecology Database._ + yml_dict (_dict_): _A `dict` returned by the YAML template._ + csv_template (_dict_): _The csv file with the required data to be uploaded._ + uploader (_dict_): A `dict` object that contains critical information about the + object uploaded so far. 
+ + Returns: + _int_: _The integer value of the newly created siteid from the Neotoma Database._ + """ + + add_unit = """ + SELECT ts.insertanalysisunit(_collectionunitid := %(collunitid)s, + _depth := %(depth)s, + _thickness := %(thickness)s, + _faciesid := %(faciesid)s, + _mixed := %(mixed)s, + _igsn := %(igsn)s, + _notes := %(notes)s) + """ + + params = ["analysisunitname", "depth", "thickness", "faciesid", "mixed", "igsn", "notes"] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.analysisunits') + + anunits = [] + for i, value in enumerate(inputs['depth']): + if inputs['mixed'][i] == None: + mixed_input = False + else: + mixed_input = inputs['mixed'][i] + cur.execute(add_unit, {'collunitid': uploader['collunitid'], + 'depth': inputs['depth'][i], + 'thickness': inputs['thickness'][i], + 'faciesid': inputs['faciesid'][i], + 'mixed': mixed_input, + 'igsn': inputs['igsn'][i], + 'notes': inputs['notes'][i]}) + anunits.append(cur.fetchone()[0]) + return anunits diff --git a/neotomaUploader/insert_chron_control.py b/neotomaUploader/insert_chron_control.py new file mode 100644 index 0000000..0dfccbd --- /dev/null +++ b/neotomaUploader/insert_chron_control.py @@ -0,0 +1,50 @@ +import logging +import numpy as np +from .pull_params import pull_params + +def insert_chron_control(cur, yml_dict, csv_template, uploader): + addcontrol = """ + SELECT ts.insertchroncontrol(_chronologyid := %(chronid)s, + _chroncontroltypeid := 10, + _analysisunitid := %(annuid)s, + _depth := %(depth)s, + _thickness := %(thickness)s, + _agetypeid := %(agetypeid)s, + _age := %(age)s, + _agelimityounger := %(ageyoung)s, + _agelimitolder := %(ageold)s, + _notes := %(notes)s)""" + + params = ["depth", "thickness", 'notes',] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.analysisunits') + + params_age = ['age'] + inputs_age = pull_params(params_age, yml_dict, csv_template, 'ndb.sampleages') + + inputs_age['age'] = [float(value) if value != 'NA' else np.nan for value in inputs_age['age']] + inputs_age['uncertainty'] = [float(value) if value != 'NA' else np.nan for value in inputs_age['uncertainty']] + agetype = list(set(inputs_age['unitcolumn'])) + agetype = agetype[0] + + assert len(uploader['anunits']) == len(inputs_age['age']) == len(inputs['thickness']), \ + "The number of analysis units, ages, and thicknesses is not the same. Please check." 
+ + chron_control_units = list() + for i in range(len(uploader['anunits'])): + if inputs_age['unitcolumn'][i] == 'cal yr BP': + agetypeid = 2 + elif inputs_age['unitcolumn'][i] == 'CE/BCE': + agetypeid = 1 + else: + logging.error("The provided age type is incorrect..") + cur.execute(addcontrol, {'chronid': int(uploader['chronology']), + 'annuid': int(uploader['anunits'][i]), + 'depth': inputs['depth'][i], + 'thickness': inputs['thickness'][i], + 'agetypeid': agetypeid, + 'age': inputs_age['age'][i], + 'notes':inputs['notes'][i], + 'ageyoung': inputs_age['age'][i] + inputs_age['uncertainty'][i], + 'ageold': inputs_age['age'][i] - inputs_age['uncertainty'][i]}) + chron_control_units.append(cur.fetchone()[0]) + return chron_control_units \ No newline at end of file diff --git a/neotomaUploader/insert_chronology.py b/neotomaUploader/insert_chronology.py new file mode 100644 index 0000000..173af10 --- /dev/null +++ b/neotomaUploader/insert_chronology.py @@ -0,0 +1,52 @@ +import datetime +import logging +import datetime +import numpy as np +from .pull_params import pull_params + +def insert_chronology(cur, yml_dict, csv_template, uploader): + addChron = """ + SELECT ts.insertchronology(_collectionunitid := %(collunitid)s, + _agetypeid := %(agetype)s, + _contactid := %(contactid)s, + _isdefault := TRUE, + _chronologyname := %(chronologyname)s, + _dateprepared := %(dateprepared)s, + _agemodel := %(agemodel)s, + _ageboundyounger := %(maxage)s, + _ageboundolder := %(minage)s) + """ + + get_cont = """SELECT contactid FROM ndb.contacts WHERE %(contactname)s = contactname;""" + + params = ["contactid", "agemodel", "notes"] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.chronologies') + + params2 = ['age'] + inputs_age = pull_params(params2, yml_dict, csv_template, 'ndb.sampleages') + + inputs_age['age'] = [float(value) if value != 'NA' else np.nan for value in inputs_age['age']] + agetype = list(set(inputs_age['unitcolumn'])) + agetype = agetype[0] + + cur.execute(get_cont, {'contactname': inputs['contactid'][0]}) + contactid = cur.fetchone()[0] + + if agetype == 'cal yr BP': + agetypeid = 2 + elif agetype == 'CE/BCE': + agetypeid = 1 + else: + logging.error("The provided age type is incorrect..") + + cur.execute(addChron, {'collunitid': int(uploader['collunitid']), + 'contactid': contactid, + 'chronologyname': 'Default 210Pb', # This is a default but might be better to specify in template + 'agetype': agetypeid, # Comming from column X210Pb.Date.Units which should be linked to params3 + 'dateprepared': datetime.datetime.today().date(), # Default but should be coming from template s + 'agemodel': inputs['agemodel'][0], + 'maxage': int(max(inputs_age['age'])), + 'minage': int(min(inputs_age['age']))}) + chronid = cur.fetchone()[0] + + return chronid diff --git a/neotomaUploader/insert_collunit.py b/neotomaUploader/insert_collunit.py new file mode 100644 index 0000000..2a490c6 --- /dev/null +++ b/neotomaUploader/insert_collunit.py @@ -0,0 +1,58 @@ +import logging +from .pull_params import pull_params + +def insert_collunit(cur, yml_dict, csv_template, uploader): + """_Insert a new collection unit to a site_ + + Args: + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma + Paleoecology Database._ + yml_dict (_dict_): _A `dict` returned by the YAML template._ + csv_template (_dict_): _The csv file with the required data to be uploaded._ + uploader (_dict_): A `dict` object that contains critical information about the + object uploaded so far. 
+
+    Returns:
+        _int_: _The integer value of the newly created siteid from the Neotoma Database._
+    """
+    try:
+        # Here we're just checking to make sure that we do have a site coordinate
+        # and geometry.
+        assert all(element in [d.get('neotoma') for d in yml_dict.get('metadata')]
+                   for element in ['ndb.collectionunits.handle'])
+    except AssertionError:
+        logging.error("The template must contain a collectionunit handle.", exc_info = True)
+    params = ["handle", "colltypeid", "depenvtid", "collunitname", "colldate", "colldevice",
+              "gpslatitude", "gpslongitude", "gpsaltitude", "gpserror",
+              "waterdepth", "substrateid", "slopeaspect", "slopeangle", "location", "notes", "geog"]
+    inputs = pull_params(params, yml_dict, csv_template, 'ndb.collectionunits')
+    try:
+        coords = inputs['geog']
+        assert len(coords) == 2
+        assert coords[0] >= -90 and coords[0] <= 90
+        assert coords[1] >= -180 and coords[1] <= 180
+    except AssertionError:
+        logging.error("Coordinates are improperly formatted. They must be in the form 'LAT, LONG' [-90 -> 90] and [-180 -> 180].")
+    collname = inputs['handle'][0]
+    cur.execute("""
+        SELECT ts.insertcollectionunit(
+            _handle := %(handle)s,
+            _collunitname := %(collname)s,
+            _siteid := %(siteid)s,
+            _colltypeid := %(colltypeid)s,
+            _depenvtid := %(depenvtid)s,
+            _colldate := %(newdate)s,
+            _location := %(location)s,
+            _gpslatitude := %(ns)s,
+            _gpslongitude := %(ew)s)""",
+        {'handle': collname[:10], # Handle must be at most 10 chars
+         'collname': collname,
+         'siteid' : uploader.get('siteid'),
+         'colltypeid': 3, # to do: put it as input
+         'depenvtid': 19, # to do: put it as input
+         'newdate': inputs['colldate'][0],
+         'location': inputs['location'][0],
+         'ew': coords[1],
+         'ns': coords[0]})
+    collunitid = cur.fetchone()[0]
+    return collunitid
\ No newline at end of file
diff --git a/neotomaUploader/insert_data.py b/neotomaUploader/insert_data.py
new file mode 100644
index 0000000..28f556e
--- /dev/null
+++ b/neotomaUploader/insert_data.py
@@ -0,0 +1,61 @@
+import logging
+from .pull_params import pull_params
+
+def insert_data(cur, yml_dict, csv_template, uploader):
+    data_query = """
+        SELECT ts.insertdata(_sampleid := %(sampleid)s,
+                             _variableid := %(variableid)s,
+                             _value := %(value)s)
+    """
+    params = ['value']
+    inputs = pull_params(params, yml_dict, csv_template, 'ndb.data')
+    data_points = []
+    for i, val_dict in enumerate(inputs):
+        val_dict['value'] = [None if item == 'NA' else item for item in val_dict['value']]
+        val_dict['variableelementid'] = None # placeholder
+        val_dict['variablecontextid'] = None # placeholder
+
+        # Getting TaxonID
+        get_taxonid = """SELECT * FROM ndb.taxa WHERE taxonname %% %(taxonname)s;"""
+        cur.execute(get_taxonid, {'taxonname': inputs[i]['taxonname']})
+        taxonid = cur.fetchone()
+        if taxonid is not None:
+            taxonid = int(taxonid[0])
+        else:
+            #print(inputs[i]['taxonname'])
+            taxonid = 5 #placeholder
+
+        for j, val in enumerate(val_dict['unitcolumn']):
+            # Get UnitsID
+            get_unitsid = """SELECT * FROM ndb.variableunits WHERE variableunits %% %(units)s;"""
+            cur.execute(get_unitsid, {'units': val_dict['unitcolumn'][j]})
+            unitsid = cur.fetchone()[0] # This is just getting the varunitsid
+
+            get_varid = """SELECT * FROM ndb.variables
+                           WHERE variableunitsid = %(unitsid)s
+                           AND taxonid = %(taxonid)s
+                           AND variableelementid = %(variableelementid)s
+                           AND variablecontextid = %(variablecontextid)s
+                        """
+            cur.execute(get_varid, {'unitsid':unitsid, 'taxonid': taxonid, 'variableelementid': val_dict['variableelementid'], 'variablecontextid':
val_dict['variablecontextid']}) + varid = cur.fetchone() + if varid != None: + varid = int(varid[0]) + else: + var_query = """SELECT ts.insertvariable(_taxonid := %(taxonid)s, + _variableelementid := %(variableelementid)s, + _variableunitsid := %(variableunitsid)s, + _variablecontextid := %(variablecontextid)s)""" + cur.execute(var_query, {'taxonid': taxonid, + 'variableelementid': None, + 'variableunitsid': unitsid, + 'variablecontextid': None}) # inputs[i]['variablecontextid']}) + varid = cur.fetchone()[0] + cur.execute(data_query, {'sampleid': int(uploader['samples'][i]), + 'variableid': int(varid), + 'value': val_dict['value'][i]}) + + result = cur.fetchone()[0] + data_points.append(result) + + return data_points \ No newline at end of file diff --git a/neotomaUploader/insert_data_processor.py b/neotomaUploader/insert_data_processor.py new file mode 100644 index 0000000..3451cfe --- /dev/null +++ b/neotomaUploader/insert_data_processor.py @@ -0,0 +1,23 @@ +import logging +from .pull_params import pull_params + +def insert_data_processor(cur, yml_dict, csv_template, uploader): + + params = ['contactid'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.sampleanalysts') + + get_contact = """SELECT * FROM ndb.contacts WHERE contactname %% %(name)s;""" + + contids = list() + for i in inputs['contactid']: + cur.execute(get_contact, {'name': i}) + contids.append({'name': i, 'id': cur.fetchone()[0]}) + + results = [] + for contact in contids: + processor = """SELECT ts.insertdataprocessor(_datasetid := %(datasetid)s, + _contactid := %(contactid)s)""" + cur.execute(processor, {'datasetid': int(uploader['datasetid']), + 'contactid': int(contact['id'])}) + results.append(cur.fetchone()[0]) + return None \ No newline at end of file diff --git a/neotomaUploader/insert_data_repository.py b/neotomaUploader/insert_data_repository.py new file mode 100644 index 0000000..852f39f --- /dev/null +++ b/neotomaUploader/insert_data_repository.py @@ -0,0 +1,17 @@ +import logging +from .pull_params import pull_params +import numpy as np + +def insert_data_repository(cur, yml_dict, csv_template, uploader): + """ + """ + + repo_query = """ + SELECT ts.insertrepositoryinstitution(_acronym := %(acronym)s, + _repository := %(repository)s, + _notes := %(notes)s) + """ + params = ['acronym', 'repo'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.repository') + + return \ No newline at end of file diff --git a/neotomaUploader/insert_dataset.py b/neotomaUploader/insert_dataset.py new file mode 100644 index 0000000..ad2e114 --- /dev/null +++ b/neotomaUploader/insert_dataset.py @@ -0,0 +1,20 @@ +import logging +from .pull_params import pull_params + +def insert_dataset (cur, yml_dict, csv_template, uploader): + #cur, collunitid, datasetname): + dataset_query = """SELECT ts.insertdataset(_collectionunitid:= %(collunitid)s, + _datasettypeid := %(datasettypeid)s, + _datasetname := %(datasetname)s);""" + + params = ['datasetname', 'datasettypeid'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.datasets') + + inputs = dict(map(lambda item: (item[0], None if all([i is None for i in item[1]]) else item[1]), + inputs.items())) + + cur.execute(dataset_query, {'collunitid': int(uploader['collunitid']), + 'datasettypeid': int(5), #inputs['datasettypeid'], + 'datasetname': inputs['datasetname']}) + datasetid = cur.fetchone()[0] + return datasetid diff --git a/neotomaUploader/insert_dataset_database.py b/neotomaUploader/insert_dataset_database.py new file mode 100644 index 0000000..ed607a7 --- 
/dev/null +++ b/neotomaUploader/insert_dataset_database.py @@ -0,0 +1,17 @@ + +import logging +from .pull_params import pull_params + +def insert_dataset_database(cur, yml_dict, uploader): + db_query = """ + SELECT ts.insertdatasetdatabase(_datasetid := %(datasetid)s, + _databaseid := %(databaseid)s) + """ + # Put it in the XLXs + databaseid = yml_dict['databaseid'] + + cur.execute(db_query, {'datasetid': int(uploader['datasetid']), + 'databaseid': int(databaseid)}) + result = cur.fetchone()[0] + + return result \ No newline at end of file diff --git a/neotomaUploader/insert_dataset_pi.py b/neotomaUploader/insert_dataset_pi.py new file mode 100644 index 0000000..b32fc51 --- /dev/null +++ b/neotomaUploader/insert_dataset_pi.py @@ -0,0 +1,28 @@ +import logging +from .pull_params import pull_params + +def insert_dataset_pi(cur, yml_dict, csv_template, uploader): + + params = ['contactname'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.contacts') + + get_contact = """SELECT * FROM ndb.contacts WHERE contactname %% %(name)s;""" + + baseid = 1 + contids = [] + for i in inputs['contactname']: + cur.execute(get_contact, {'name': i}) + contids.append({'name': i, 'id': cur.fetchone()[0], 'order': baseid}) + baseid = baseid + 1 + + result = [] + for contact in contids: + inserter = """SELECT ts.insertdatasetpi(_datasetid := %(datasetid)s, + _contactid := %(contid)s, + _piorder := %(piorder)s);""" + cur.execute(inserter, {'datasetid': int(uploader['datasetid']), + 'contid': int(contact['id']), + 'piorder': int(contact['order'])}) + result.append(cur.fetchone()[0]) + + return result \ No newline at end of file diff --git a/neotomaUploader/insert_dataset_repository.py b/neotomaUploader/insert_dataset_repository.py new file mode 100644 index 0000000..2dace69 --- /dev/null +++ b/neotomaUploader/insert_dataset_repository.py @@ -0,0 +1,14 @@ + +import logging +from .pull_params import pull_params + +def insert_dataset_repository(cur, yml_dict, csv_template, uploader): + params = ['contactid'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.sampleanalysts') + repo_query = """SELECT ts.insertdatasetrepository(_acronym:= %(acronym)s, + _repository := %(repository)s, + _notes := %(notes)s);""" + #_datasetid integer, _repositoryid integer, _notes character varying DEFAULT NULL::character varying) + + + return None \ No newline at end of file diff --git a/neotomaUploader/insert_geopol.py b/neotomaUploader/insert_geopol.py new file mode 100644 index 0000000..9c2d14b --- /dev/null +++ b/neotomaUploader/insert_geopol.py @@ -0,0 +1,36 @@ +def insert_geopol(cur, yml_dict, csv_template, uploader): + """_Insert a site's geopolitical unit to Neotoma_ + + def insert_geopol(cur, yml_dict, csv_template) + + Args: + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma + Paleoecology Database._ + yml_dict (_dict_): _A `dict` returned by the YAML template._ + csv_template (_dict_): _The csv file with the required data to be uploaded._ + + Returns: + _int_: _The integer value of the newly created siteid from the Neotoma Database._ + """ + + if 'siteid' in uploader.keys(): + # First test if the site exists. 
+ isAdded = """SELECT * FROM ap.sitegadm WHERE siteid = %(siteid)s""" + cur.execute(isAdded, { 'siteid': uploader['siteid'] }) + result = cur.fetchone() + if result is None: + # Inserts site and fid in ap.sitegadm if it didn't exist + #INSERT INTO ap.sitegadm(siteid, fid) + assignGeoPol = """ + INSERT INTO ap.sitegadm(siteid, fid) + (SELECT st.siteid, ga.uid + FROM ap.gadm AS ga + JOIN ndb.sites AS st ON ST_Covers(ga.shape, st.geog) + WHERE st.siteid = %(siteid)s) + RETURNING fid; + """ + cur.execute(assignGeoPol, {'siteid': uploader['siteid']}) + result = cur.fetchone() + else: + result = None + return result \ No newline at end of file diff --git a/neotomaUploader/insert_sample.py b/neotomaUploader/insert_sample.py new file mode 100644 index 0000000..bb53190 --- /dev/null +++ b/neotomaUploader/insert_sample.py @@ -0,0 +1,56 @@ +import logging +from .pull_params import pull_params +import numpy as np +import datetime + +def insert_sample(cur, yml_dict, csv_template, uploader): + """ + insert samples + """ + sample_query = """ + SELECT ts.insertsample(_analysisunitid := %(analysisunitid)s, + _datasetid := %(datasetid)s, + _samplename := %(samplename)s, + _sampledate := %(sampledate)s, + _analysisdate := %(analysisdate)s, + _taxonid := %(taxonid)s, + _labnumber := %(labnumber)s, + _prepmethod := %(prepmethod)s, + _notes := %(notes)s) + """ + + #lab_number = yml_dict['lab_number'] + params = ['value'] + val_inputs = pull_params(params, yml_dict, csv_template, 'ndb.data') + + params2 = ['lab_number', 'sampledate', 'analysisdate', 'labnumber', 'prepmethod', 'notes', 'taxonname', 'samplename'] + inputs2 = pull_params(params2, yml_dict, csv_template, 'ndb.samples') + inputs2 = dict(map(lambda item: (item[0], None if all([i is None for i in item[1]]) else item[1]), + inputs2.items())) + + # There might be several loops so I might need a for loop here + samples = [] + + # Assert aunits and samples are same in length + for j, val in enumerate(uploader['anunits']): + get_taxonid = """SELECT * FROM ndb.taxa WHERE taxonname %% %(taxonname)s;""" + cur.execute(get_taxonid, {'taxonname': inputs2['taxonname']}) + taxonid = cur.fetchone() + if taxonid != None: + taxonid = int(taxonid[0]) + else: + taxonid = None + + cur.execute(sample_query, {'analysisunitid': int(uploader['anunits'][j]), + 'datasetid': int(uploader['datasetid']), + 'samplename': inputs2['samplename'], + 'sampledate': inputs2['sampledate'], # datetime.datetime.today().date(), + 'analysisdate': inputs2['analysisdate'], + 'taxonid': taxonid, + 'labnumber': inputs2['lab_number'], + 'prepmethod': inputs2['prepmethod'], + 'notes': inputs2['notes']}) + sampleid = cur.fetchone()[0] + samples.append(sampleid) + + return samples \ No newline at end of file diff --git a/neotomaUploader/insert_sample_age.py b/neotomaUploader/insert_sample_age.py new file mode 100644 index 0000000..c0fa237 --- /dev/null +++ b/neotomaUploader/insert_sample_age.py @@ -0,0 +1,39 @@ +import logging +from .pull_params import pull_params +import numpy as np + +def insert_sample_age(cur, yml_dict, csv_template, uploader): + """ + """ + + sample_age_query = """ + SELECT ts.insertsampleage(_sampleid := %(sampleid)s, + _chronologyid := %(chronologyid)s, + _age := %(age)s, + _ageyounger := %(ageyounger)s, + _ageolder := %(ageolder)s) + """ + + params = ['age'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.sampleages') + + inputs['age'] = [float(value) if value != 'NA' else np.nan for value in inputs['age']] + inputs['uncertainty'] = [float(value) if value != 'NA' 
else np.nan for value in inputs['uncertainty']] + results = [] + + for i, item in enumerate(uploader['samples']): + # Matching the different kinds of taxons + # Have to ask about this, why is it that there are multiple taxon in the same row + + index = i % len(inputs['age']) + + cur.execute(sample_age_query, {'sampleid': uploader['samples'][i], + 'chronologyid': uploader['chronology'], + 'age': inputs['age'][index], + 'ageyounger': inputs['age'][index]-inputs['uncertainty'][index], + 'ageolder': inputs['age'][index]+inputs['uncertainty'][index]}) + + result = cur.fetchone()[0] + results.append(result) + + return results \ No newline at end of file diff --git a/neotomaUploader/insert_sample_analyst.py b/neotomaUploader/insert_sample_analyst.py new file mode 100644 index 0000000..fbf7800 --- /dev/null +++ b/neotomaUploader/insert_sample_analyst.py @@ -0,0 +1,34 @@ +import logging +from .pull_params import pull_params + +def insert_sample_analyst(cur, yml_dict, csv_template, uploader): + """ + """ + + params = ['contactid'] + inputs = pull_params(params, yml_dict, csv_template, 'ndb.sampleanalysts') + get_contact = """SELECT * FROM ndb.contacts WHERE contactname %% %(contactname)s;""" + + contids = [] + baseid = 1 + for i in inputs['contactid']: + cur.execute(get_contact, {'contactname': i}) + contids.append({'contactname': i, 'id': cur.fetchone()[0], 'order': baseid}) + baseid = baseid + 1 + + result = [] + counter = 0 + for i in range(len(uploader['samples'])): + for contact in contids: + inserter = """ + SELECT ts.insertsampleanalyst(_sampleid := %(sampleid)s, + _contactid := %(contactid)s, + _analystorder := %(analystorder)s) + """ + cur.execute(inserter, {'sampleid': int(uploader['samples'][counter]), + 'contactid': int(contact['id']), + 'analystorder': int(contact['order'])}) + result.append(cur.fetchone()[0]) + counter += 1 + + return result \ No newline at end of file diff --git a/neotomaUploader/insert_site.py b/neotomaUploader/insert_site.py index 5fc4eb7..84c5b48 100644 --- a/neotomaUploader/insert_site.py +++ b/neotomaUploader/insert_site.py @@ -1,5 +1,5 @@ import logging -from .yaml_values import yaml_values +from .pull_params import pull_params def insert_site(cur, yml_dict, csv_template): """_Insert a site to Neotoma_ @@ -7,34 +7,65 @@ def insert_site(cur, yml_dict, csv_template): def insertSite(cur, yml_dict, csv_template) Args: - cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._ - sitename (_list_): _A list returned by the function cleanCol()_ - coords (_list_): _A list returned by the function cleanCol()_ + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma + Paleoecology Database._ + yml_dict (_dict_): _A `dict` returned by the YAML template._ + csv_template (_dict_): _The csv file with the required data to be uploaded._ Returns: _int_: _The integer value of the newly created siteid from the Neotoma Database._ """ + site_query = """ + SELECT ts.insertsite(_sitename := %(sitename)s, + _altitude := %(altitude)s, + _area := %(area)s, + _descript := %(description)s, + _notes := %(notes)s, + _east := %(ew)s, + _north := %(ns)s, + _west := %(ew)s, + _south := %(ns)s) + """ try: + # Here we're just checking to make sure that we do have a site coordinate + # and geometry. 
         assert all(element in [d.get('neotoma') for d in yml_dict.get('metadata')]
-                   for element in ['ndb.sites.sitename', 'ndb.sites.geom'])
+                   for element in ['ndb.sites.sitename', 'ndb.sites.geog'])
     except AssertionError:
         logging.error("The template must contain a sitename and coordinates.", exc_info=True)
+
+    params = ["sitename", "altitude", "area", "sitedescription", "notes", "geog"]
+    inputs = pull_params(params, yml_dict, csv_template, 'ndb.sites')
+    inputs = dict(map(lambda item: (item[0], None if all([i is None for i in item[1]]) else item[1]),
+                      inputs.items()))
+
+    if isinstance(inputs['sitename'], list):
+        if len(list(set(inputs['sitename']))) > 1:
+            logging.error("There should only be one site name.")
+        inputs['sitename'] = inputs['sitename'][0]
+    if inputs['altitude'] is not None:
+        inputs['altitude'] = inputs['altitude'][0]
+    if inputs['area'] is not None:
+        inputs['area'] = inputs['area'][0]
+    if inputs['sitedescription'] is not None:
+        inputs['description'] = inputs['sitedescription'][0]
+    else:
+        inputs['description'] = None
+    if inputs['notes'] is not None:
+        inputs['notes'] = inputs['notes'][0]
+
     try:
-        coord_dict = yaml_values(yml_dict, csv_template, 'ndb.sites.geom')
-        coords = [float(i) for i in coord_dict[0].get('values')[0].split(',')]
+        coords = inputs['geog']
         assert len(coords) == 2
-        assert coords[0] >= 0 and coords[0] <= 90
-        assert coords[1] <= 0 and coords[1] >= -180
-    except AssertionError:
-        logging.error("Coordinates are improperly formatted. They must be in the form 'LAT, LONG'.")
-    try:
-        sitenameDict = yaml_values(yml_dict, csv_template, 'ndb.sites.sitename')
-        assert len(sitenameDict) == 1
-        assert isinstance(sitenameDict[0].get('values')[0], str)
+        assert coords[0] >= -90 and coords[0] <= 90
+        assert coords[1] >= -180 and coords[1] <= 180
     except AssertionError:
-        logging.error("A single sitename value must be provided. Check your yaml template to be sure.")
+        logging.error("Coordinates are improperly formatted. They must be in the form 'LAT, LONG' [-90 -> 90] and [-180 -> 180].")
+    inputs['ew'] = coords[1]
+    inputs['ns'] = coords[0]
-    cur.execute("SELECT ts.insertsite(_sitename := %(sitename)s, _east := %(ew)s, _north := %(ns)s, _west := %(ew)s, _south := %(ns)s)",
-                {'sitename': sitenameDict[0].get('values')[0], 'ew': coords[1], 'ns': coords[0]})
+    cur.execute(site_query,
+                inputs)
+
     siteid = cur.fetchone()[0]
     return siteid
diff --git a/neotomaUploader/parseArguments.py b/neotomaUploader/parse_arguments.py
similarity index 69%
rename from neotomaUploader/parseArguments.py
rename to neotomaUploader/parse_arguments.py
index ef3be8d..00000b1 100644
--- a/neotomaUploader/parseArguments.py
+++ b/neotomaUploader/parse_arguments.py
@@ -1,6 +1,6 @@
 import argparse
 import os
-def parseArguments():
+def parse_arguments():
     """_Parse commandline arguments to the Uploader_
     Args:
@@ -27,14 +27,9 @@ def parseArguments():
     args = parser.parse_args()
     if not os.path.isdir(args.data):
-        raise FileNotFoundError("There is no directory named '" +
-                                args.data +
-                                "' within the current path. Please set the --data property.")
+        raise FileNotFoundError(f"There is no directory named '{args.data}' within the current path.")
     if not os.path.isfile(args.template):
-        raise FileNotFoundError("The file '" +
-                                args.template +
-                                "' could not be found within the current path."
+ - " Please set the --template property.") + raise FileNotFoundError(f"The file '{args.template}' could not be found within the current path.") return {'data': args.data, 'yml': args.template} \ No newline at end of file diff --git a/neotomaUploader/pull_params.py b/neotomaUploader/pull_params.py new file mode 100644 index 0000000..07a0d02 --- /dev/null +++ b/neotomaUploader/pull_params.py @@ -0,0 +1,79 @@ +import datetime +import re +from itertools import chain +from .retrieve_dict import retrieve_dict +from .clean_column import clean_column + +def pull_params(params, yml_dict, csv_template, table): + """_Pull parameters associated with an insert statement from the yml/csv tables._ + + Args: + params (_list_): _A list of strings for the columns needed to generate the insert statement._ + yml_dict (_dict_): _A `dict` returned by the YAML template._ + csv_template (_dict_): _The csv file with the required data to be uploaded._ + table (_string_): _The name of the table the parameters are being drawn for._ + + Returns: + _dict_: _cleaned and repeated valors for input into a Tilia insert function._ + """ + add_unit_inputs = {} + if re.match('.*\.$', table) == None: + table = table + '.' + add_units_inputs_list=[] + for i in params: + valor = retrieve_dict(yml_dict, table + i) + if len(valor) > 0: + for count, val in enumerate(valor): + new_dict = {} + clean_valor = clean_column(val.get('column'), + csv_template, + clean = not val.get('rowwise')) + if len(clean_valor) > 0: + match val.get('type'): + case "string": + clean_valor = list(map(str, clean_valor)) + case "date": + #clean_valor = list(map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(), chain(*clean_valor))) + clean_valor = list(map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(), clean_valor)) + case "int": + clean_valor = list(map(int, clean_valor[0])) + case "float": + clean_valor = list(map(float, clean_valor[0])) + case "coordinates (latlong)": + clean_valor = [float(num) for num in clean_valor[0].split(',')] + add_unit_inputs[i] = clean_valor + if 'unitcolumn' in val: + clean_valor2 = clean_column(val.get('unitcolumn'), + csv_template, + clean = not val.get('rowwise')) + add_unit_inputs['unitcolumn'] = clean_valor2 + + if 'uncertainty' in val.keys(): + clean_valor3 = clean_column(val['uncertainty']['uncertaintycolumn'], + csv_template, + clean = not val.get('rowwise')) + add_unit_inputs['uncertainty'] = clean_valor3 + + samples_dict = add_unit_inputs.copy() + samples_dict['name'] = val.get('column') + samples_dict['taxonid'] = val.get('taxonid') + samples_dict['taxonname'] = val.get('taxonname') + add_units_inputs_list.append(samples_dict) + + else: + add_unit_inputs[i] = [] + + maxlen = 0 + for i in params: + if len(add_unit_inputs.get(i)) > maxlen: + maxlen = len(add_unit_inputs.get(i)) + for i in params: + if len(add_unit_inputs.get(i)) == 0: + add_unit_inputs[i] = [None for j in range(maxlen)] + elif len(add_unit_inputs.get(i)) == 1: + add_unit_inputs[i] = [add_unit_inputs[i][0] for j in range(maxlen)] + + if params == ['value']: + return add_units_inputs_list + else: + return add_unit_inputs \ No newline at end of file diff --git a/neotomaUploader/retrieveDict.py b/neotomaUploader/retrieve_dict.py similarity index 70% rename from neotomaUploader/retrieveDict.py rename to neotomaUploader/retrieve_dict.py index 2c67f1f..a5cebd0 100644 --- a/neotomaUploader/retrieveDict.py +++ b/neotomaUploader/retrieve_dict.py @@ -1,12 +1,12 @@ import warnings import logging - -def retrieveDict(yml_dict, sqlColumn): 
+import re +def retrieve_dict(yml_dict, sql_column): """_Get Dictionary for a Neotoma column using the YAML template_ Args: yml_dict (_dict_): _The YAML template object imported by the user._ - sqlColumn (_str_): _A character string indicating the SQL column to be matched._ + sql_column (_str_): _A character string indicating the SQL column to be matched._ Returns: list_: _A list of all dictionaries associated with a particular Neotoma table/column_ @@ -18,8 +18,9 @@ def retrieveDict(yml_dict, sqlColumn): assert yml_dict.get('metadata') except AssertionError: logging.error("The yml_dict must be a dict object (not a list) containing the key 'metadata'.", exc_info=True) - result = [d for d in yml_dict['metadata'] if d['neotoma'] == sqlColumn] + #result = [d for d in yml_dict['metadata'] if d['neotoma'] == sqlColumn] + result= [d for d in yml_dict['metadata'] if re.search(sql_column, d['neotoma'])] if result is None: warnings.warn("No matching dictionary entry found.") else: - return result + return result \ No newline at end of file diff --git a/neotomaUploader/validAgent.py b/neotomaUploader/validAgent.py deleted file mode 100644 index d45dc21..0000000 --- a/neotomaUploader/validAgent.py +++ /dev/null @@ -1,43 +0,0 @@ -from .retrieveDict import retrieveDict -from .valid_column import valid_column, cleanColumn -#def validAgent(cur, agentname): -def validAgent(cur, df, yml_dict, str_contact): - """_Get user agent or contact from Neotoma_ - - Args: - cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._ - agentname (_string_): _A user name or individual._ - """ - response = { 'pass': False, 'name': None, 'message': [] } - namematch = [] - agentnameD = retrieveDict(yml_dict, str_contact) - agent_message = valid_column(df, agentnameD) - agentname = cleanColumn(df, agentnameD) - if len(agent_message) >0: - response['message'].append(agent_message) - - for name in agentname: - response['message'].append(f"*** PI: {name} ***") - nameQuery = """ - SELECT contactid, contactname - FROM ndb.contacts AS ct - WHERE to_tsvector(ct.contactname) @@ plainto_tsquery(%(name)s);""" - cur.execute(nameQuery, {'name': name}) - result = {'name': name, 'match': cur.fetchall() or []} - namematch.append(result) - matches = [] - for person in namematch: - if len(person['match']) ==0: - response['message'].append(f"✗ No approximate matches found for {person['name']}. Have they been added to Neotoma?") - matches.append(False) - elif any([person['name'] == i[1] for i in person['match']]): - response['message'].append(f"✔ Exact match found for {person['name']}.") - matches.append(True) - else: - response['message'].append(f"? 
No exact match found for {person['name']}, several potential matches follow:") - matches.append(False) - for i in person['match']: - response['message'].append(f" * {i[1]}") - if all(matches): - response['pass'] = True - return response \ No newline at end of file diff --git a/neotomaUploader/valid_agent.py b/neotomaUploader/valid_agent.py new file mode 100644 index 0000000..54ff832 --- /dev/null +++ b/neotomaUploader/valid_agent.py @@ -0,0 +1,51 @@ +from .retrieve_dict import retrieve_dict +from .valid_column import valid_column +from .yaml_values import yaml_values +import re +#def validAgent(cur, agentname): + +def valid_agent(cur, csv_template, yml_dict): + """_Get user agent or contact from Neotoma_ + + Args: + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._ + csv_template (_string_): _A user name or individual._ + yml_dict (_dict_): _The dictionary object passed by yml_to_dict._ + """ + response = { 'pass': False, 'name': None, 'message': [] } + + pattern = r'(contactid|contactname)' + agent_dict = yaml_values(yml_dict, csv_template, pattern) + + for element in agent_dict: + response['message'].append(f" === Checking Against Dataset {element['column']} ===") + agent_message = valid_column(element) + agentname = element['values'] + namematch = [] + if len(agent_message) > 0: + response['message'].append(agent_message) + for name in agentname: + response['message'].append(f" *** Named Individual: {name} ***") + nameQuery = """ + SELECT contactid, contactname + FROM ndb.contacts AS ct + WHERE to_tsvector(ct.contactname) @@ plainto_tsquery(%(name)s);""" + cur.execute(nameQuery, {'name': name}) + result = {'name': name, 'match': cur.fetchall() or []} + namematch.append(result) + matches = [] + for person in namematch: + if len(person['match']) ==0: + response['message'].append(f" ✗ No approximate matches found for {person['name']}. Have they been added to Neotoma?") + matches.append(False) + elif any([person['name'] == i[1] for i in person['match']]): + response['message'].append(f" ✔ Exact match found for {person['name']}.") + matches.append(True) + else: + response['message'].append(f" ? 
No exact match found for {person['name']}, several potential matches follow:") + matches.append(False) + for i in person['match']: + response['message'].append(f" * {i[1]}") + if all(matches): + response['pass'] = True + return response \ No newline at end of file diff --git a/neotomaUploader/valid_collectionunit.py b/neotomaUploader/valid_collectionunit.py index fb918c0..edd67cc 100644 --- a/neotomaUploader/valid_collectionunit.py +++ b/neotomaUploader/valid_collectionunit.py @@ -1,6 +1,4 @@ import itertools -from .retrieveDict import retrieveDict -from .valid_column import valid_column, cleanColumn from .yaml_values import yaml_values def valid_collectionunit(cur, yml_dict, csv_template): @@ -18,7 +16,7 @@ def valid_collectionunit(cur, yml_dict, csv_template): response = {'pass': False, 'message': []} - coords = yaml_values(yml_dict, csv_template, 'ndb.sites.geom') + coords = yaml_values(yml_dict, csv_template, 'ndb.collectionunits.geog') try: assert len(coords) == 1 except AssertionError: diff --git a/neotomaUploader/valid_column.py b/neotomaUploader/valid_column.py index e8f1394..c193ab9 100644 --- a/neotomaUploader/valid_column.py +++ b/neotomaUploader/valid_column.py @@ -1,26 +1,42 @@ -import pandas as pd +import datetime -def valid_column(yaml_vals): - response = {'message': []} - - if dict1['type']=='number': - if not pd.api.types.is_numeric_dtype(df[column_name]): - response['message'].append('✗ Site {column_name} is not properly formatted.') - if dict1['type']=='string': - if not pd.api.types.is_string_dtype(df[column_name]): - response['message'].append('✗ Site {column_name} is not properly formatted.') - if dict1['type']=='date': - if not pd.api.types.is_datetime64_any_dtype(df[column_name]): - response['message'].append('✗ Site {column_name} is not properly formatted.') - message = ' '.join(response['message']) +def is_valid_date(value): + try: + datetime.strptime(value, '%Y-%m-%d') + return True + except ValueError: + return False - return response['message'] +def is_numeric(value): + '''check if the values can be cast properly as numbers''' + try: + int(value) + return True + except ValueError: + try: + float(value) + return True + except ValueError: + return False -def cleanColumn(df, dict1): - column_name = dict1['column'] - if dict1['repeat']==True: - column_vals = df[column_name].tolist() - else: - column_vals = list(df[column_name].unique()) - return column_vals +def valid_column(pointer): + response = {'message': []} + allowed_types = { + 'string': str, + 'number': is_numeric, + 'date': is_valid_date + } + value_type = pointer.get('type') + values_list = pointer.get('values') + if callable(allowed_types[value_type]): + # If the type is a date check, call the function for each value + result = all(allowed_types[value_type](value) for value in values_list) + + else: + # If the type is a standard Python type, perform the isinstance check + result = all(isinstance(value, allowed_types[value_type]) for value in values_list) + if result is False: + response['message'].append(f'✗ {pointer["column"]} is not properly formatted.') + response['message'] = ''.join(response['message']) + return response['message'] diff --git a/neotomaUploader/valid_date.py b/neotomaUploader/valid_date.py index 7f2eca1..1ff15e4 100644 --- a/neotomaUploader/valid_date.py +++ b/neotomaUploader/valid_date.py @@ -1,7 +1,7 @@ -from .retrieveDict import retrieveDict -from .valid_column import cleanColumn +from .retrieve_dict import retrieve_dict from .yaml_values import yaml_values import datetime +import 
re def valid_date(yml_dict, csv_template): """_Check to see if the date format is valid for a given type_ @@ -14,15 +14,13 @@ def valid_date(yml_dict, csv_template): _dict_: _An object with a valid parameter and the re-formatted date (as a datetime object)._ """ response = {'pass': False, 'message': []} - date_cols = ['ndb.samples.analysisdate', 'ndb.collectionunits.colldate', 'ndb.datataxonnotes.date', - 'ndb.datasettaxonnotes.date', 'ndb.samples.sampledate', 'ndb.chronologies.dateprepared'] - dates = [j for j in [yaml_values(yml_dict, csv_template, i) for i in date_cols] if j != []] - date_format = '%Y-%m-%d' - for i in dates: + pattern = r'(date)' + dateD = yaml_values(yml_dict, csv_template, pattern) + for i in dateD: try: - date_set = [date for date in i[0].get('values')] - new_date = [datetime.datetime.strptime(j, date_format).date() for j in date_set] - response['message'].append(f"✔ Dates for {i[0].get('neotoma')} looks good!") + date_set = i.get('values') + new_date = [datetime.datetime.strptime(j, i['format']).date() for j in date_set] + response['message'].append(f"✔ Dates for {i.get('neotoma')} looks good!") except ValueError: response['message'].append(f"✗ Expected date format is {format}") return response \ No newline at end of file diff --git a/neotomaUploader/validHorizon.py b/neotomaUploader/valid_horizon.py similarity index 75% rename from neotomaUploader/validHorizon.py rename to neotomaUploader/valid_horizon.py index 99809e0..bfdc5bc 100644 --- a/neotomaUploader/validHorizon.py +++ b/neotomaUploader/valid_horizon.py @@ -1,6 +1,7 @@ -from .retrieveDict import retrieveDict -from .valid_column import valid_column, cleanColumn -def validHorizon(df, yml_dict, depth_str, horizon_str): +from .yaml_values import yaml_values +from .valid_column import valid_column + +def valid_horizon(yml_dict, csv_template): """_Is the dated horizon one of the accepted dates?_ Args: @@ -13,15 +14,18 @@ def validHorizon(df, yml_dict, depth_str, horizon_str): response = {'pass': False, 'index': [], 'message': []} - depthD = retrieveDict(yml_dict, depth_str) - depth_message = valid_column(df, depthD) - depths = cleanColumn(df, depthD) + + depthD = yaml_values(yml_dict, csv_template, 'ndb.analysisunits.depth') + depths = depthD[0]['values'] + depth_message = valid_column(depthD[0]) + if len(depth_message) >0: response['message'].append(depth_message) - horizonD = retrieveDict(yml_dict, horizon_str) - horizon_message = valid_column(df, horizonD) - horizon = cleanColumn(df, horizonD) + horizonD = yaml_values(yml_dict, csv_template, 'ndb.leadmodels.datinghorizon') + horizon = horizonD[0]['values'] + + horizon_message = valid_column(horizonD[0]) if len(horizon_message) >0: response['message'].append(horizon_message) diff --git a/neotomaUploader/valid_site.py b/neotomaUploader/valid_site.py index 1accb7c..88c9e85 100644 --- a/neotomaUploader/valid_site.py +++ b/neotomaUploader/valid_site.py @@ -1,5 +1,3 @@ -from .retrieveDict import retrieveDict -from .valid_column import valid_column, cleanColumn from .yaml_values import yaml_values def valid_site(cur, yml_dict, csv_template): @@ -28,7 +26,7 @@ def valid_site(cur, yml_dict, csv_template): 'message': []} ## Retrieve the fields needed from the yml. 
- coords = yaml_values(yml_dict, csv_template, 'ndb.sites.geom') + coords = yaml_values(yml_dict, csv_template, 'ndb.sites.geog') try: assert len(coords) == 1 except AssertionError: @@ -96,8 +94,8 @@ def valid_site(cur, yml_dict, csv_template): for i in response['sitelist']: response['message'].append(f" * siteid: {i['id']}; sitename: {i['name']:<25}; distance (m): {i['distance (m)']:<7} coords: [{i['coordla']}, {i['coordlo']}]") else: - valid = True - sitelist = [{'id': None, 'name': None, 'coordlo': None, 'coordla': None, 'distance (m)': None}] + response['valid'] = True + response['sitelist'] = [{'id': None, 'name': None, 'coordlo': None, 'coordla': None, 'distance (m)': None}] response['matched'] = {'namematch': False, 'distmatch': False} response['message'].append('✔ There are no sites close to the proposed site.') return response \ No newline at end of file diff --git a/neotomaUploader/valid_taxa.py b/neotomaUploader/valid_taxa.py new file mode 100644 index 0000000..462086f --- /dev/null +++ b/neotomaUploader/valid_taxa.py @@ -0,0 +1,50 @@ +from .valid_column import valid_column +from .yaml_values import yaml_values + +def valid_taxa(cur, csv_template, yml_dict): + """_Get taxa content from Neotoma_ + + Args: + cur (_psycopg2.extensions.cursor_): _A cursor pointing to the Neotoma Paleoecology Database._ + csv_template (_string_): _A taxa name._ + yml_dict (_dict_): _The dictionary object passed by yml_to_dict._ + """ + + response = { 'pass': False, 'name': None, 'message': [] } + + pattern = r'(values)' + taxa_dict = yaml_values(yml_dict, csv_template, pattern) + + for element in taxa_dict: + response['message'].append(f" === Checking Against Taxa {element['column']} ===") + taxa_message = valid_column(element) + taxonname = element['taxonname'] + taxamatch = [] + if len(taxa_message) > 0: + response['message'].append(taxa_message) + + response['message'].append(f" *** Named Taxa: {taxonname} ***") + nameQuery = """ + SELECT taxonid, taxonname + FROM ndb.taxa AS tx + WHERE to_tsvector(tx.taxonname) @@ plainto_tsquery(%(taxonname)s);""" + cur.execute(nameQuery, {'taxonname': taxonname}) + result = {'name': taxonname, 'match': cur.fetchall() or []} + taxamatch.append(result) + + matches = [] + for taxon in taxamatch: + if len(taxon['match']) ==0: + response['message'].append(f" ✗ No approximate matches found for {taxon['name']}. Have they been added to Neotoma?") + matches.append(False) + elif any([taxon['name'] == i[1] for i in taxon['match']]): + response['message'].append(f" ✔ Exact match found for {taxon['name']}.") + matches.append(True) + else: + response['message'].append(f" ? 
No exact match found for {taxon['name']}, several potential matches follow:") + matches.append(False) + for i in taxon['match']: + response['message'].append(f" * {i[1]}") + if all(matches): + response['pass'] = True + return response \ No newline at end of file diff --git a/neotomaUploader/yaml_values.py b/neotomaUploader/yaml_values.py index 8088b96..ccbb3f1 100644 --- a/neotomaUploader/yaml_values.py +++ b/neotomaUploader/yaml_values.py @@ -1,5 +1,5 @@ -from .retrieveDict import retrieveDict -from .cleanCol import cleanCol +from .retrieve_dict import retrieve_dict +from .clean_column import clean_column def yaml_values(yml_dict, csv_template, column): """_Extract values from CSV file conforming to the YAML dictionary entry_ @@ -12,9 +12,9 @@ def yaml_values(yml_dict, csv_template, column): Returns: _list_: _A list of the same structure as individual elements within yml_dict, with a 'values' field appended._ """ - pointer = retrieveDict(yml_dict, column) + pointer = retrieve_dict(yml_dict, column) def add_val (x): - x['values'] = cleanCol(x.get('column'), + x['values'] = clean_column(x.get('column'), csv_template, clean = not x.get('repeat')) return x diff --git a/requirements.txt b/requirements.txt index ca86158..e940780 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pytz==2023.3 PyYAML==6.0 six==1.16.0 tzdata==2023.3 +openpyxl \ No newline at end of file diff --git a/template.yml b/template.yml index 7a74ce2..6e7d8ef 100644 --- a/template.yml +++ b/template.yml @@ -1,448 +1,541 @@ -apiVersion: neotoma v2.0 # This is simply a statement of the API version. +--- +apiVersion: neotoma v2.0 +headers: 2 kind: Development -headers: 2 # How many header rows will there be for this csv file (before data starts) +databaseid: 37 +lab_number: 5 metadata: - column: Site.name + link: https://open.neotomadb.org/dbschema/tables/sites.html neotoma: ndb.sites.sitename - required: True + notes: null + required: true + rowwise: false type: string - link: https://open.neotomadb.org/dbschema/tables/sites.html - vocab: - repeat: False - notes: + vocab: null - column: Core.number.or.code - neotoma: ndb.collectionunits.handle - required: True - type: string/numeric link: https://open.neotomadb.org/dbschema/tables/collectionunits.html - vocab: - repeat: False - notes: + neotoma: ndb.collectionunits.handle + notes: null + required: true + rowwise: false + type: string/number + vocab: null - column: Publications - neotoma: ndb.publications.citation - required: True + link: null + neotoma: ndb.publications + notes: Would it be easier to get a number of DOIs + required: true + rowwise: false type: string - link: - vocab: - repeat: - notes: Would it be easier to get a number of DOIs? + vocab: null - column: Location neotoma: ndb.geopoliticalunits - required: True - type: string - vocab: - repeat: False notes: We can extract this from the coordinates. 
- - column: Geographic.coordinates - neotoma: ndb.sites.geom - required: True + required: true + rowwise: false type: string - vocab: - repeat: False + vocab: null + - column: Geographic.coordinates + neotoma: ndb.sites.geog + notes: Spatial (lat/long pair or geoJSON) + required: true + rowwise: false + type: coordinates (latlong) + vocab: null + - column: Geographic.coordinates + neotoma: ndb.collectionunits.geog notes: Spatial (lat/long pair or geoJSON) + required: true + rowwise: false + type: coordinates (latlong) + vocab: null - column: Coordinate.precision neotoma: ndb.collectionunits.location - required: False + notes: null + required: false + rowwise: false type: string vocab: ['core-site','GPS','core-site approximate','lake center'] - repeat: True - notes: - column: Site.coordinates - neotoma: ndb.collectionunits.geom - required: True + neotoma: '???' + notes: null + required: true + rowwise: false type: string - vocab: - repeat: False - notes: + vocab: null - column: Depositional.Env neotoma: '???' - required: False + notes: null + required: false + rowwise: false type: string - vocab: - repeat: - notes: + vocab: null - column: Date.of.core.collection - neotoma: ndb.collectionunits.colldate - required: True - type: date format: '%Y-%m-%d' - vocab: - repeat: True + neotoma: ndb.collectionunits.colldate notes: Full-date notation as defined by RFC 3339, section 5.6, for example, 2017-07-21 + required: true + rowwise: false + type: date + vocab: null - column: Principal.Investigator.s. neotoma: ndb.contacts.contactname - required: True + notes: null + required: true + rowwise: false type: string - vocab: - repeat: False - notes: + vocab: null - column: Analyst neotoma: ndb.sampleanalysts.contactid - required: True + notes: null + required: true + rowwise: false type: string - vocab: - repeat: False - notes: + vocab: null - column: Modeler neotoma: ndb.chronologies.contactid - required: True + notes: null + required: true + rowwise: false type: string - vocab: - repeat: - notes: + vocab: null - column: X210.LeadModel neotoma: ndb.chronologies.agemodel - required: False + notes: null + required: false + rowwise: false type: string vocab: ['CRS', 'CIC', 'CF:CS', 'PLUM', 'other'] - repeat: True - notes: - column: X210.Lead.Model.Notes neotoma: ndb.chronologies.notes - required: False + notes: null + required: false + rowwise: false type: string - vocab: - repeat: - notes: + vocab: null - column: Method.for.estimating.supported.210Pb neotoma: ndb.leadmodels.basis - required: False + notes: null + required: false + rowwise: false type: string vocab: ['asymptote of alpha', 'gamma point-subtraction', 'gamma average'] - repeat: True - notes: - column: X210Pb.dating.horizon neotoma: ndb.leadmodels.datinghorizon - required: True + notes: null + required: true + rowwise: false type: number - vocab: - repeat: - notes: + vocab: null - column: Cumulative.210Pb.Inventory neotoma: ndb.leadmodels.cumulativeinventory - required: False - type: - vocab: - repeat: - notes: + notes: null + required: false + rowwise: null + type: null + vocab: null - column: Depth neotoma: ndb.analysisunits.depth - required: True + notes: null + required: true + rowwise: true type: number - vocab: - repeat: True - notes: + vocab: null - column: Thickness neotoma: ndb.analysisunits.thickness - required: False + notes: null + required: false + rowwise: true type: number - vocab: - repeat: True - notes: + vocab: null - column: Depth.position neotoma: '???' 
- required: False + notes: null + required: false + rowwise: true type: string vocab: ['Top', 'Mid', 'Bottom'] - repeat: True - notes: - column: Dry.Density neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: 5782 + taxonname: bulk density type: number - vocab: - repeat: True - notes: + unitcolumn: Dry.Density.Units + vocab: null - column: Dry.Density.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['g/cm3'] - repeat: True - notes: - column: Cumulative.dry.mass neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: cumulative dry mass type: number - vocab: - repeat: - notes: + vocab: null - column: Cumulative.dry.mass.units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['g/cm2'] - repeat: True - notes: - column: Total.210Pb.Alpha..synonym.Total.210Po. neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..total.210Pb.alpha. + unitcolumn: Error..total.210Pb.alpha..units + unitcolumn: Total.210Pb.Alpha..synonym.Total.210Po..Units + vocab: null - column: Total.210Pb.Alpha..synonym.Total.210Po..Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: - notes: - column: Error..total.210Pb.alpha. - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + vocab: null - column: Error..total.210Pb.alpha..units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: - notes: - column: Total.210Pb.Gamma neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..total.210Pb.Gamma. + unitcolumn: Error..total.210Pb.Gamma..Units + unitcolumn: Total.210Pb.Gamma.Units + vocab: null - column: Total.210Pb.Gamma.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error..total.210Pb.Gamma. - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb type: number - vocab: - repeat: - notes: + vocab: null - column: Error..total.210Pb.Gamma..Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: X214Pb neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 214Pb type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..214Pb. 
+ unitcolumn: Error..214Pb..Units + unitcolumn: X214Pb.Units + vocab: null - column: X214Pb.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error..214Pb. - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 214Pb type: number - vocab: - repeat: - notes: + vocab: null - column: Error..214Pb..Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: X214Bi neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 214Bi type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error.214Bi + unitcolumn: Error..214Bi..Units + unitcolumn: X214Bi.Units + vocab: null - column: X214Bi.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error.214Bi - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 214Bi type: number - vocab: - repeat: - notes: + vocab: null - column: Error..214Bi..Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: X137Cs neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 137Cs type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..137Cs. + unitcolumn: Error..137Cs..Units + unitcolumn: X137Cs.Units + vocab: null - column: X137Cs.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error..137Cs. - neotoma: - field: ndb.variableunits.variableunits - taxonname: 137Cs - linkedcolumn: Error..137Cs. - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 137Cs type: number - vocab: - repeat: - notes: - - column: Error..137Cs..Units - neotoma: - field: ndb.variableunits.variableunits - taxonname: 137Cs - linkedcolumn: Error..137Cs. - required: False + vocab: null + - column: Error..137Cs..Units + neotoma: ndb.variableunits.variableunits + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Assigned.137Cs.Date - neotoma: ndb.data.value - required: False + neotoma: ndb.geochroncontrols.age + notes: null + required: false + rowwise: true + taxonid: '???' 
+ taxonname: 137Cs date type: number - vocab: - repeat: - notes: + unitcolumn: Assigned.137Cs.Date.Units + vocab: null - column: Assigned.137Cs.Date.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['CE/BCE', 'cal yr BP', 'Cal yr BP'] - repeat: True - notes: - column: Supported.210Pb neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..Supported.210Pb..1SD + unitcolumn: Error..Supported.210Pb..1SD.Units + unitcolumn: Supported.210Pb.Units + vocab: null - column: Supported.210Pb.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error..Supported.210Pb..1SD - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + vocab: null - column: Error..Supported.210Pb..1SD.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Unsupported.210Pb neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..Unsupported.210Pb..1SD + unitcolumn: Error..Unsupported.210Pb..1SD.Units + unitcolumn: Unsupported.210Pb.Units + vocab: null - column: Unsupported.210Pb.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: Error..Unsupported.210Pb..1SD - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb/210Po type: number - vocab: - repeat: - notes: + vocab: null - column: Error..Unsupported.210Pb..1SD.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['pCi/g', 'Bq/g', 'Bq/kg', 'dpm/g'] - repeat: True - notes: - column: X210Pb.Date neotoma: ndb.sampleages.age - required: False - type: string - vocab: - repeat: - notes: + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb date + type: string + uncertainty: + uncertaintybasis: Error..210Pb.Date..1SD.Units + uncertaintycolumn: Error..210Pb.Date. + unitcolumn: null + unitcolumn: X210Pb.Date.Units + vocab: null - column: X210Pb.Date.Units - neotoma: ndb.agetypes.agetype - required: False + neotoma: ndb.variableunits.variableunits + notes: null + required: false + rowwise: true type: string vocab: ['CE/BCE', 'cal yr BP', 'Cal yr BP'] - repeat: - notes: - column: Error..210Pb.Date. - neotoma: ndb.sampleages.ageerror - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: 210Pb date type: number - vocab: - repeat: - notes: This is the error, not the ageyounger/ageolder that we use in the model. 
+ vocab: null - column: Error..210Pb.Date..1SD.Units - neotoma: ndb.agetypes.agetype - required: False + neotoma: ndb.variableunits.variableunits + notes: null + required: false + rowwise: true type: string vocab: ['CE/BCE', 'cal yr BP', 'Cal yr BP'] - repeat: True - notes: - column: DMAR neotoma: ndb.data.value - required: False + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: dry mass accumulation rate type: number - vocab: - repeat: - notes: + uncertainty: + uncertaintybasis: null + uncertaintycolumn: Error..DMAR. + unitcolumn: Error..DMAR..Units` + unitcolumn: DMAR.Units + vocab: null - column: DMAR.Units neotoma: ndb.variableunits.variableunits - required: False + notes: null + required: false + rowwise: true type: string vocab: ['g/cm2/yr','g/m2/yr','kg/m2/yr'] - repeat: - notes: - column: Error..DMAR. - neotoma: ndb.data.value - required: False + neotoma: ndb.values + notes: null + required: false + rowwise: true + taxonid: '???' + taxonname: dry mass accumulation rate type: number - vocab: - repeat: - notes: + vocab: null - column: Error..DMAR..Units neotoma: ndb.variableunits.variableunits - required: False - type: - vocab: ['g/cm2/yr','g/m2/yr','kg/m2/yr'] - repeat: - notes: \ No newline at end of file + notes: null + required: false + rowwise: true + type: null + vocab: ['g/cm2/yr','g/m2/yr','kg/m2/yr'] \ No newline at end of file diff --git a/template_upload.py b/template_upload.py index 792262f..24b92a2 100644 --- a/template_upload.py +++ b/template_upload.py @@ -1,134 +1,237 @@ import json +import os import psycopg2 +import glob from dotenv import load_dotenv import neotomaUploader as nu -import os load_dotenv() -data = json.loads(os.getenv('PGDB_HOLDING')) +data = json.loads(os.getenv('PGDB_LOCAL')) conn = psycopg2.connect(**data, connect_timeout = 5) cur = conn.cursor() -args = nu.parseArguments() - -if args.get('data') == 1: - filename = 'data/Speckled Trout 2006 GRPO.csv' -else: - filename = 'data/Speckled Trout 2006 GRPO.csv' - -logfile = [] -hashcheck = nu.hashFile(filename) -filecheck = nu.checkFile(filename) - -if hashcheck['pass'] is False and filecheck['pass'] is False: - csv_template = nu.read_csv(filename) - logfile.append("File must be properly validated before it can be uploaded.") -else: - csv_template = nu.read_csv(filename) - # This possibly needs to be fixed. How do we know that there is one or more header rows? 
- -uploader = {} - -yml_dict = nu.ymlToDict(yml_file=args['yml']) -yml_data = yml_dict['metadata'] - -# Verify that the CSV columns and the YML keys match -csvValid = nu.csvValidator(filename = filename, - yml_data = yml_data) - -# Cleaning fields to unique values: -geog = nu.cleanCol('Location', csv_template) -piname = nu.cleanCol('Principal.Investigator.s.', csv_template) -analystname = nu.cleanCol('Analyst', csv_template) -modelername = nu.cleanCol('Modeler', csv_template) -pubname = nu.cleanCol('Publications', csv_template) -collunits = nu.cleanCol('Core.number.or.code', csv_template) -colldate = nu.cleanCol('Date.of.core.collection', csv_template) -location = nu.cleanCol('Coordinate.precision', csv_template) -depths = nu.cleanCol('Depth', csv_template, False) -thicks = nu.cleanCol('Thickness', csv_template, False) -dateunits = nu.cleanCol('X210Pb.Date.Units', csv_template) -ages = nu.cleanCol('X210Pb.Date', csv_template, False) -ageerror = nu.cleanCol('Error..210Pb.Date.', csv_template, False) -agemodel = nu.cleanCol('X210.LeadModel', csv_template) -chronnotes = nu.cleanCol('X210.Lead.Model.Notes', csv_template) -datasetname = nu.cleanCol('Core.number.or.code', csv_template) - -dthick = [] -# We need to arrange the depths, thicknesses and ages. -for i, value in enumerate(depths): - dthick.append({'depth': value, - 'thickness': thicks[i], - 'age': ages[i], - 'error': ageerror[i]}) - -logfile.append('=== Inserting new Site ===') -uploader['siteid'] = nu.insert_site(cur = cur, - yml_dict = yml_dict, - csv_template = csv_template) - -logfile.append('siteid: %s' % uploader['siteid']) - -# logfile.append('=== Inserting Site Geopol ===') -# # uploader['geopolid'] = nu.insertGeoPol(cur = cur, uploader = uploader) -# # logfile.append('Geopolitical Unit: %s' % uploader['geopolid']) - -logfile.append('=== Inserting Collection Units ===') -uploader['collunitid'] = nu.insertCollUnit(cur = cur, - yml_dict = yml_dict, - csv_template = csv_template, - uploader = uploader) - -logfile.append('collunitid: %s' % uploader['collunitid']) - -logfile.append('=== Inserting Analysis Units ===') -uploader['anunits'] = nu.insertAnalysisUnit(cur = cur, +args = nu.parse_arguments() + +filenames = glob.glob(args['data'] + "*.csv") +upload_logs = 'upload_logs' +if not os.path.exists(upload_logs): + os.makedirs(upload_logs) + +corrupted_files = "data/corrupted_files" + +for filename in filenames: + test_dict = {} + print(filename) + logfile = [] + hashcheck = nu.hash_file(filename) + filecheck = nu.check_file(filename) + + if hashcheck['pass'] is False and filecheck['pass'] is False: + csv_template = nu.read_csv(filename) + logfile.append("File must be properly validated before it can be uploaded.") + else: + csv_template = nu.read_csv(filename) + # This possibly needs to be fixed. How do we know that there is one or more header rows? 
+ + uploader = {} + + yml_dict = nu.yml_to_dict(yml_file=args['yml']) + yml_data = yml_dict['metadata'] + + # Verify that the CSV columns and the YML keys match + csv_valid = nu.csv_validator(filename = filename, + yml_data = yml_data) + try: + logfile.append('=== Inserting new Site ===') + uploader['siteid'] = nu.insert_site(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template) + logfile.append(f"siteid: {uploader['siteid']}") + test_dict['site'] = True + except Exception as e: + test_dict['site'] = False + logfile.append(f"Site Error: {e}") + + # logfile.append('=== Inserting Site Geopol ===') + # uploader['geopolid'] = nu.insert_geopol(cur = cur, + # yml_dict = yml_dict, + # csv_template = csv_template, + # uploader = uploader) + # logfile.append(f"Geopolitical Unit: {uploader['geopolid']}") + + try: + logfile.append('=== Inserting Collection Units ===') + uploader['collunitid'] = nu.insert_collunit(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"collunitid: {uploader['collunitid']}") + test_dict['collunit'] = True + except Exception as e: + test_dict['collunit'] = False + logfile.append(f"Collection Unit Error: {e}") + + try: + logfile.append('=== Inserting Analysis Units ===') + uploader['anunits'] = nu.insert_analysisunit(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"anunits: {uploader['anunits']}") + test_dict['anunits'] = True + except Exception as e: + test_dict['anunits'] = False + logfile.append(f"Analysis Units Error: {e}") + + try: + logfile.append('=== Inserting Chronology ===') + uploader['chronology'] = nu.insert_chronology(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"chronology: {uploader['chronology']}") + test_dict['chronology'] = True + except Exception as e: + test_dict['chronology'] = False + logfile.append(f"Chronology Error: {e}") + + try: + logfile.append('=== Inserting Chroncontrol ===') + uploader['chroncontrol'] = nu.insert_chron_control(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"chroncontrol: {uploader['chroncontrol']}") + test_dict['chroncontrol'] = True + except Exception as e: + test_dict['chroncontrol'] = False + logfile.append(f"Chroncontrols Error: {e}") + + + try: + logfile.append('=== Inserting Dataset ===') + uploader['datasetid'] = nu.insert_dataset(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"datasetid: {uploader['datasetid']}") + test_dict['dataset'] = True + except Exception as e: + test_dict['dataset'] = False + logfile.append(f"Dataset Error: {e}") + + try: + logfile.append('=== Inserting Dataset PI ===') + uploader['datasetpi'] = nu.insert_dataset_pi(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"datasetPI: {uploader['datasetpi']}") + test_dict['datasetpi'] = True + except Exception as e: + test_dict['datasetpi'] = False + logfile.append(f"Dataset PI Error: {e}") + + + try: + logfile.append('=== Inserting Data Processor ===') + uploader['processor'] = nu.insert_data_processor(cur = cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"dataset Processor: {uploader['processor']}") + test_dict['processor'] = True + except Exception as e: + test_dict['processor'] = False + logfile.append(f"Processor Error: {e}") + + + # Not sure 
where to get this information from + # logfile.append('=== Inserting Repository ===') + # uploader['repository'] = nu.insert_dataset_repository(cur = cur, + # yml_dict = yml_dict, + # csv_template = csv_template, + # uploader = uploader) + # logfile.append(f"dataset Processor: {uploader['repository']}") + + try: + logfile.append('=== Inserting Dataset Database ===') + uploader['database'] = nu.insert_dataset_database(cur = cur, + yml_dict = yml_dict, + uploader = uploader) + logfile.append(f"Dataset Database: {uploader['database']}") + test_dict['database'] = True + except Exception as e: + test_dict['database'] = False + logfile.append(f"Database Error: {e}") + + try: + logfile.append('=== Inserting Samples ===') + uploader['samples'] = nu.insert_sample(cur, yml_dict = yml_dict, csv_template = csv_template, uploader = uploader) - -# logfile.append('=== Inserting Chronology ===') -# uploader['chronology'] = nu.insertChronology(cur = cur, -# yml_dict = yml_dict, -# csv_template = csv_template, -# uploader = uploader) - -# #(cur = cur, -# collunitid = uploader['collunitid'], -# agetype = agetype[1], -# agemodel = agemodel[0], -# ages = ages, -# contactname = modelername, -# default = True, -# chronologyname = 'Default 210Pb') - -# logfile.append('=== Inserting Chroncontrol ===') -# uploader['chroncontrol'] = nu.insertChroncontrol(cur = cur, -# collunitid = uploader['collunitid'], -# agetype = agetype[1], -# agemodel = agemodel[0], -# ages = ages, -# contactname = modelername, -# default = True, -# chronologyname = 'Default 210Pb') - -# uploader['datasetid'] = nu.insertDataset(cur, uploader['collunitid'], datasetname) - -# uploader['datasetpi'] = nu.insertDatasetPI(cur, uploader['datasetid'], piname[i], i + 1) - -# uploader['processor'] = nu.insertDatasetProcessor(cur, uploader['datasetid']) - -# uploader['repository'] = nu.insertDatasetRepository(cur, uploader['datasetid']) - -# nu.insertDatasetDatabase(cur, uploader['datasetid'], "") -# nu.insertSamples(cur, ts.insertsample -# ts.insertsampleanalyst -# ts.insertsampleage -# ts.insertdata - -# conn.commit() -print(logfile) -conn.rollback() \ No newline at end of file + logfile.append(f"Dataset Samples: {uploader['samples']}") + test_dict['samples'] = True + except Exception as e: + test_dict['samples'] = False + logfile.append(f"Samples Error: {e}") + + try: + logfile.append('=== Inserting Sample Analyst ===') + uploader['sampleAnalyst'] = nu.insert_sample_analyst(cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"Sample Analyst: {uploader['sampleAnalyst']}") + test_dict['sampleAnalyst'] = True + except Exception as e: + test_dict['sampleAnalyst'] = False + logfile.append(f"Sample Analysts Error: {e}") + + try: + logfile.append('=== Inserting Sample Age ===') + uploader['sampleAge'] = nu.insert_sample_age(cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"Sample Age: {uploader['sampleAge']}") + test_dict['sampleAge'] = True + except Exception as e: + test_dict['sampleAge'] = False + logfile.append(f"Sample Age Error: {e}") + + try: + logfile.append('=== Inserting Data ===') + uploader['data'] = nu.insert_data(cur, + yml_dict = yml_dict, + csv_template = csv_template, + uploader = uploader) + logfile.append(f"Data: {uploader['data']}") + test_dict['data'] = True + except Exception as e: + test_dict['data'] = False + logfile.append(f"Data Error: {e}") + + with open(filename + '.upload.log', 'w', encoding = "utf-8") as writer: + for i in logfile: 
+ writer.write(i) + writer.write('\n') + + all_true = all(value for value in test_dict.values()) + + if all_true: + print(f"{filename} was uploaded.") + conn.commit() + #conn.rollback() + else: + if not os.path.exists(corrupted_files): + os.makedirs(corrupted_files) + corrupted_path = os.path.join(corrupted_files, os.path.basename(filename)) + os.replace(filename, corrupted_path) + print(f"filename {filename} could not be uploaded.\nMoved {filename} to the 'corrupted_files' folder.") + + conn.rollback() \ No newline at end of file diff --git a/template_validate.py b/template_validate.py index fd95699..fba4e39 100644 --- a/template_validate.py +++ b/template_validate.py @@ -14,11 +14,13 @@ import neotomaUploader as nu # Obtain arguments and parse them to handle command line arguments -args = nu.parseArguments() +args = nu.parse_arguments() load_dotenv() -data = json.loads(os.getenv('PGDB_HOLDING')) +#data = json.loads(os.getenv('PGDB_HOLDING')) +data = json.loads(os.getenv('PGDB_LOCAL')) + conn = psycopg2.connect(**data, connect_timeout = 5) cur = conn.cursor() @@ -29,22 +31,22 @@ print(filename) logfile = [] - hashcheck = nu.hashFile(filename) - filecheck = nu.checkFile(filename) + hashcheck = nu.hash_file(filename) + filecheck = nu.check_file(filename) logfile = logfile + hashcheck['message'] + filecheck['message'] if hashcheck['pass'] and filecheck['pass']: print(" - File is correct and hasn't changed since last validation.") else: # Load the yml template as a dictionary - yml_dict = nu.ymlToDict(yml_file=args['yml']) + yml_dict = nu.yml_to_dict(yml_file=args['yml']) yml_data = yml_dict['metadata'] # Obtain the unitcols and units to be used vocab_ = nu.vocabDict(yml_data) # Verify that the CSV columns and the YML keys match - csvValid = nu.csvValidator(filename = filename, + csvValid = nu.csv_validator(filename = filename, yml_data = yml_data) # Log if the file is valid logfile = logfile + csvValid @@ -59,14 +61,12 @@ logfile.append('=== Checking Template Unit Definitions ===') testset['units'] = unittest['pass'] logfile = logfile + unittest['message'] - - ########### Testing site coordinates: - #sitename + ########## Testing site coordinates: + # sitename logfile.append('=== Checking Against Current Sites ===') - # removed hemisphere = ["NW"], added a note on which hemisphere the site would be. 
sitecheck = nu.valid_site(cur = cur, - yml_dict = yml_dict, - csv_template = csv_template) + yml_dict = yml_dict, + csv_template = csv_template) testset['sites'] = sitecheck['pass'] logfile = logfile + sitecheck['message'] @@ -75,60 +75,48 @@ logfile.append('=== Checking All Date Formats ===') # format is retrieved in validDate via the yml dateCheck = nu.valid_date(yml_dict, - csv_template) + csv_template) logfile = logfile + dateCheck['message'] testset['date'] = dateCheck['pass'] ########### Collection Units logfile.append('=== Checking Against Collection Units ===') nameCheck = nu.valid_collectionunit(cur, - yml_dict, - csv_template) + yml_dict, + csv_template) logfile = logfile + nameCheck['message'] testset['colunits'] = nameCheck['pass'] - + ########### Geopolitical unit: - logfile.append('=== Checking Against Geopolitical Units ===') + #logfile.append('=== Checking Against Geopolitical Units ===') # Commenting for now so that I can run the script # namecheck = nu.validGeoPol(cur, geog, coords) #logfile = logfile + namecheck['message'] #testset['geopol'] = namecheck['pass'] ########### PI names: - logfile.append('=== Checking Against Dataset PI Name ===') - namecheck = nu.validAgent(cur, - df, - yml_dict, - 'ndb.contacts.contactname') - logfile = logfile + namecheck['message'] - - ########### Age Modeller Name - logfile.append('=== Checking Against Age Modeller Name(s) ===') - namecheck = nu.validAgent(cur, - df, - yml_dict, - 'ndb.chronologies.contactid') - logfile = logfile + namecheck['message'] - - ########### Analyst Name - logfile.append('=== Checking Against Analyst Name(s) ===') - namecheck = nu.validAgent(cur, - df, - yml_dict, - 'ndb.sampleanalysts.contactid') + logfile.append('=== Checking Against Contact Names ===') + namecheck = nu.valid_agent(cur, + csv_template, + yml_dict) logfile = logfile + namecheck['message'] ########### Make sure the dating horizon is in the analysis units: logfile.append('=== Checking the Dating Horizon is Valid ===') - horizoncheck = nu.validHorizon(df, - yml_dict, - 'ndb.analysisunits.depth', - 'ndb.leadmodels.datinghorizon') + horizoncheck = nu.valid_horizon(yml_dict, + csv_template) testset['datinghorizon'] = horizoncheck['pass'] logfile = logfile + horizoncheck['message'] + ########### Taxa names: + logfile.append('=== Checking Against Taxa Names ===') + namecheck = nu.valid_taxa(cur, + csv_template, + yml_dict) + logfile = logfile + namecheck['message'] + ########### Write to log. 
with open(filename + '.log', 'w', encoding = "utf-8") as writer: for i in logfile: writer.write(i) - writer.write('\n') + writer.write('\n') \ No newline at end of file
diff --git a/uncertainty.pdf b/uncertainty.pdf new file mode 100644 index 0000000..c08ee14 Binary files /dev/null and b/uncertainty.pdf differ
diff --git a/uncertainty.svg b/uncertainty.svg new file mode 100644 index 0000000..dae8dc9 --- /dev/null +++ b/uncertainty.svg @@ -0,0 +1,415 @@
+ [Figure: uncertainty.svg — schematic of the relationships among ndb.analysisunits, ndb.samples, ndb.data, ndb.variables, ndb.variableunits, ndb.datauncertainty and ndb.uncertaintybasis]
diff --git a/uncertaintyadditions.md b/uncertaintyadditions.md new file mode 100644 index 0000000..784a426 --- /dev/null +++ b/uncertaintyadditions.md @@ -0,0 +1,76 @@
+---
+title: "A New Neotoma Uncertainty Model"
+format: pdf
+---
+
+# Adding Uncertainty to Neotoma
+
+The use of uncertainty for measured values is critical. We need it directly associated with individual measurements, and we need to identify the type of uncertainty and, potentially, the source of the uncertainty (methods of calculation, etc.). This means that for any uncertainty measurement we need a link to the sample and the variable being measured, a fixed set of uncertainty measures (standard deviations, standard errors), and a way to freely define the source of the uncertainty (or perhaps again a fixed set of measures). So, it should be possible to report the following:
+
+| reference                      | value | units | uncertainty reported | source                                   |
+|--------------------------------|-------|-------|----------------------|------------------------------------------|
+| Pinus count for sample 1223445 | 12    | NISP  | 1SD                  | Maher nomograms (cf. Maher Jr 1972)      |
+| pH for sample 23244            | .02   | pH    | 95% CI               | Reported instrumental error from device  |
+| NaOH for sample 23244          | .02   | ug    | 95% CI               | Reported instrumental error from device  |
+
+## Table modifications
+
+The uncertainty must be linked with `ndb.data.dataid` because it modifies the `ndb.data.value` for that variable & sample. We can generally assume that the units for the uncertainty are equivalent to the units associated with the variable; however, uncertainty may also be expressed as a percent value. Given this, we will create a new table that links to the `ndb.data.dataid` primary key. This allows us to traverse the `ndb.variables` entry for the record (to retrieve the taxonomic information), and potentially link to the variable units if they are equivalent.
+
+Given this data model:
+
+* The table `ndb.data` remains as is.
+* The table `ndb.variables` remains as is.
+* We add a new table `ndb.datauncertainties` that uses fk(dataid) (the `fk(variableid)` is implied).
+  * The table has columns `uncertaintyvalue`, `uncertaintyunit`, `uncertaintybasisid` and `notes`, along with the standard `recdatecreated` and `recdatemodified`.
+
+They will inherit information from the `ndb.variables` row, so the assumption is that the uncertainty is reported in the same units (and for the same taxon) as the `ndb.data.value`.
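+
+As a rough sketch of how the proposed table might be queried (not part of the schema proposal itself; it assumes the column names from the `CREATE TABLE` statements below and the existing `ndb.data`, `ndb.variableunits` and `ndb.uncertaintybases` tables):
+
+```SQL
+-- Illustrative only: retrieve a measured value together with its reported
+-- uncertainty, the uncertainty units, and the basis of the estimate.
+SELECT dt.dataid,
+       dt.value,
+       du.uncertaintyvalue,
+       vu.variableunits   AS uncertaintyunits,
+       ub.uncertaintybasis
+FROM ndb.data AS dt
+JOIN ndb.datauncertainties AS du ON du.dataid = dt.dataid
+JOIN ndb.variableunits     AS vu ON vu.variableunitsid = du.uncertaintyunitid
+JOIN ndb.uncertaintybases  AS ub ON ub.uncertaintybasisid = du.uncertaintybasisid
+WHERE dt.sampleid = 1223445;
+```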
+
+![Overall structure of the tables](uncertainty.svg)
+
+### Example Table
+
+| column             | type    | nulls | default | children | parents              | comments               |
+|---------------------|---------|-------|---------|----------|----------------------|------------------------|
+| dataid             | integer | F     | null    |          | ndb.data             | fk(dataid)             |
+| uncertaintyvalue   | float   | F     |         |          |                      | The value is required. |
+| uncertaintyunit    | float   | F     |         |          |                      | The value is required. |
+| uncertaintybasisid | integer | F     |         |          | ndb.uncertaintybases |                        |
+| notes              | text    | T     | null    |          |                      |                        |
+
+#### Proposed `ndb.uncertaintybasis.uncertaintybasis` values
+
+Proposed values for uncertainty tables will come from standard reporting of uncertainty.
+
+* 1 Standard Deviation
+* 2 Standard Deviations
+* 3 Standard Deviations
+* Mean square error
+
+```SQL
+CREATE TABLE IF NOT EXISTS ndb.uncertaintybases (
+    uncertaintybasisid SERIAL PRIMARY KEY,
+    uncertaintybasis text,
+    CONSTRAINT uniquebasis UNIQUE (uncertaintybasis)
+);
+
+INSERT INTO ndb.uncertaintybases (uncertaintybasis)
+VALUES ('1 Standard Deviation'),
+       ('2 Standard Deviations'),
+       ('3 Standard Deviations'),
+       ('1 Standard Error');
+```
+
+### Proposed `ndb.datauncertainties` structure
+
+| dataid | uncertaintyvalue | uncertaintyunitid | uncertaintybasisid | notes | . . . |
+
+```SQL
+CREATE TABLE IF NOT EXISTS ndb.datauncertainties (
+    dataid INTEGER REFERENCES ndb.data(dataid),
+    uncertaintyvalue float,
+    uncertaintyunitid integer REFERENCES ndb.variableunits(variableunitsid),
+    uncertaintybasisid integer REFERENCES ndb.uncertaintybases(uncertaintybasisid),
+    notes text,
+    CONSTRAINT uniqueentryvalue UNIQUE (dataid, uncertaintyunitid, uncertaintybasisid)
+);
+```
diff --git a/uncertaintyadditions.qmd b/uncertaintyadditions.qmd new file mode 100644 index 0000000..bf31307 --- /dev/null +++ b/uncertaintyadditions.qmd @@ -0,0 +1,34 @@
+---
+title: "Untitled"
+format: html
+---
+
+# Adding Uncertainty to Neotoma
+
+The use of uncertainty is critical. We need it directly associated with individual measurements, and we need to identify the type of uncertainty.
+
+## Table modifications
+
+The table `ndb.data` needs two new columns: `uncertaintyvalue` and `uncertaintytype`.
+
+They will inherit information from the `ndb.variables` row, so the assumption is that the uncertainty is reported in the same units (and for the same taxon) as the `ndb.data.value`.
+
+![Overall structure of the tables](uncertainty.svg)
+
+### Proposed `ndb.data` structure:
+
+| dataid | sampleid | variableid | value | uncertaintyvalue | uncertaintybasisid | . . . |
+
+### Proposed `ndb.uncertaintybasis` structure:
+
+| uncertaintybasisid | uncertaintybasis | . . . |
+
+#### Proposed `ndb.uncertaintybasis.uncertaintybasis` values:
+
+Proposed values for uncertainty tables will come from standard reporting of uncertainty.
+
+* 1 Standard Deviation
+* 2 Standard Deviations
+* 3 Standard Deviations
+* Mean square error
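+
+For comparison, a minimal sketch of what this in-table alternative would imply (column names are assumed from the proposed structure above, reusing the `ndb.uncertaintybases` lookup table proposed in `uncertaintyadditions.md`):
+
+```SQL
+-- Illustrative only: store uncertainty directly on ndb.data rather than in a
+-- separate ndb.datauncertainties table.
+ALTER TABLE ndb.data
+    ADD COLUMN uncertaintyvalue float,
+    ADD COLUMN uncertaintybasisid integer
+        REFERENCES ndb.uncertaintybases(uncertaintybasisid);
+```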