Skip to content

Commit

Permalink
Merge pull request #2 from NeotomaDB/develop_sedv
Browse files Browse the repository at this point in the history
Develop sedv
  • Loading branch information
sedv8808 authored Nov 14, 2023
2 parents ae33149 + a3b90fe commit 196c321
Show file tree
Hide file tree
Showing 55 changed files with 2,114 additions and 868 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Each entry in the `metadata` tab can have the following entries:
* `column`: The column of the spreadsheet that is being described.
* `neotoma`: A database table and column combination from the database schema.
* `vocab`: If there is a fixed vocabulary for the column, include the possible terms here.
* `repeat`: [`true`, `false`] Is each entry unique and tied to the row, or is this a set of entries associated with the site?
* `repeat`: [`true`, `false`] Is each entry unique and tied to the row (`false`, this isn't a set of repeated values), or is this a set of entries associated with the site (`true`, there is only a single value that repeats throughout)?
* `type`: [`integer`, `numeric`, `date`] The variable type for the field.
* `ordered`: [`true`, `false`] Does the order of the column matter?

Expand Down
7 changes: 0 additions & 7 deletions functions.py

This file was deleted.

35 changes: 24 additions & 11 deletions neotomaUploader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,36 @@
import argparse
import os

from .cleanCol import cleanCol
from .clean_column import clean_column
from .yaml_values import yaml_values
from .insert_site import insert_site
from .insertAnalysisUnit import insertAnalysisUnit
from .validAgent import validAgent
from .insert_analysisunit import insert_analysisunit
from .valid_agent import valid_agent
from .valid_date import valid_date
from .read_csv import read_csv
from .validUnits import validUnits
from .valid_site import valid_site
from .valid_collectionunit import valid_collectionunit
from .validGeoPol import validGeoPol
from .validHorizon import validHorizon
from .hashFile import hashFile
from .checkFile import checkFile
from .insertGeoPol import insertGeoPol
from .insertCollUnit import insertCollUnit
from .csvValidator import csvValidator
from .csvValidator import ymlToDict
from .valid_horizon import valid_horizon
from .hash_file import hash_file
from .check_file import check_file
from .insert_geopol import insert_geopol
from .insert_collunit import insert_collunit
from .csv_validator import csv_validator
from .csv_validator import yml_to_dict
from .vocabDict import vocabDict
from .parseArguments import parseArguments
from .parse_arguments import parse_arguments
from .csv_to_yaml import csv_to_yaml
from .valid_taxa import valid_taxa
from .insert_chronology import insert_chronology
from .insert_chron_control import insert_chron_control
from .insert_dataset import insert_dataset
from .insert_dataset_pi import insert_dataset_pi
from .insert_data_processor import insert_data_processor
from .insert_dataset_repository import insert_dataset_repository
from .insert_dataset_database import insert_dataset_database
from .insert_sample import insert_sample
from .insert_sample_analyst import insert_sample_analyst
from .insert_data import insert_data
from .insert_sample_age import insert_sample_age
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
import os

def checkFile(filename):
def check_file(filename):
"""_Validate the existence and result of a logfile._
Args:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def cleanCol(column, template, clean = True):
def clean_column(column, template, clean = True):
"""_cleanCol_
Args:
Expand Down
49 changes: 0 additions & 49 deletions neotomaUploader/csvValidator.py

This file was deleted.

26 changes: 26 additions & 0 deletions neotomaUploader/csv_to_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pandas as pd
import yaml

def csv_to_yaml(xl_path, yml_output='output_yml.yml'):
    """
    _csv_to_yaml_

    Convert an Excel template into a YAML file: one YAML list entry per
    spreadsheet row, keyed by the spreadsheet's column names.

    Args:
        xl_path (_str_): _Excel file to be used as template_
        yml_output (_str_): _Location and file name where the yaml template will be stored_
    Returns:
        _None_: _The output file will be stored, no need to return anything here_
    """
    df = pd.read_excel(xl_path)

    # One dict per row, keyed by column name. NOTE: the previous
    # `zip(df.columns, row)` rebuild iterated each record dict's *keys*,
    # pairing every column name with a column name and silently dropping
    # the actual cell values; `to_dict(orient='records')` already yields
    # the desired {column: value} mapping per row.
    data_dict = df.to_dict(orient='records')

    with open(yml_output, 'w') as yaml_file:
        yaml.dump(data_dict, yaml_file, default_flow_style=False)

    print(f'YAML file stored in {yml_output} successfully.')

    return None
73 changes: 73 additions & 0 deletions neotomaUploader/csv_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import yaml
from yaml.loader import SafeLoader
from collections import defaultdict
import itertools
import pandas as pd
import os
import sys
import argparse

"""
To run from command line use:
python csv_validator.py /path/to/directory
Example:/
python 210Pb_Template/neotomaUploader/csvValidator.py --path=210Pb_Template/data/ --template=210Pb_Template/template.yml
"""

def yml_to_dict(yml_file):
    """_Read in valid yaml file._
    Args:
        yml_file (_string_): _A valid filename for a yaml file._
    Returns:
        _dict_: _A dict representation of a yaml file._
    Raises:
        FileNotFoundError: _If ``yml_file`` does not exist on disk._
    """
    # Guard clause: fail fast when the path does not point at a file.
    if not os.path.isfile(yml_file):
        raise FileNotFoundError(f"The file '{yml_file}' could not be found within the current path.")

    with open(yml_file, encoding = "UTF-8") as handle:
        return yaml.load(handle, Loader = SafeLoader)


def csv_validator(filename, yml_data):
    """_Validate csv file for use in the validator._
    Args:
        filename (_string_): _A valid csv filename._
        yml_data (_dict_): _A dict passed from yml_to_dict()_
    Returns:
        _list_: _Log messages describing whether the csv columns match the template._
    Raises:
        FileNotFoundError: _If ``filename`` does not exist on disk._
    """
    log_file = []
    # Column names declared in the .yml template.
    col_values = [d.get('column') for d in yml_data]

    if not os.path.isfile(filename):
        raise FileNotFoundError(f"The file '{filename}' could not be found within the current path.")

    try:
        # Load csv file as data frame and extract columns
        df = pd.read_csv(filename)
    except pd.errors.ParserError as e:
        # Bind the exception (`as e`) and return early: without a data
        # frame there is nothing to compare, and falling through would
        # reference an undefined `df`.
        log_file.append(f"✗ Error opening file '{filename}': {e}" + '\n')
        return log_file

    df_columns = list(df.columns)

    # Template (YAML) columns that are missing from the data frame.
    diff_col = sorted(set(col_values) - set(df_columns))
    # Data-frame columns that are missing from the template.
    diff_val = sorted(set(df_columns) - set(col_values))

    # Report in the log. Match only when BOTH directions are empty
    # (the old `diff_col == diff_val` test relied on that accidentally).
    if not diff_col and not diff_val:
        log_file.append("✔ The column names and flattened YAML keys match")
    else:
        log_file.append("✗ The column names and flattened YAML keys do not match")
        # Note: messages previously had diff_col/diff_val swapped.
        log_file.append(f"Columns from the YAML template are not in the data frame: '{diff_col}'")
        log_file.append(f"Columns from the data frame not in the YAML template: '{diff_val}'")

    return log_file
12 changes: 0 additions & 12 deletions neotomaUploader/getAgent.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import hashlib
import os

def hashFile(filename):
def hash_file(filename):
response = {'pass': False, 'hash': None, 'message': []}
logfile = filename + '.log'
response['hash'] = hashlib.md5(open(filename,'rb').read()).hexdigest()
Expand Down
36 changes: 0 additions & 36 deletions neotomaUploader/insertAnalysisUnit.py

This file was deleted.

35 changes: 0 additions & 35 deletions neotomaUploader/insertChronControl.py

This file was deleted.

43 changes: 0 additions & 43 deletions neotomaUploader/insertChronology.py

This file was deleted.

Loading

0 comments on commit 196c321

Please sign in to comment.