Skip to content

Commit

Permalink
Merge pull request #2 from NeotomaDB/develop_sedv
Browse files Browse the repository at this point in the history
Develop sedv
  • Loading branch information
sedv8808 authored Nov 14, 2023
2 parents ae33149 + a3b90fe commit 196c321
Show file tree
Hide file tree
Showing 55 changed files with 2,114 additions and 868 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Each entry in the `metadata` tab can have the following entries:
* `column`: The column of the spreadsheet that is being described.
* `neotoma`: A database table and column combination from the database schema.
* `vocab`: If there is a fixed vocabulary for the column, include the possible terms here.
* `repeat`: [`true`, `false`] Is each entry unique and tied to the row, or is this a set of entries associated with the site?
* `repeat`: [`true`, `false`] Is each entry unique and tied to the row (`false`, this isn't a set of repeated values), or is this a set of entries associated with the site (`true`, there is only a single value that repeats throughout)?
* `type`: [`integer`, `numeric`, `date`] The variable type for the field.
* `ordered`: [`true`, `false`] Does the order of the column matter?

Expand Down
7 changes: 0 additions & 7 deletions functions.py

This file was deleted.

35 changes: 24 additions & 11 deletions neotomaUploader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,36 @@
import argparse
import os

from .cleanCol import cleanCol
from .clean_column import clean_column
from .yaml_values import yaml_values
from .insert_site import insert_site
from .insertAnalysisUnit import insertAnalysisUnit
from .validAgent import validAgent
from .insert_analysisunit import insert_analysisunit
from .valid_agent import valid_agent
from .valid_date import valid_date
from .read_csv import read_csv
from .validUnits import validUnits
from .valid_site import valid_site
from .valid_collectionunit import valid_collectionunit
from .validGeoPol import validGeoPol
from .validHorizon import validHorizon
from .hashFile import hashFile
from .checkFile import checkFile
from .insertGeoPol import insertGeoPol
from .insertCollUnit import insertCollUnit
from .csvValidator import csvValidator
from .csvValidator import ymlToDict
from .valid_horizon import valid_horizon
from .hash_file import hash_file
from .check_file import check_file
from .insert_geopol import insert_geopol
from .insert_collunit import insert_collunit
from .csv_validator import csv_validator
from .csv_validator import yml_to_dict
from .vocabDict import vocabDict
from .parseArguments import parseArguments
from .parse_arguments import parse_arguments
from .csv_to_yaml import csv_to_yaml
from .valid_taxa import valid_taxa
from .insert_chronology import insert_chronology
from .insert_chron_control import insert_chron_control
from .insert_dataset import insert_dataset
from .insert_dataset_pi import insert_dataset_pi
from .insert_data_processor import insert_data_processor
from .insert_dataset_repository import insert_dataset_repository
from .insert_dataset_database import insert_dataset_database
from .insert_sample import insert_sample
from .insert_sample_analyst import insert_sample_analyst
from .insert_data import insert_data
from .insert_sample_age import insert_sample_age
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
import os

def checkFile(filename):
def check_file(filename):
"""_Validate the existence and result of a logfile._
Args:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def cleanCol(column, template, clean = True):
def clean_column(column, template, clean = True):
"""_cleanCol_
Args:
Expand Down
49 changes: 0 additions & 49 deletions neotomaUploader/csvValidator.py

This file was deleted.

26 changes: 26 additions & 0 deletions neotomaUploader/csv_to_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pandas as pd
import yaml

def csv_to_yaml(xl_path, yml_output='output_yml.yml'):
    """
    _csv_to_yaml_

    Convert an Excel template into a YAML file: one YAML list entry per
    spreadsheet row, keyed by the spreadsheet's column names.

    Args:
        xl_path (_str_): _Excel file to be used as template_
        yml_output (_str_): _Location and file name where the yaml template will be stored_
    Returns:
        _None_: _The output file will be stored, no need to return anything here_
    """
    df = pd.read_excel(xl_path)

    # One dict per row, keyed by column name. NOTE: the previous
    # `zip(df.columns, row)` rebuild iterated each record dict's *keys*,
    # pairing every column name with a column name and silently dropping
    # the actual cell values; `to_dict(orient='records')` already yields
    # the desired {column: value} mapping per row.
    data_dict = df.to_dict(orient='records')

    with open(yml_output, 'w') as yaml_file:
        yaml.dump(data_dict, yaml_file, default_flow_style=False)

    print(f'YAML file stored in {yml_output} successfully.')

    return None
73 changes: 73 additions & 0 deletions neotomaUploader/csv_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import yaml
from yaml.loader import SafeLoader
from collections import defaultdict
import itertools
import pandas as pd
import os
import sys
import argparse

"""
To run from command line use:
python csv_validator.py /path/to/directory
Example:/
python 210Pb_Template/neotomaUploader/csvValidator.py --path=210Pb_Template/data/ --template=210Pb_Template/template.yml
"""

def yml_to_dict(yml_file):
    """_Read in valid yaml file._
    Args:
        yml_file (_string_): _A valid filename for a yaml file._
    Returns:
        _dict_: _A dict representation of a yaml file._
    Raises:
        FileNotFoundError: _If ``yml_file`` does not exist on disk._
    """
    # Guard clause: fail fast when the path does not point at a file.
    if not os.path.isfile(yml_file):
        raise FileNotFoundError(f"The file '{yml_file}' could not be found within the current path.")

    with open(yml_file, encoding = "UTF-8") as handle:
        return yaml.load(handle, Loader = SafeLoader)


def csv_validator(filename, yml_data):
    """_Validate csv file for use in the validator._
    Args:
        filename (_string_): _A valid csv filename._
        yml_data (_dict_): _A dict passed from yml_to_dict()_
    Returns:
        _list_: _Log messages describing whether the csv columns match the template._
    Raises:
        FileNotFoundError: _If ``filename`` does not exist on disk._
    """
    log_file = []
    # Column names declared in the .yml template.
    col_values = [d.get('column') for d in yml_data]

    if not os.path.isfile(filename):
        raise FileNotFoundError(f"The file '{filename}' could not be found within the current path.")

    try:
        # Load csv file as data frame and extract columns
        df = pd.read_csv(filename)
    except pd.errors.ParserError as e:
        # Bind the exception (`as e`) and return early: without a data
        # frame there is nothing to compare, and falling through would
        # reference an undefined `df`.
        log_file.append(f"✗ Error opening file '{filename}': {e}" + '\n')
        return log_file

    df_columns = list(df.columns)

    # Template (YAML) columns that are missing from the data frame.
    diff_col = sorted(set(col_values) - set(df_columns))
    # Data-frame columns that are missing from the template.
    diff_val = sorted(set(df_columns) - set(col_values))

    # Report in the log. Match only when BOTH directions are empty
    # (the old `diff_col == diff_val` test relied on that accidentally).
    if not diff_col and not diff_val:
        log_file.append("✔ The column names and flattened YAML keys match")
    else:
        log_file.append("✗ The column names and flattened YAML keys do not match")
        # Note: messages previously had diff_col/diff_val swapped.
        log_file.append(f"Columns from the YAML template are not in the data frame: '{diff_col}'")
        log_file.append(f"Columns from the data frame not in the YAML template: '{diff_val}'")

    return log_file
12 changes: 0 additions & 12 deletions neotomaUploader/getAgent.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import hashlib
import os

def hashFile(filename):
def hash_file(filename):
response = {'pass': False, 'hash': None, 'message': []}
logfile = filename + '.log'
response['hash'] = hashlib.md5(open(filename,'rb').read()).hexdigest()
Expand Down
36 changes: 0 additions & 36 deletions neotomaUploader/insertAnalysisUnit.py

This file was deleted.

35 changes: 0 additions & 35 deletions neotomaUploader/insertChronControl.py

This file was deleted.

43 changes: 0 additions & 43 deletions neotomaUploader/insertChronology.py

This file was deleted.

Loading

0 comments on commit 196c321

Please sign in to comment.