Patch summary: in both scripts, call the new clean_output() helper on
matched_breeds before it is written with to_csv(), and make read_vbo_data()
pass explicit dtype / na_values arguments to pd.read_table() and turn off
infer_objects, convert_string and convert_boolean in convert_dtypes().

NOTE(review): this patch was restored from a copy whose newlines and
indentation had been collapsed. Line structure was rebuilt from the hunk
line counts; the leading whitespace of context lines is inferred from
Python nesting and should be confirmed against the target files (or apply
with "git apply --ignore-whitespace").

diff --git a/src/scripts/find_dadis_local_ids.py b/src/scripts/find_dadis_local_ids.py
index 8ee1168..832e3de 100644
--- a/src/scripts/find_dadis_local_ids.py
+++ b/src/scripts/find_dadis_local_ids.py
@@ -45,6 +45,7 @@ def full_local_match_workflow(
             "dadis_update_date",
         ],
     )
+    matched_breeds = clean_output(matched_breeds)
     matched_breeds.to_csv(temp_out, sep="\t", index=False, header=False)
     temp_out.close()
     logger.info("Output written to temp file.")
@@ -61,8 +62,17 @@ def full_local_match_workflow(
 
 
 def read_vbo_data(filename: str) -> pd.DataFrame:
     vbo_breeds = pd.read_table(
-        filename, sep="\t", skiprows=[1], low_memory=False
-    ).convert_dtypes()
+        filename,
+        sep="\t",
+        skiprows=[1],
+        dtype={"obsolete": str, "description_of_origin": str},
+        na_values=[],
+        low_memory=False
+    ).convert_dtypes(
+        infer_objects=False,
+        convert_string=False,
+        convert_boolean=False
+    )
     return vbo_breeds
 
 
@@ -155,6 +165,18 @@ def write_tsv_header(
     csv_out.writerow(header)
 
 
+def clean_output(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean the dataframe before writing
+
+    * Convert any None values to empty strings
+    """
+    string_columns = df.select_dtypes(include="object").columns
+    for column in string_columns:
+        df[column] = df[column].fillna("")
+    return df
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Find DADIS entries matching VBO breeds"
diff --git a/src/scripts/find_dadis_transboundary_ids.py b/src/scripts/find_dadis_transboundary_ids.py
index be99db3..4af3293 100644
--- a/src/scripts/find_dadis_transboundary_ids.py
+++ b/src/scripts/find_dadis_transboundary_ids.py
@@ -38,6 +38,7 @@ def full_matching_workflow(
         output_file=temp_out,
         extra_cols=["dadis_transboundary_id"],
     )
+    matched_breeds = clean_output(matched_breeds)
     # Write the actual data below the headers
     matched_breeds.to_csv(temp_out, sep="\t", index=False, header=False)
     temp_out.close()
@@ -48,7 +49,16 @@ def full_matching_workflow(
 
 
 def read_vbo_data(filename: str) -> pd.DataFrame:
-    df = pd.read_table(filename, skiprows=[1]).convert_dtypes()
+    df = pd.read_table(
+        filename,
+        skiprows=[1],
+        na_values=[],
+        dtype={"obsolete": str},
+    ).convert_dtypes(
+        infer_objects=False,
+        convert_string=False,
+        convert_boolean=False
+    )
     return df
 
 
@@ -215,6 +225,18 @@ def write_tsv_header(
     csv_out.writerow(header)
 
 
+def clean_output(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean the dataframe before writing
+
+    * Convert any None values to empty strings
+    """
+    string_columns = df.select_dtypes(include="object").columns
+    for column in string_columns:
+        df[column] = df[column].fillna("")
+    return df
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Find DADIS entries matching VBO breeds"