diff --git a/ParseCNVQC/parse_cnv_qc.py b/ParseCNVQC/parse_cnv_qc.py
index b7a1e6a..4095bd1 100755
--- a/ParseCNVQC/parse_cnv_qc.py
+++ b/ParseCNVQC/parse_cnv_qc.py
@@ -62,6 +62,58 @@ def make_mail(today, daysago, attachment, run_status):
     send_email(settings.email_from, settings.email_to, subject, text, attachment)
 
 
+def get_number_samples_per_run_from_samplesheet(folder, rawfolder, projects, warnings):
+    """
+    Determine number of samples in a run based on the SampleSheet.
+    Only includes samples that are in predefined projects as stated in settings file.
+    Adds a warning if SampleSheet.csv is not detected in the raw data folder.
+
+    A sample loaded on several lanes has one SampleSheet row per lane, so the number of
+    matching rows is divided by the number of distinct lanes (assumes every counted sample
+    is present on every lane). Without a 'Lane' column every matching row counts as one sample.
+
+    Args:
+        folder (string): full path to raw data folder
+        rawfolder (dict):
+            key: full path to raw data folder, values: list of all processed folder from run, total number of samples (int)
+        projects (list): projectIDs to include in total sample calculation. Excludes i.e. non-dx projects in calculations
+        warnings (list): List with warning messages
+
+    Returns:
+        rawfolder (dict): rawfolder including sample count
+        warnings (list): message list including potential new warnings
+    """
+    samplesheet_path = os.path.join(folder, "SampleSheet.csv")
+    if not os.path.exists(samplesheet_path):
+        warnings.append("no samplesheet for run {}, assuming unknown number of samples in run".format(folder))
+        return rawfolder, warnings
+
+    number_samples_run = 0
+    lanes = set()
+    lane_index = None
+    sample_section = False
+    with open(samplesheet_path, 'r') as samplesheet:
+        for line in samplesheet:
+            # Strip the line ending so the last column never carries a trailing newline.
+            columns = line.strip().split(",")
+            if not sample_section:
+                if "Sample_ID" in line:
+                    sample_section = True
+                    # 'Lane' is an optional column; a missing column must not crash the parser.
+                    if "Lane" in columns:
+                        lane_index = columns.index("Lane")
+                continue
+            # Count each row once, even if it happens to match more than one project ID.
+            if any(project in line.upper() for project in projects):
+                number_samples_run += 1
+                if lane_index is not None and lane_index < len(columns):
+                    lanes.add(columns[lane_index])
+
+    # Floor division keeps the sample count an int; max() prevents division by zero.
+    rawfolder[folder][1] += number_samples_run // max(len(lanes), 1)
+    return rawfolder, warnings
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -121,23 +173,9 @@ def make_mail(today, daysago, attachment, run_status):
             rawfolder[run_path] = [[], 0]
         rawfolder[run_path][0].append(folder)
 
+    # Get number of samples for each run
     for folder in rawfolder:
-        number_samples_run = 0
-        if os.path.exists("{}/SampleSheet.csv".format(folder)):
-            with open("{}/SampleSheet.csv".format(folder), 'r') as samplesheet:
-                sample_section = False
-                for line in samplesheet:
-                    if sample_section:
-                        for project in settings.projects:
-                            if project in line.upper():
-                                number_samples_run += 1
-                    if "Sample_ID" not in line:
-                        continue
-                    else:
-                        sample_section = True
-        else:
-            warnings.append("no samplesheet for run {}, assuming unknown number of samples in run".format(folder))
-        rawfolder[folder][1] += number_samples_run
+        rawfolder, warnings = get_number_samples_per_run_from_samplesheet(folder, rawfolder, settings.projects, warnings)
 
     folder_summary = {}
     sample_qc = []
diff --git a/ParseCNVQC/tests/__init__.py b/ParseCNVQC/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ParseCNVQC/tests/run1/SampleSheet.csv b/ParseCNVQC/tests/run1/SampleSheet.csv
new file mode 100644
index 0000000..2adea1f
--- /dev/null
+++ b/ParseCNVQC/tests/run1/SampleSheet.csv
@@ -0,0 +1,20 @@
+[Header]
+
+[Reads]
+
+[Sequencing_Settings]
+
+[BCLConvert_Settings]
+
+[BCLConvert_Data]
+Lane,Sample_ID,Index,Index2,OverrideCycles,Sample_Project,AdapterRead1,AdapterRead2
+1,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+1,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+1,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+2,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+2,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+2,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+3,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+3,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+3,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+
diff --git a/ParseCNVQC/tests/test_parse_cnv_qc.py b/ParseCNVQC/tests/test_parse_cnv_qc.py
new file mode 100755
index 0000000..864b12d
--- /dev/null
+++ b/ParseCNVQC/tests/test_parse_cnv_qc.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+import os
+
+import pytest
+
+from parse_cnv_qc import get_number_samples_per_run_from_samplesheet
+
+# Resolve fixture folders relative to this file so the tests pass from any working directory.
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+RUN_WITH_SAMPLESHEET = os.path.join(TEST_DIR, "run1")
+RUN_WITHOUT_SAMPLESHEET = os.path.join(TEST_DIR, "run2")
+
+
+def test_get_number_samples_per_run_from_samplesheet_withsamples():
+    folder = RUN_WITH_SAMPLESHEET
+    rawfolder_run = {folder: [["run1"], 0]}
+    projects = ["CREV4", "NICU"]
+    assert get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])[0] == {folder: [["run1"], 2]}
+
+
+def test_get_number_samples_per_run_from_samplesheet_nosamples():
+    folder = RUN_WITH_SAMPLESHEET
+    rawfolder_run = {folder: [["run1"], 0]}
+    projects = ["WGS", "RNASEQ"]
+    assert get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])[0] == {folder: [["run1"], 0]}
+
+
+def test_get_number_samples_per_run_from_samplesheet_nosamplesheet():
+    folder = RUN_WITHOUT_SAMPLESHEET
+    rawfolder_run = {folder: [["run2"], 0]}
+    projects = ["CREV4", "NICU"]
+    assert get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])[0] == {folder: [["run2"], 0]}
+
+
+def test_get_number_samples_per_run_from_samplesheet_nowarning():
+    folder = RUN_WITH_SAMPLESHEET
+    rawfolder_run = {folder: [["run1"], 0]}
+    projects = ["CREV4", "NICU"]
+    assert get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])[1] == []
+
+
+def test_get_number_samples_per_run_from_samplesheet_with_warning_nosamplesheet():
+    folder = RUN_WITHOUT_SAMPLESHEET
+    rawfolder_run = {folder: [["run2"], 0]}
+    projects = ["CREV4", "NICU"]
+    assert get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])[1] == [
+        f"no samplesheet for run {folder}, assuming unknown number of samples in run"
+    ]