Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/fixParseCNV #160

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
63 changes: 47 additions & 16 deletions ParseCNVQC/parse_cnv_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,51 @@ def make_mail(today, daysago, attachment, run_status):
send_email(settings.email_from, settings.email_to, subject, text, attachment)


def _count_project_rows_and_lanes(samplesheet_lines, projects):
    """
    Count the sample rows that belong to the given projects and collect their lanes.

    Rows are only inspected after the first line containing "Sample_ID" (the column header).

    Args:
        samplesheet_lines (iterable): lines of a SampleSheet.csv (strings)
        projects (list): projectIDs, matched as substrings of the upper-cased row

    Returns:
        matching_rows (int): number of rows that match at least one project
        lanes (set): distinct Lane values of those rows; empty if the header has no Lane column
    """
    # NOTE(review): a reviewer found the original line-by-line section detection hard to read and
    # suggested trying configparser to read the sample section; the reviewer was not sure it would
    # work on this format, so plain line parsing is kept here.
    matching_rows = 0
    lanes = set()
    lane_index = None
    in_sample_section = False
    for line in samplesheet_lines:
        if not in_sample_section:
            if "Sample_ID" in line:
                in_sample_section = True
                # Strip so a trailing newline on the last column does not hide the Lane column.
                header = [column.strip() for column in line.split(",")]
                if "Lane" in header:
                    lane_index = header.index("Lane")
            continue

        upper_line = line.upper()
        # any(): a row is counted once, even if more than one project ID occurs in it.
        if not any(project in upper_line for project in projects):
            continue
        matching_rows += 1
        if lane_index is not None:
            fields = line.split(",")
            if lane_index < len(fields):
                lanes.add(fields[lane_index].strip())
    return matching_rows, lanes


# NOTE(review): a reviewer asked for more specific argument names (e.g. run_folder instead of
# folder); the names are kept as they are to leave the signature unchanged.
def get_number_samples_per_run_from_samplesheet(folder, rawfolder, projects, warnings):
    """
    Determine number of samples in a run based on the SampleSheet.

    Only rows that match one of the given projects are counted. The row count is divided by the
    number of distinct lanes those rows occur in; a sheet without a Lane column is treated as a
    single lane. Appends a warning if SampleSheet.csv is not found in the raw data folder.

    Args:
        folder (string): full path to raw data folder
        rawfolder (dict):
            key: full path to raw data folder, values: list of all processed folder from run, total number of samples
        projects (list): projectIDs to include in total sample calculation. Excludes i.e. non-dx projects in calculations
        warnings (list): List with warning messages

    Returns:
        rawfolder (dict): the same dict, with the sample count of `folder` increased in place
        warnings (list): the same list, including a potential new warning
    """
    # NOTE(review): a reviewer suggested f-strings instead of str.format for readability; applied here.
    samplesheet_path = f"{folder}/SampleSheet.csv"
    if not os.path.exists(samplesheet_path):
        warnings.append(f"no samplesheet for run {folder}, assuming unknown number of samples in run")
        return rawfolder, warnings

    with open(samplesheet_path, 'r') as samplesheet:
        matching_rows, lanes = _count_project_rows_and_lanes(samplesheet, projects)

    if matching_rows:
        # Rows are repeated per lane, so divide by the lane count (true division: result is a float).
        # NOTE(review): this assumes every counted sample is listed in every lane - confirm.
        # max(..., 1) prevents division by zero when the sheet has no Lane column.
        rawfolder[folder][1] += matching_rows / max(len(lanes), 1)
    return rawfolder, warnings

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mocht er tijd voor zijn, hou dit stukje dan eens kritisch tegen het licht.

Op dit moment zit er namelijk logica in de functie om het aantal samples in een samplesheet te bepalen, maar de functie doet daarnaast ook andere dingen, bijv. een dict bijhouden met het aantal samples per project. Eigenlijk mag dat best een eigen functie zijn.

en een mooi voorbeeldje van code die heerlijk zou werken met OO!
Dan kan je gewoon een run-object maken dat 1 of meerdere projects heeft, en determine_samples uitvoeren op dat run-object, dat het vervolgens op de hele (gefilterde) projectlijst uitvoert.



if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -121,23 +166,9 @@ def make_mail(today, daysago, attachment, run_status):
rawfolder[run_path] = [[], 0]
rawfolder[run_path][0].append(folder)

# Get number of samples for each run
for folder in rawfolder:
number_samples_run = 0
if os.path.exists("{}/SampleSheet.csv".format(folder)):
with open("{}/SampleSheet.csv".format(folder), 'r') as samplesheet:
sample_section = False
for line in samplesheet:
if sample_section:
for project in settings.projects:
if project in line.upper():
number_samples_run += 1
if "Sample_ID" not in line:
continue
else:
sample_section = True
else:
warnings.append("no samplesheet for run {}, assuming unknown number of samples in run".format(folder))
rawfolder[folder][1] += number_samples_run
rawfolder, warnings = get_number_samples_per_run_from_samplesheet(folder, rawfolder, settings.projects, warnings)

folder_summary = {}
sample_qc = []
Expand Down
Empty file added ParseCNVQC/tests/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions ParseCNVQC/tests/run1/SampleSheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[Header]

[Reads]

[Sequencing_Settings]

[BCLConvert_Settings]

[BCLConvert_Data]
Lane,Sample_ID,Index,Index2,OverrideCycles,Sample_Project,AdapterRead1,AdapterRead2
1,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
1,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
1,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
2,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
2,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
2,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
3,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
3,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
3,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

37 changes: 37 additions & 0 deletions ParseCNVQC/tests/test_parse_cnv_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python
import pytest

from parse_cnv_qc import get_number_samples_per_run_from_samplesheet

def test_get_number_samples_per_run_from_samplesheet_withsamples():
    """Samples of the selected projects in the run1 fixture are counted once per sample, not per lane."""
    import os

    # Anchor the fixture to this file so the test does not depend on the working directory.
    folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run1")
    rawfolder_run = {folder: [["run1"], 0]}
    projects = ["CREV4", "NICU"]
    rawfolder_result, _ = get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])
    # SAMPLE1 (CREv4_1) and SAMPLE3 (NICU_1) are each listed in lanes 1-3 of the fixture.
    assert rawfolder_result == {folder: [["run1"], 2]}

def test_get_number_samples_per_run_from_samplesheet_nosamples():
    """Projects that do not occur in the run1 fixture leave the sample count at zero."""
    import os

    # Anchor the fixture to this file; with a cwd-relative path the sheet may never be read and the
    # test would pass for the wrong reason.
    folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run1")
    rawfolder_run = {folder: [["run1"], 0]}
    projects = ["WGS", "RNASEQ"]
    rawfolder_result, _ = get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])
    assert rawfolder_result == {folder: [["run1"], 0]}

def test_get_number_samples_per_run_from_samplesheet_nosamplesheet():
    """A run folder without a sample sheet keeps its sample count at zero."""
    # ./run2 is assumed to have no SampleSheet.csv (no such fixture is added in this change).
    run_dir = "./run2"
    outcome = get_number_samples_per_run_from_samplesheet(
        run_dir,
        {run_dir: [["run2"], 0]},
        ["CREV4", "NICU"],
        [],
    )
    updated_rawfolder = outcome[0]
    assert updated_rawfolder == {"./run2": [["run2"], 0]}

def test_get_number_samples_per_run_from_samplesheet_nowarning():
    """No warning is reported when the run folder contains a SampleSheet.csv."""
    import os

    # Anchor the fixture to this file so the test does not depend on the working directory.
    folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run1")
    rawfolder_run = {folder: [["run1"], 0]}
    projects = ["CREV4", "NICU"]
    _, reported_warnings = get_number_samples_per_run_from_samplesheet(folder, rawfolder_run, projects, [])
    assert reported_warnings == []

def test_get_number_samples_per_run_from_samplesheet_with_warning_nosamplsheet():
    """A run folder without a sample sheet yields exactly one 'no samplesheet' warning."""
    # ./run2 is assumed to have no SampleSheet.csv (no such fixture is added in this change).
    folder = "./run2"
    expected_warnings = [f"no samplesheet for run {folder}, assuming unknown number of samples in run"]
    outcome = get_number_samples_per_run_from_samplesheet(
        folder,
        {folder: [["run2"], 0]},
        ["CREV4", "NICU"],
        [],
    )
    assert outcome[1] == expected_warnings