-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/fixParseCNV #160
base: develop
Are you sure you want to change the base?
Feature/fixParseCNV #160
Changes from all commits
5004878
4e23864
2f9b3b3
e9de2c1
cd73f38
33f9eb5
2a80c39
97a7443
37529c8
5a189d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,51 @@ def make_mail(today, daysago, attachment, run_status): | |
send_email(settings.email_from, settings.email_to, subject, text, attachment) | ||
|
||
|
||
def _count_project_rows(samplesheet, projects):
    """
    Count samplesheet data rows that belong to one of the given projects.

    Everything up to and including the first line containing "Sample_ID" is treated as
    preamble/column header; every later line is treated as a potential sample row.

    Args:
        samplesheet (iterable): lines of a SampleSheet.csv (e.g. an open file handle)
        projects (list): upper-case projectIDs; a row matches if any ID occurs in the upper-cased line

    Returns:
        matching_rows (int): number of rows matching at least one project
        lanes (set): distinct values of the Lane column among matching rows (empty if there is no Lane column)
    """
    matching_rows = 0
    lanes = set()
    lane_index = None
    in_sample_section = False
    for line in samplesheet:
        if not in_sample_section:
            if "Sample_ID" in line:
                in_sample_section = True
                # Strip each column name so a trailing 'Lane' column is not hidden by the newline.
                header = [column.strip() for column in line.split(",")]
                # Not every samplesheet has a Lane column; None means "no lane information".
                lane_index = header.index("Lane") if "Lane" in header else None
            continue

        upper_line = line.upper()
        # Count a row once, even when several project IDs occur in it.
        if not any(project in upper_line for project in projects):
            continue
        matching_rows += 1

        fields = line.rstrip("\r\n").split(",")
        if lane_index is not None and lane_index < len(fields):
            lanes.add(fields[lane_index])
    return matching_rows, lanes


def get_number_samples_per_run_from_samplesheet(folder, rawfolder, projects, warnings):
    """
    Determine number of samples in a run based on the SampleSheet.
    Only includes samples that are in predefined projects as stated in settings file.
    Includes a warning if SampleSheet.csv is not detected in the raw data folder.

    A sample that is listed once per lane is counted once: the number of matching rows is
    divided by the number of distinct lanes. Samplesheets without a Lane column are treated
    as a single lane, so every matching row counts as one sample.

    Args:
        folder (string): full path to raw data folder
        rawfolder (dict):
            key: full path to raw data folder, values: list of all processed folder from run, total number of samples
        projects (list): upper-case projectIDs to include in total sample calculation. Excludes i.e. non-dx projects
        warnings (list): List with warning messages

    Returns:
        rawfolder (dict): rawfolder including sample count (updated in place)
        warnings (list): message list including potential new warnings (updated in place)
    """
    # NOTE(review): reviewers asked for more readable header/sample-section handling (a config-style
    # parser was suggested) and for consistent string formatting; the section handling therefore
    # lives in _count_project_rows.
    samplesheet_path = os.path.join(folder, "SampleSheet.csv")
    if not os.path.exists(samplesheet_path):
        warnings.append("no samplesheet for run {}, assuming unknown number of samples in run".format(folder))
        return rawfolder, warnings

    with open(samplesheet_path, 'r') as samplesheet:
        matching_rows, lanes = _count_project_rows(samplesheet, projects)

    # Leave the count untouched when nothing matched; fall back to one lane to prevent division by zero.
    if matching_rows:
        rawfolder[folder][1] += matching_rows / max(len(lanes), 1)
    return rawfolder, warnings
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Mocht er tijd voor zijn, hou dit stukje dan eens kritisch tegen het licht. Op dit moment zit er namelijk logica in de functie om het aantal samples in een samplesheet te bepalen, maar de functie doet ook dingen daarbuiten, bijv. een dict bijhouden met het aantal samples per project. Eigenlijk mag dat best een eigen functie zijn. En het is een mooi voorbeeld van code die heerlijk zou werken met OO! |
||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument( | ||
|
@@ -121,23 +166,9 @@ def make_mail(today, daysago, attachment, run_status): | |
rawfolder[run_path] = [[], 0] | ||
rawfolder[run_path][0].append(folder) | ||
|
||
# Get number of samples for each run | ||
for folder in rawfolder: | ||
number_samples_run = 0 | ||
if os.path.exists("{}/SampleSheet.csv".format(folder)): | ||
with open("{}/SampleSheet.csv".format(folder), 'r') as samplesheet: | ||
sample_section = False | ||
for line in samplesheet: | ||
if sample_section: | ||
for project in settings.projects: | ||
if project in line.upper(): | ||
number_samples_run += 1 | ||
if "Sample_ID" not in line: | ||
continue | ||
else: | ||
sample_section = True | ||
else: | ||
warnings.append("no samplesheet for run {}, assuming unknown number of samples in run".format(folder)) | ||
rawfolder[folder][1] += number_samples_run | ||
rawfolder, warnings = get_number_samples_per_run_from_samplesheet(folder, rawfolder, settings.projects, warnings) | ||
|
||
folder_summary = {} | ||
sample_qc = [] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
[Header] | ||
|
||
[Reads] | ||
|
||
[Sequencing_Settings] | ||
|
||
[BCLConvert_Settings] | ||
|
||
[BCLConvert_Data] | ||
Lane,Sample_ID,Index,Index2,OverrideCycles,Sample_Project,AdapterRead1,AdapterRead2 | ||
1,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
1,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
1,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
2,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
2,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
2,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
3,SAMPLE1,AGCGAGTT,TACGGCGA,U5Y145N1;I8N11;N2I8;U5Y145N1,CREv4_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
3,SAMPLE2,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,SSv7_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
3,SAMPLE3,CAGTTGCG,AACGCATT,U5Y145N1;I8N11;N2I8;U5Y145N1,NICU_1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env python | ||
import pytest | ||
|
||
from parse_cnv_qc import get_number_samples_per_run_from_samplesheet | ||
|
||
def test_get_number_samples_per_run_from_samplesheet_withsamples():
    """Expect a sample count of 2 for ./run1 when filtering on projects CREV4 and NICU."""
    run_dir = "./run1"
    selected_projects = ["CREV4", "NICU"]
    result = get_number_samples_per_run_from_samplesheet(
        run_dir, {"./run1": [["run1"], 0]}, selected_projects, []
    )
    expected_rawfolder = {"./run1": [["run1"], 2]}
    assert result[0] == expected_rawfolder
|
||
def test_get_number_samples_per_run_from_samplesheet_nosamples():
    """Expect the sample count for ./run1 to stay 0 when filtering on projects WGS and RNASEQ."""
    run_dir = "./run1"
    selected_projects = ["WGS", "RNASEQ"]
    result = get_number_samples_per_run_from_samplesheet(
        run_dir, {"./run1": [["run1"], 0]}, selected_projects, []
    )
    expected_rawfolder = {"./run1": [["run1"], 0]}
    assert result[0] == expected_rawfolder
|
||
def test_get_number_samples_per_run_from_samplesheet_nosamplesheet():
    """Expect the sample count for ./run2 to stay 0 (presumably a run folder without SampleSheet.csv)."""
    run_dir = "./run2"
    selected_projects = ["CREV4", "NICU"]
    result = get_number_samples_per_run_from_samplesheet(
        run_dir, {"./run2": [["run2"], 0]}, selected_projects, []
    )
    expected_rawfolder = {"./run2": [["run2"], 0]}
    assert result[0] == expected_rawfolder
|
||
def test_get_number_samples_per_run_from_samplesheet_nowarning():
    """Expect no warnings to be added for ./run1 with projects CREV4 and NICU."""
    run_dir = "./run1"
    selected_projects = ["CREV4", "NICU"]
    result = get_number_samples_per_run_from_samplesheet(
        run_dir, {"./run1": [["run1"], 0]}, selected_projects, []
    )
    returned_warnings = result[1]
    assert returned_warnings == []
|
||
def test_get_number_samples_per_run_from_samplesheet_with_warning_nosamplsheet():
    """Expect exactly one 'no samplesheet' warning naming the run folder for ./run2."""
    folder = "./run2"
    selected_projects = ["CREV4", "NICU"]
    result = get_number_samples_per_run_from_samplesheet(
        folder, {"./run2": [["run2"], 0]}, selected_projects, []
    )
    expected_warnings = [f"no samplesheet for run {folder}, assuming unknown number of samples in run"]
    assert result[1] == expected_warnings
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ik zou hier de argumenten wat specifieker maken, bijv. run_folder i.p.v. folder.