diff --git a/CHANGELOG.md b/CHANGELOG.md index bfcf14bd8..b766101ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All changed fall under either one of these types: `Added`, `Changed`, `Deprecate ### Changed +- Checking for validity of samples.tsv now happens with pandasschema - Explicit priority arguments to all group jobs (aligner + samtools_presort) - Snakemake version (5.22.1) - Reduced threads on salmon indexing (matching aligners) diff --git a/requirements.yaml b/requirements.yaml index 4175262ff..7783098d9 100644 --- a/requirements.yaml +++ b/requirements.yaml @@ -17,4 +17,5 @@ dependencies: - pkgs/main::pyyaml=5.3.1 - pkgs/main::beautifulsoup4=4.9.0 - conda-forge:pretty_html_table=0.9.dev0 + - conda-forge:pandas_schema=0.3.5 - bioconda::trackhub=0.1.2019.12.24 diff --git a/seq2science/rules/configuration_generic.smk b/seq2science/rules/configuration_generic.smk index b18975010..2dbff1f34 100644 --- a/seq2science/rules/configuration_generic.smk +++ b/seq2science/rules/configuration_generic.smk @@ -18,11 +18,13 @@ import urllib.request from bs4 import BeautifulSoup from multiprocessing.pool import ThreadPool from filelock import FileLock +from pandas_schema import Column, Schema +from pandas_schema.validation import MatchesPatternValidation, IsDistinctValidation from snakemake.logging import logger from snakemake.utils import validate from snakemake.utils import min_version - +from snakemake.exceptions import TerminatedException import seq2science @@ -102,7 +104,6 @@ for key, value in config.items(): value = os.path.abspath(os.path.join(config['result_dir'], value)) config[key] = re.split("\/$", value)[0] - # samples.tsv @@ -110,25 +111,29 @@ for key, value in config.items(): samples = pd.read_csv(config["samples"], sep='\t', dtype='str', comment='#') samples.columns = samples.columns.str.strip() -# check that the columns are named assert all([col[0:7] not in ["Unnamed", ''] for col in samples]), \ (f"\nEncountered unnamed column in 
{config['samples']}.\n" + f"Column names: {str(', '.join(samples.columns))}.\n") -# check that the columns contains no irregular characters -assert not any(samples.columns.str.contains('[^A-Za-z0-9_.\-%]+', regex=True)), \ - (f"\n{config['samples']} may only contain letters, numbers and " + - "percentage signs (%), underscores (_), periods (.), or minuses (-).\n") +# use pandas_schema to check that the samples file is filled out correctly; the pattern is anchored (^...$) because MatchesPatternValidation does an unanchored search, so without anchors any cell with a single allowed character would pass +allowed_pattern = r'^[A-Za-z0-9_.\-%]+$' +distinct_columns = ["sample"] +if "descriptive_name" in samples.columns: + distinct_columns.append("descriptive_name") + +distinct_schema = Schema( + [Column(col, [MatchesPatternValidation(allowed_pattern), + IsDistinctValidation()] if col in distinct_columns else [MatchesPatternValidation(allowed_pattern)], allow_empty=True) for col in + samples.columns]) -# check that the file contains no irregular characters -assert not any([any(samples[col].str.contains('[^A-Za-z0-9_.\-%]+', regex=True, na=False)) for col in samples if col != "control"]), \ - (f"\n{config['samples']} may only contain letters, numbers and " + - "percentage signs (%), underscores (_), periods (.), or minuses (-).\n") +errors = distinct_schema.validate(samples) -# check that sample names are unique -assert len(samples["sample"]) == len(set(samples["sample"])), \ - (f"\nDuplicate samples found in {config['samples']}:\n" + - f"{samples[samples.duplicated(['sample'], keep=False)].to_string()}\n") +if len(errors): + logger.error("\nThere are some issues with parsing the samples file:") + for error in errors: + logger.error(error) + logger.error("") # empty line + raise TerminatedException # for each column, if found in samples.tsv: # 1) if it is incomplete, fill the blanks with replicate/sample names @@ -365,12 +370,13 @@ with FileLock(layout_cachefile_lock): config['layout'] = {**{key: value for key, value in config['layout'].items() if key in samples.index}, **{key: value for key, value in config['layout'].items() if "control" 
in samples and key in samples["control"].values}} -for sample in samples.index: - if sample not in config["layout"]: - raise ValueError(f"The command to lookup sample {sample} online failed!\n" - f"Are you sure this sample exists..? Downloading samples with restricted " - f"access is currently not supported. We advise you to download the sample " - f"manually, and continue the pipeline from there on.") +bad_samples = [sample for sample in samples.index if sample not in config["layout"]] +if len(bad_samples) > 0: + logger.error(f"\nThe instructions to lookup sample(s) {' '.join(bad_samples)} online failed!\n" + f"Are you sure these sample(s) exists..? Downloading samples with restricted " + f"access is currently not supported. We advise you to download the sample " + f"manually, and continue the pipeline from there on.\n") + raise TerminatedException sample_to_srr = {**{k: v.get() for k, v in trace_layout1.items() if v.get() is not None}, **{k: v.get() for k, v in trace_layout2.items() if v.get() is not None}} diff --git a/tests/atac_seq/complex_samples.tsv b/tests/atac_seq/complex_samples.tsv index 03c784540..2c540d138 100644 --- a/tests/atac_seq/complex_samples.tsv +++ b/tests/atac_seq/complex_samples.tsv @@ -2,8 +2,8 @@ sample assembly replicate condition descriptive_name S1_1 assembly1 R1_1 A sample1_1 S1_2 assembly1 R1_1 A sample1_2 -S2_1 assembly1 R2_1 A sample2 -S2_2 assembly1 R2_1 A sample2 +S2_1 assembly1 R2_1 A sample2_1 +S2_2 assembly1 R2_1 A sample2_2 S3_1 assembly1 B sample3 S4_1 assembly1 B S5_1 assembly2 C diff --git a/tests/dag_tests.sh b/tests/dag_tests.sh index 2705b9a69..d1e4243e1 100644 --- a/tests/dag_tests.sh +++ b/tests/dag_tests.sh @@ -237,7 +237,7 @@ if [ $1 = "atac-seq" ]; then assert_rulecount $1 bwa_mem 7 printf "\ninput control different across same condition\n" - seq2science run atac-seq -n --cores $CORES --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True 
config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag + seq2science run atac-seq -n --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag assert_rulecount $1 genrich_pileup 4 assert_rulecount $1 macs2_callpeak 4 diff --git a/tests/rna_seq/complex_samples.tsv b/tests/rna_seq/complex_samples.tsv index 0d8a1919f..6130bb228 100644 --- a/tests/rna_seq/complex_samples.tsv +++ b/tests/rna_seq/complex_samples.tsv @@ -2,8 +2,8 @@ sample assembly replicate stage batch descriptive_name S1_1 assembly1 R1_1 1 batch1 sample1_1 S1_2 assembly1 R1_1 1 batch1 sample1_2 -S2_1 assembly1 R2_1 1 batch2 sample2 -S2_2 assembly1 R2_1 1 batch2 sample2 +S2_1 assembly1 R2_1 1 batch2 sample2_1 +S2_2 assembly1 R2_1 1 batch2 sample2_2 S3_1 assembly1 R3_1 1 batch1 sample3 S4_1 assembly1 R4_1 2 batch1 S5_1 assembly1 R5_1 2 batch1 diff --git a/tests/rna_seq/dag_samples.tsv b/tests/rna_seq/dag_samples.tsv index 2853c74aa..4fa334524 100644 --- a/tests/rna_seq/dag_samples.tsv +++ b/tests/rna_seq/dag_samples.tsv @@ -2,8 +2,8 @@ sample assembly replicate stage batch descriptive_name S1_1 assembly1 R1_1 1 batch1 sample1_1 S1_2 assembly1 R1_1 1 batch1 sample1_2 -S2_1 assembly1 R2_1 1 batch2 sample2 -S2_2 assembly1 R2_1 1 batch2 sample2 +S2_1 assembly1 R2_1 1 batch2 sample2_1 +S2_2 assembly1 R2_1 1 batch2 sample2_2 S3_1 assembly1 R3_1 1 batch1 sample3 S4_1 assembly1 R4_1 2 batch1 S5_1 assembly1 R5_1 2 batch1