vanheeringen-lab · Maarten-vd-Sande · Aug 17, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020
@@ -10,6 +10,7 @@ All changed fall under either one of these types: `Added`, `Changed`, `Deprecate
 
 ### Changed
 
+- The validity of samples.tsv is now checked with pandas_schema
 - Explicit priority arguments to all group jobs (aligner + samtools_presort)
 - Snakemake version (5.22.1)
 - Reduced threads on salmon indexing (matching aligners)

@@ -17,4 +17,5 @@ dependencies:
   - pkgs/main::pyyaml=5.3.1
   - pkgs/main::beautifulsoup4=4.9.0
   - conda-forge:pretty_html_table=0.9.dev0
+  - conda-forge:pandas_schema=0.3.5
   - bioconda::trackhub=0.1.2019.12.24
@@ -18,11 +18,13 @@ import urllib.request
 from bs4 import BeautifulSoup
 from multiprocessing.pool import ThreadPool
 from filelock import FileLock
+from pandas_schema import Column, Schema
+from pandas_schema.validation import MatchesPatternValidation, IsDistinctValidation
 
 from snakemake.logging import logger
 from snakemake.utils import validate
 from snakemake.utils import min_version
-
+from snakemake.exceptions import TerminatedException
 
 import seq2science
 
@@ -102,33 +104,36 @@ for key, value in config.items():
             value = os.path.abspath(os.path.join(config['result_dir'], value))
         config[key] = re.split("\/$", value)[0]
 
-
 # samples.tsv
 
 
 # read the samples.tsv file as all text, drop comment lines
 samples = pd.read_csv(config["samples"], sep='\t', dtype='str', comment='#')
 samples.columns = samples.columns.str.strip()
 
-# check that the columns are named
 assert all([col[0:7] not in ["Unnamed", ''] for col in samples]), \
     (f"\nEncountered unnamed column in {config['samples']}.\n" +
      f"Column names: {str(', '.join(samples.columns))}.\n")
 
-# check that the columns contains no irregular characters
-assert not any(samples.columns.str.contains('[^A-Za-z0-9_.\-%]+', regex=True)), \
-    (f"\n{config['samples']} may only contain letters, numbers and " +
-    "percentage signs (%), underscores (_), periods (.), or minuses (-).\n")
+# use pandas_schema to check that the samples file is filled out correctly
+allowed_pattern = r'[A-Za-z0-9_.\-%]+'
+distinct_columns = ["sample"]
+if "descriptive_name" in samples.columns:
+    distinct_columns.append("descriptive_name")
+
+distinct_schema = Schema(
+    [Column(col, [MatchesPatternValidation(allowed_pattern),
+                  IsDistinctValidation(ignore_nan=True)] if col in distinct_columns else [MatchesPatternValidation(allowed_pattern)], allow_empty=True) for col in
+     samples.columns])
 
-# check that the file contains no irregular characters
-assert not any([any(samples[col].str.contains('[^A-Za-z0-9_.\-%]+', regex=True, na=False)) for col in samples if col != "control"]), \
-    (f"\n{config['samples']} may only contain letters, numbers and " +
-    "percentage signs (%), underscores (_), periods (.), or minuses (-).\n")
+errors = distinct_schema.validate(samples)
 
-# check that sample names are unique
-assert len(samples["sample"]) == len(set(samples["sample"])), \
-    (f"\nDuplicate samples found in {config['samples']}:\n" +
-     f"{samples[samples.duplicated(['sample'], keep=False)].to_string()}\n")
+if len(errors):
+    logger.error("\nThere are some issues with parsing the samples file:")
+    for error in errors:
+        logger.error(error)
+    logger.error("")  # empty line
+    raise TerminatedException
 
 # for each column, if found in samples.tsv:
 # 1) if it is incomplete, fill the blanks with replicate/sample names
@@ -365,12 +370,13 @@ with FileLock(layout_cachefile_lock):
 config['layout'] = {**{key: value for key, value in config['layout'].items() if key in samples.index},
                     **{key: value for key, value in config['layout'].items() if "control" in samples and key in samples["control"].values}}
 
-for sample in samples.index:
-    if sample not in config["layout"]:
-        raise ValueError(f"The command to lookup sample {sample} online failed!\n"
-                         f"Are you sure this sample exists..? Downloading samples with restricted "
-                         f"access is currently not supported. We advise you to download the sample "
-                         f"manually, and continue the pipeline from there on.")
+bad_samples = [sample for sample in samples.index if sample not in config["layout"]]
+if len(bad_samples) > 0:
+    logger.error(f"\nThe instructions to lookup sample(s) {' '.join(bad_samples)} online failed!\n"
+                 f"Are you sure these sample(s) exists..? Downloading samples with restricted "
+                 f"access is currently not supported. We advise you to download the sample "
+                 f"manually, and continue the pipeline from there on.\n")
+    raise TerminatedException
 
 sample_to_srr = {**{k: v.get() for k, v in trace_layout1.items() if v.get() is not None},
                  **{k: v.get() for k, v in trace_layout2.items() if v.get() is not None}}

@@ -2,8 +2,8 @@
 sample	assembly	replicate	condition	descriptive_name
 S1_1	assembly1	R1_1	A	sample1_1
 S1_2	assembly1	R1_1	A	sample1_2
-S2_1	assembly1	R2_1	A	sample2
-S2_2	assembly1	R2_1	A	sample2
+S2_1	assembly1	R2_1	A	sample2_1
+S2_2	assembly1	R2_1	A	sample2_2
 S3_1	assembly1		B	sample3
 S4_1	assembly1		B
 S5_1	assembly2		C

@@ -237,7 +237,7 @@ if [ $1 = "atac-seq" ]; then
   assert_rulecount $1 bwa_mem 7
 
   printf "\ninput control different across same condition\n"
-  seq2science run atac-seq -n --cores $CORES --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag
+  seq2science run atac-seq -n --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag
   assert_rulecount $1 genrich_pileup 4
   assert_rulecount $1 macs2_callpeak 4
 

@@ -2,8 +2,8 @@
 sample	assembly	replicate	stage	batch	descriptive_name
 S1_1	assembly1	R1_1	1	batch1	sample1_1
 S1_2	assembly1	R1_1	1	batch1	sample1_2
-S2_1	assembly1	R2_1	1	batch2	sample2
-S2_2	assembly1	R2_1	1	batch2	sample2
+S2_1	assembly1	R2_1	1	batch2	sample2_1
+S2_2	assembly1	R2_1	1	batch2	sample2_2
 S3_1	assembly1	R3_1	1	batch1	sample3
 S4_1	assembly1	R4_1	2	batch1
 S5_1	assembly1	R5_1	2	batch1

@@ -2,8 +2,8 @@
 sample	assembly	replicate	stage	batch	descriptive_name
 S1_1	assembly1	R1_1	1	batch1	sample1_1
 S1_2	assembly1	R1_1	1	batch1	sample1_2
-S2_1	assembly1	R2_1	1	batch2	sample2
-S2_2	assembly1	R2_1	1	batch2	sample2
+S2_1	assembly1	R2_1	1	batch2	sample2_1
+S2_2	assembly1	R2_1	1	batch2	sample2_2
 S3_1	assembly1	R3_1	1	batch1	sample3
 S4_1	assembly1	R4_1	2	batch1
 S5_1	assembly1	R5_1	2	batch1