Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pandasschema #467

Merged
merged 10 commits into from
Aug 17, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ All changed fall under either one of these types: `Added`, `Changed`, `Deprecate

### Changed

- The validity of samples.tsv is now checked with pandas_schema
- Explicit priority arguments to all group jobs (aligner + samtools_presort)
- Snakemake version (5.22.1)
- Reduced threads on salmon indexing (matching aligners)
Expand Down
1 change: 1 addition & 0 deletions requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ dependencies:
- pkgs/main::pyyaml=5.3.1
- pkgs/main::beautifulsoup4=4.9.0
- conda-forge:pretty_html_table=0.9.dev0
- conda-forge:pandas_schema=0.3.5
- bioconda::trackhub=0.1.2019.12.24
48 changes: 27 additions & 21 deletions seq2science/rules/configuration_generic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ import urllib.request
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from filelock import FileLock
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, IsDistinctValidation

from snakemake.logging import logger
from snakemake.utils import validate
from snakemake.utils import min_version

from snakemake.exceptions import TerminatedException

import seq2science

Expand Down Expand Up @@ -102,33 +104,36 @@ for key, value in config.items():
value = os.path.abspath(os.path.join(config['result_dir'], value))
config[key] = re.split("\/$", value)[0]


# samples.tsv


# read the samples.tsv file as all text, drop comment lines
samples = pd.read_csv(config["samples"], sep='\t', dtype='str', comment='#')
samples.columns = samples.columns.str.strip()
siebrenf marked this conversation as resolved.
Show resolved Hide resolved

# check that the columns are named
assert all([col[0:7] not in ["Unnamed", ''] for col in samples]), \
(f"\nEncountered unnamed column in {config['samples']}.\n" +
f"Column names: {str(', '.join(samples.columns))}.\n")

# check that the columns contains no irregular characters
assert not any(samples.columns.str.contains('[^A-Za-z0-9_.\-%]+', regex=True)), \
(f"\n{config['samples']} may only contain letters, numbers and " +
"percentage signs (%), underscores (_), periods (.), or minuses (-).\n")
# use pandas_schema to check that the samples file is filled out correctly
allowed_pattern = r'[A-Za-z0-9_.\-%]+'
distinct_columns = ["sample"]
if "descriptive_name" in samples.columns:
distinct_columns.append("descriptive_name")

distinct_schema = Schema(
[Column(col, [MatchesPatternValidation(allowed_pattern),
IsDistinctValidation(ignore_nan=True)] if col in distinct_columns else [MatchesPatternValidation(allowed_pattern)], allow_empty=True) for col in
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ignore_nan doesn't exist now, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! The cat 🐱 didn't let me sleep last night

samples.columns])

# check that the file contains no irregular characters
assert not any([any(samples[col].str.contains('[^A-Za-z0-9_.\-%]+', regex=True, na=False)) for col in samples if col != "control"]), \
(f"\n{config['samples']} may only contain letters, numbers and " +
"percentage signs (%), underscores (_), periods (.), or minuses (-).\n")
errors = distinct_schema.validate(samples)

# check that sample names are unique
assert len(samples["sample"]) == len(set(samples["sample"])), \
(f"\nDuplicate samples found in {config['samples']}:\n" +
f"{samples[samples.duplicated(['sample'], keep=False)].to_string()}\n")
if len(errors):
logger.error("\nThere are some issues with parsing the samples file:")
for error in errors:
logger.error(error)
logger.error("") # empty line
raise TerminatedException

# for each column, if found in samples.tsv:
# 1) if it is incomplete, fill the blanks with replicate/sample names
Expand Down Expand Up @@ -365,12 +370,13 @@ with FileLock(layout_cachefile_lock):
config['layout'] = {**{key: value for key, value in config['layout'].items() if key in samples.index},
**{key: value for key, value in config['layout'].items() if "control" in samples and key in samples["control"].values}}

for sample in samples.index:
if sample not in config["layout"]:
raise ValueError(f"The command to lookup sample {sample} online failed!\n"
f"Are you sure this sample exists..? Downloading samples with restricted "
f"access is currently not supported. We advise you to download the sample "
f"manually, and continue the pipeline from there on.")
bad_samples = [sample for sample in samples.index if sample not in config["layout"]]
if len(bad_samples) > 0:
logger.error(f"\nThe instructions to lookup sample(s) {' '.join(bad_samples)} online failed!\n"
f"Are you sure these sample(s) exists..? Downloading samples with restricted "
f"access is currently not supported. We advise you to download the sample "
f"manually, and continue the pipeline from there on.\n")
raise TerminatedException

sample_to_srr = {**{k: v.get() for k, v in trace_layout1.items() if v.get() is not None},
**{k: v.get() for k, v in trace_layout2.items() if v.get() is not None}}
Expand Down
4 changes: 2 additions & 2 deletions tests/atac_seq/complex_samples.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
sample assembly replicate condition descriptive_name
S1_1 assembly1 R1_1 A sample1_1
S1_2 assembly1 R1_1 A sample1_2
S2_1 assembly1 R2_1 A sample2
S2_2 assembly1 R2_1 A sample2
S2_1 assembly1 R2_1 A sample2_1
S2_2 assembly1 R2_1 A sample2_2
S3_1 assembly1 B sample3
S4_1 assembly1 B
S5_1 assembly2 C
Expand Down
2 changes: 1 addition & 1 deletion tests/dag_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ if [ $1 = "atac-seq" ]; then
assert_rulecount $1 bwa_mem 7

printf "\ninput control different across same condition\n"
seq2science run atac-seq -n --cores $CORES --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag
seq2science run atac-seq -n --configfile tests/$WF/genrich_macs2.yaml --snakemakeOptions dryrun=True quiet=True config={samples:tests/atac_seq/complex_samples2.tsv,create_qc_report:True} | tee tests/local_test_results/${1}_dag
assert_rulecount $1 genrich_pileup 4
assert_rulecount $1 macs2_callpeak 4

Expand Down
4 changes: 2 additions & 2 deletions tests/rna_seq/complex_samples.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
sample assembly replicate stage batch descriptive_name
S1_1 assembly1 R1_1 1 batch1 sample1_1
S1_2 assembly1 R1_1 1 batch1 sample1_2
S2_1 assembly1 R2_1 1 batch2 sample2
S2_2 assembly1 R2_1 1 batch2 sample2
S2_1 assembly1 R2_1 1 batch2 sample2_1
S2_2 assembly1 R2_1 1 batch2 sample2_2
S3_1 assembly1 R3_1 1 batch1 sample3
S4_1 assembly1 R4_1 2 batch1
S5_1 assembly1 R5_1 2 batch1
Expand Down
4 changes: 2 additions & 2 deletions tests/rna_seq/dag_samples.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
sample assembly replicate stage batch descriptive_name
S1_1 assembly1 R1_1 1 batch1 sample1_1
S1_2 assembly1 R1_1 1 batch1 sample1_2
S2_1 assembly1 R2_1 1 batch2 sample2
S2_2 assembly1 R2_1 1 batch2 sample2
S2_1 assembly1 R2_1 1 batch2 sample2_1
S2_2 assembly1 R2_1 1 batch2 sample2_2
S3_1 assembly1 R3_1 1 batch1 sample3
S4_1 assembly1 R4_1 2 batch1
S5_1 assembly1 R5_1 2 batch1
Expand Down