Skip to content

Commit

Permalink
Separate curated data by L and S segments #12
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Jul 29, 2024
2 parents 3646ca3 + a0a4cf8 commit 9f95104
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 10 deletions.
15 changes: 11 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ workdir: workflow.current_basedir
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

segments = ['L', 'S']

wildcard_constraints:
segment = "|".join(segments)

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
# Nextstrain-maintained ingest workflows will produce metadata files with the
Expand All @@ -17,8 +22,9 @@ configfile: "defaults/config.yaml"
# TODO: Add link to centralized docs on standard Nextstrain metadata fields
rule all:
input:
"results/sequences.fasta",
"results/metadata.tsv",
sequences=expand("results/{segment}/sequences.fasta", segment=segments),
metadata=expand("results/{segment}/metadata.tsv", segment=segments),
metadata_all="results/all/metadata.tsv",


# Note that only PATHOGEN-level customizations should be added to these
Expand All @@ -28,15 +34,16 @@ rule all:
# by build-specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"

rule create_final_metadata:
input:
metadata="data/subset_metadata.tsv"
output:
metadata="results/metadata.tsv"
metadata="results/all/metadata.tsv"
shell:
"""
mv {input.metadata} {output.metadata}
cp {input.metadata} {output.metadata}
"""

# Allow users to import custom rules provided via the config.
Expand Down
8 changes: 6 additions & 2 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,9 @@ s3_dst: "s3://nextstrain-data/files/workflows/lassa"
# Mapping of files to upload
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.zst: results/sequences.fasta
all/metadata.tsv.zst: results/all/metadata.tsv
all/sequences.fasta.zst: results/all/sequences.fasta
L/metadata.tsv.zst: results/L/metadata.tsv
L/sequences.fasta.zst: results/L/sequences.fasta
S/metadata.tsv.zst: results/S/metadata.tsv
S/sequences.fasta.zst: results/S/sequences.fasta
4 changes: 4 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,7 @@ curate:
"abbr_authors",
"institution",
]

nextclade:
segment_reference: "../shared/lassa_{segment}.fasta"
min_seed_cover: 0.01
8 changes: 4 additions & 4 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ REQUIRED INPUTS:
OUTPUTS:
metadata = data/subset_metadata.tsv
seuqences = results/sequences.fasta
sequences = results/all/sequences.fasta
"""

Expand Down Expand Up @@ -62,7 +62,7 @@ rule curate:
annotations=config["curate"]["annotations"],
output:
metadata="data/all_metadata.tsv",
sequences="results/sequences.fasta",
sequences="results/all/sequences.fasta",
log:
"logs/curate.txt",
benchmark:
Expand Down Expand Up @@ -121,11 +121,11 @@ rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
output:
subset_metadata="data/subset_metadata.tsv",
metadata="data/subset_metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.subset_metadata}
{input.metadata} > {output.metadata}
"""
55 changes: 55 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences to split the sequences into L and S segments.
REQUIRED INPUTS:
metadata = data/subset_metadata.tsv
all_metadata = results/all/metadata.tsv
sequences = results/all/sequences.fasta
OUTPUTS:
metadata = results/{segment}/metadata.tsv
sequences = results/{segment}/sequences.fasta
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
"""

def _resolve_segment_reference(wildcards):
    """
    Return the Nextclade reference FASTA path for the requested segment.

    The path template comes from config["nextclade"]["segment_reference"] and
    contains a `{segment}` placeholder. The segment wildcard is upper case
    ("L" / "S"), while the reference files shipped in this commit are named
    with a lower-case segment (shared/lassa_l.fasta, shared/lassa_s.fasta).
    On a case-sensitive filesystem the verbatim substitution therefore points
    at a file that does not exist.

    The exact-case path is preferred; the lower-case variant is used only when
    the exact-case file is missing and the lower-case one is present. If
    neither exists the exact-case path is returned so that Snakemake reports
    the missing input under the name the config asked for.
    """
    import os.path

    template = config["nextclade"]["segment_reference"]
    as_given = template.format(segment=wildcards.segment)
    lowercased = template.format(segment=wildcards.segment.lower())

    if not os.path.exists(as_given) and os.path.exists(lowercased):
        return lowercased
    return as_given


rule run_nextclade_to_identify_segment:
    """
    Run Nextclade on all curated sequences against a single segment reference
    and write the resulting FASTA to results/{segment}/sequences.fasta.
    """
    input:
        # Not used by the shell command; kept so the rule's declared inputs
        # (and therefore the DAG) are unchanged.
        metadata = "data/subset_metadata.tsv",
        sequences = "results/all/sequences.fasta",
        # Input function instead of a plain template so that the file name
        # casing can be reconciled with the upper-case segment wildcard.
        segment_reference = _resolve_segment_reference,
    output:
        sequences = "results/{segment}/sequences.fasta",
    params:
        min_seed_cover = config["nextclade"]["min_seed_cover"],
    shell:
        """
        nextclade run \
            --input-ref {input.segment_reference} \
            --output-fasta {output.sequences} \
            --min-seed-cover {params.min_seed_cover} \
            --silent \
            {input.sequences}
        """

rule subset_metadata_by_segment:
    """
    Run `augur filter` with the combined metadata table and one segment's
    sequences, writing the filtered table to results/{segment}/metadata.tsv.
    The metadata ID column is taken from config["curate"]["output_id_field"].
    """
    input:
        sequences = "results/{segment}/sequences.fasta",
        metadata = "results/all/metadata.tsv",
    output:
        metadata = "results/{segment}/metadata.tsv",
    params:
        id_column = config["curate"]["output_id_field"],
    shell:
        """
        augur filter \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --metadata-id-columns {params.id_column} \
            --output-metadata {output.metadata}
        """
123 changes: 123 additions & 0 deletions shared/lassa_l.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
>KM822127_REF
caccaaccccagcagagagcccccacaacacagccaagcccacccagcccccagaaccgc
ccacatcagcaggcatgggcaacaagcagaccaggtccccacccaaaccagagcacccca
gaccaaccctgctacccgacgcatcccacctgggcccccaattctgcaagagctgctggt
ttgagaacaagggactggtggagtgtaataaccattacctctgtctaaactgtctcacac
tgcttctcagcgtgagcgacagatgtcctatttgtaagatgcccctccccaccaagctgg
cagtccgaacccaaccaagtgcacccccactcaaccagggcaacactcaatcctccccgc
ccccctacagcccctaactctcccaagacgggcccgcaccccacacccaggaacccaacc
aaacacacacacacagacaaacacatagaaagacacacacacacgcacacacacccacag
tccccacccgcacccccggggggaccccccgccgggggcccccccggggggctcagaggc
tggcatcactcaatgtcctctacctgcaccagtgtgtcgaacaatttaatgcacgtcctt
cccttcaatctcagctttccactgcttgactggatcatcacttcttttagtgacttacta
taacacagtgagtagcccttgaaatccacccagtcaccaactgcttcaagaagccgaaca
agaattggtctttgcaattctaacacagtcttcttcaacacagaaacaatgtcgacatcc
acccagtgcttcttgagcttcaatctgtcagaaaacattgagatcagtgcctctatcaaa
agatcttgggaaccatccctaacctcccttatgaacaactccaagtcttgatcgtgaatt
gctggggtaaaagtagcaagtctgctgtcacattcccaaagggctcctctactgatcact
aatggtacaggttcaatttcaggccccaccagtgtgaaatcatactgtgaaaactgcagc
ttcctcagatcaacaactatctcaatgggggttacaggccctaattctttagtgtcctct
aggaactcattaatgacctcagtatcgacaaaccatgcagatttccccttgaaaacagat
ccagacagcacaagcctccagcagtcaggcagcacacattggttgactgatccggaaaac
atggcttcaacattatcaactaaatacttaccctgatgatttattgagccaatgaaatta
agatgaatagaaaaattttctttaaaacttgcgtcttggcttctgatttggaatcttatt
gtctggctgaatctcactgaaccaactggtagttcactgattccatctaagaagtcttca
agacatttaaatttgacatcagttaacccacacatgatcaactgactgtaattgagtttt
gctggttctaaatcataatcaatccatgcataacccatcatcaaatcatatttgaacata
ggtctttctatacctttgtataccaccttctcaacaaaagaggaaaaggggccaagtctc
tgcaaaccttcaccctctgggatagaggttttatgagggaaccatgtaaatgatccaagc
gttctagttgttgcaacaaacgatctgatgtacgactcaaaataaatctgtctcataaaa
ttatcaacaacgtcactcgagctgacaaccctttcttctatcatgggttcatgtgtcctg
gtgtgtgacaatctcagctcagatgaaagcactatgtagtgttcctctctcttccacttc
actatgtgcgagacaagagatagtgcctcgcaattcacatctagtgcaacacaaagatct
agaaatttaattctaggtgaccacttcattttagtcgaggccaaatcactcatgaagggc
aacaaatgtttctcaaacagactggggtacagtctccttaatgagtgtattatatggttc
atgccaactctatcttctagcaatttagaagcagtaggttttaagggaaagtagtcacat
gggttatgtttgaaaaactccagctgtcctgctggcttagggttccctagaacccatgcc
cccaactctatcgctgttgataatgaaatgcacatataatcccataacaaagatctgaaa
taagataggacactcttcccttcagactcttctcttaccctcattgggaccctccaaatc
tggactccaaggtcgttagttaaacactcaacgccctgaattgccaggacctcagattgc
agtgatttgatgtataggttttccttgttagaccccctcacacacttgctgccaagtgtt
ctgcataatcctacaaagcctgaagcgattgaactttggaaagcagatttatttatggcc
tcagatagcagtttctgagctcctctagtgaatgtagaagacaatttattctgaatggtt
ttcactattgttggtattctctcatcatctaaactcactgccccagcataaatcaccttc
tgtctcagcaccattcttaatgggtgatgtgtagccagattcaaccaacaaatttctaga
tcagagtcatccaacatctcagtacccattaactcgcataaattccttaatgaagaaaca
tgttcacctgacagataattggtggtaaattcctcatgcaactcacctgtctttaatttg
ttaaacaactttcttagcattgatcttaccttactacaagcctctggggccaactcttca
atcagtctcatgatcctgtaactacggttgccgtcaacccaatccctaacatctgtgttg
caattgagtaagaatgggtcaatggggtacctcgcatacttcagcaagttcaaagttctc
ttctgaataagattacataggctaactggaacaccgttagcaatggactgatcaattatg
gtgtcaattgtctctgctaactggtgtggttctttacacttaacattatgtagcgcagca
gccacaaacttcgtcaagaggggaacctcatcaccccagacataaaatcttgatttaaat
tctgctacaaatcttccaatcacacttttcgggctgacaaacttatttaactgatcactc
atatagtaatgaaattctaacagggttttaaactcatcttggtctctagacattaactct
gtcaggttctggtcaaaaagagaaatttggtcatcactagaagtgtaagcatcaacttgc
cctccacaaacacaactcacagcatagtttataaatctctccgaaatgagtccataaaag
tctgaagtgttgtgcagaataccttgccccatgtcaaggatggaacttatatgagatggc
acaacacccaaatgaaagtttgagtaaaagaaatcttctgttagtgtctgagatgtactc
tttctcaacccaagttgtgcctttatataagatttcatcattgctgacactacattaaag
ggaatttctaccatcttgtgcatgtgccatgccaatagtgttgagagataatcctttcct
ttggcatcagcctgagtgtccccagtgagtagaattaagttctgtagggttgcaaggaat
aggaatgggcacatcataggaccccatttgctgtggtccatgctgtaggatacatgtgct
tgtgagacatttaatttcattgacaaaattgcattctcaaactctctttcatcatttaaa
caacttcccgagaattgcgtcgtcaatgcttcaaaataatcctctatcaatcttgtgaac
attttggtcctaaggtctccaatataaagttctctattacctccaacctgctctttgtat
gacagcgagaattttagtctccctgtatcaggaccaactgagttgaatgattgtggagat
tcctggctataaaaacacagatttttcaacatggcagttgtgcaatttgttagtgagagt
gctttactaagtgcttcggagttgctctctctttcacttattctagtgtcggttgacagt
ttgtctgtgtcaaatttgaaattgagacacttcgacttatagtgggtatacctccccatc
aacctgttgccattcatcaacaagagaattgatttgaaacacaggaaatattcctgatct
gaataagtcctggttacaactgcttttgttagttcaccaatagggcaagacaccatgggt
ccacaataaaagtacttctgcttaaactcttggtggtgatacactatgtcacagaactct
ttataaaaagtgtcagggagtgtgttataatcaaagtcctccaccatgtggtttgagagt
tcccctttaatcaatctgatgtacagtttttccttaacaatctcatttaaatcgtcaatt
gaacaaatctccccggttgttgagcgctggtccggtcctttcagatctctatactgatcc
acaattttttctatagtttcttttaactcatcaaaatatgacatcgcattcccttcaagc
aatatctcatccaggtcggccctgtctgtttctacatctttcccttgagaccccaaaacc
aaattgctcatagcctgctggactttgtactcatagtcttgcttatctaataaatactta
cctttccttgaaaaaacctcggtcaattgtgtaactgctaaggctgttagcttgttgaaa
tcataatttaagaccctacatccatctgtgtacttattaatcacaacactcttattgctt
gccagatctaaagcagtcgcacagccactggttgacagagggtcttttaattctttcctc
acttcttttcctttaaagagtgaaccattgttaaaagaagatgtaagcaaggacaagtac
ttcttggagacaccaggtttcttgtaccttatttcagctgattccacacagcattcctta
cctaagaactttttagcgttgtacaccatttcactcaactcatccgactctgcatgatcc
agaggattgacgctaacatgcccaaactctagttttggttctaagaatttctcaaagcac
ttgatctgatctgttagtctatcaggggtttccttagtgataaaatgacacatgtaagaa
atgttcaaaacaaatttgaacctgtttgttagcatactagtcacattaggagataaaaca
gtgcttaacagggctcttaccactctgaaaagcaagtactcaacatctgtaatcagcttc
tctttgatcttgttgattaagtctttgtgataataatcagagacaaaggccattatgaag
tatctaagattttgcaaaaacttctgacatcgcttactagggtgggtcaagattaaaacc
aaaatcattttggttagcactttaattgaggatatatcatccttaagctcagaacaatct
tcaatccagctgaccatgacttctaccacctttgtcagaacttcactggaaaagattgcg
gggaaaaacctcttgggatccgcataaaaggaacagacttcaccaactaagttgttgttt
accgcataacattttgaacactcgcccgttttttgataaagtaagctatattccactcca
tctatgaagaactgttgacagtatgcttctttgcatcttaccttctggtatcttgctgct
ccaagctcattttgcctcaacttaacagtggaggaagtcttcattgaattcaccaatgca
agactcaatgttgagagcctttcaagatcagctgactctgaaacgctcactgtgcccatt
gagtagggaaacagtccttcgtcagacttttgataagacattgttggcacaacacctgat
gcctcacagttcataactttactaaatacatgcccatccagaattgttagatcaacccca
taggtgtccacatttacatttaggtctttgagggcacttattgctttggtaacagttttc
tgcaacatgcaactcaagatttgatttctatctaatctaacggagtgagtgcttgatgct
gattttgaacattcagattgcaatcttctatcaacccaccttttcaggtccctcttagtg
tattctaaagacaccaagcggtcattcacactgataaaactggatcctaaccagccttcc
tgagcttgatcttcacatttaatcttgtccctctcataagcaaggagaatcattacatca
aaaatcaatagaagtttccttcttgtgttcaaaattctcaaagatttgactttgttgagg
aatgacctccaacaatgcatctgactgctgtttgtgggtgatgatctatcaaccccattg
gttaaatctgcatacaagaatttcaacacaggagaggctcttttaaaggaatacatcaat
tcttccactccatcttcattagaaaccatttcagaatcatacaagcagtgaaattctttt
aacagtctttctttgtcaactttaacaaattgatcttctatttggccctgccttaacttg
tttctgaatatctgatactcctcctcaatctgattcttaacctcatgggcagtcattttg
ttgttgatgccttgatggcagctggcaattatatcatcaaaatggttggttcgtttatct
aataacacattgatgctttctatacctgaaaaccggctagtgctgactgaaagagattcg
cataatctagagtactcagactcttcaaagagtgaattgctctcctgagcatatttcatc
aaggagaatagtgtatctctgagtttgtcattcacccagtccggaatttgttcattataa
aatgtagttcttccgtcaataagtggtattaggtttatatcaattgacttcaagtcattc
tttagttgctctaacttctttaagtcctctaggtacttctgctcaaagttcactggggat
gatctcacaaagcactcaaggagtatgaggacattcccattcaatttgaagccatcaggc
acaacaaagcataatgatggtgttaaaatccctaattcatggagaattacctcaacagag
cgtgcattgctgttgtgctcacaaccattggccttgcaggaatcaacctctatacataag
gataaaagttttagtccttccatcaacaacatcctcggctcagtctgcaccaagaatgcc
aatttttgtctagataatctctcatcatcagaaagatattttgaaacaagatcttttaca
taagctatctcctcctccattaaagaagttgtcaacaatagagcgtctaaatgcctagga
tcctcggtgcgc
58 changes: 58 additions & 0 deletions shared/lassa_s.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
>GU481068_REF
atttaggattgcgcttcaaagagacctttgtgtgactgagctttatcaaaccatgggaca
aatagtaacattctttcaggaagttcctcatgtcatagaagaggtaatgaatattgtcct
aattgctcttagtctcctggcgatactgaaggggatttacaatgttgccacttgtggcct
ttttgggctgatctcctttcttcttctgtgtggaagatcatgctcaacaacctacaaagg
tgtctatgagctacaaactctggagttggacatggcaaacctcaacatgactatgcctct
atcttgtacaaaaaacagcagccatcactatatcatggttgggaatgaaacaggcctaga
actgaccttgacaaacacaagtataatcaatcacaagttctgcaacctttctgatgcaca
caagaagaatctttatgatcatgccttgatgagcatcatttcaactttccacttatccat
tccaaatttcaatcagtatgaagcaatgagttgtgactttaatggagggaaaataagtgt
gcaatacaatctcagtcatgcttatgctgtagatgcagccaaccactgtgggaccattgc
caacggtgtcctacagactttcatgagaatggcttggggcggcagttacatagcacttga
ttctggaaaagggaactgggattgcataatgacttcctatcagtacttgattatccaaaa
caccacctgggaagatcactgtcagttttcccgcccgtcccctattggatacctaggact
gttatcacaaaggactagagatatttacattagtcggagacttcttggaactttcacctg
gaccctctctgactctgagggcaatgaggcacccggtggatactgccttactagatggat
gctaattgaagctgaattgaagtgctttgggaacacagccattgcgaagtgtaatgagaa
gcacgatgaggaattctgtgacatgttgagacttttcgacttcaacaaacaagccataaa
aaggctaaaagcagaagcacagatgagcatccaattgataaataaagcagtaaatgcact
aatcaatgaccaattgatcatgaagaaccacttgagggatatcatggggattccctattg
caattacagcaagtactggtatctgaaccatactgtgacagggaaaacatcattgccaag
gtgttggcttatttcaaatggttcttacctaaatgagacacatttctccgatgatattga
acagcaagcagataacatgataacagaactgctgcaaaaggagtatatggacaggcaggg
gaaaacacccttagggttagtggatctttttgttttcagcactagcttttatctcataag
tatcttcctccacttagtcaaaatcccgacccataggcacattgtagggagaccttgccc
caagccacacagactcaaccacatgggcatatgctcatgtggtttgtacaaacatcctgg
tgtaccagtcaagtggaagagatagaaatagacccattaacgggcccccgtgacccaccg
ccgaaaggcggtgggtcacgggggcgtccatttacaggacgactttgggacttgaagttc
tgaacaccatatctcttgggagaacagctctcaagattggtatattgagtcctcctgaca
cagctgcgtcaaacattatgcaatccattaaagcacaatgtggagtgatctcctctttgc
ctcctctcttctttttctcaacaaccactccagtgtgcatgtgacacaaatctttacact
gatcccagacagcattttcaaatctcctagaatcagccttatttaatgagatgtcaatga
gcttgatgtcccttctcccctgagaatccaagagttttttaatgtcgtctgaaccttggc
acgtcaacaccatgttgcgggggagagcctcaatgactgcactggttagaccaggctgag
cagaaaagagatctgtcacatcaatcccatgagaatatttggcatcttgtttgaactgtt
ttaaatccgttggttctctgaagaaatgtatataacagcctgacataggttggtaaagag
ctatctcaacagggtcttctggacgaccttcaatgtctatccaggttttggcgcttgggt
caagttgcatcattgaatctttgagtgtcatcagttgagaataggtcagccctgttggga
acccagcagattgcagagatttgttggatccagcagtacccactttctgtggtttcccat
ctgactcaaggtctacagtggtattctcccaagctctacccacaatggaggttcttgaag
ctatgtagggccagccgtccccagagagacaaattttgtaaagtatgttttcataagggt
ttctttcaccaggtgtgtctgaaacaaacattcccagggacctttttacctttagaatag
acttcagaatgccatccattgtctgaggcgtcaccttgatagtttccaacatgttacccc
catcgagcatgcaagctccggccttcactgcagctcccaaactaaaattataaccagaaa
tatttagggagctcttcttggtatctaccatattcagtataggatggctctgggaaagtc
tgtctaggtcggaactattggggtatttagctgtgtatattaatcccaagtctgttagcg
ccagaacggcgtcatttaagtcaacctgaccctgtttagtgagacatgctagcgttaaac
taggcatggtgccaaattggttgttgaggaggtccgggtttttgacatcccacactctga
caactccgtctcttccaggttgggttccctgagcaccaccgaccatgcctatcatactca
acaacgccttcctctgctcaagttgttgtgtgctcaagttccccatatacacacctgaac
ttaatggtctctccgtcctaataacctttgacttcaatttctctagatcagctgctaaag
tcagcaagtcgtctgaggtcaatgttccaaccctcagaacactcttttgttgtgttgact
tgagctcgacaaggttgttaactgcctgatttaggtctctaagacgttttaggtcaccgt
catctcttttctgtttgcgcatcaagcgctggacattgctgacctcagagaagtcgagac
catgcagaagggcctgagcatccttgaccacttggagttttatattagagcaatagcctg
aaagctctctcctcaatgactgtgtccaaaggaatgatttcacttccttggaagcactca
tcctggttgatgttgatcggaactcactggttgaaagtgttatccagtaaatcaacagta
gtaggcgcaatctaaaa

0 comments on commit 9f95104

Please sign in to comment.