type_pipe_2.3.sh
#!/bin/bash
#Authors: Logan Fink, Curtis Kapsak
#Usage: script to type bacteria and characterize AMR
#Permission to copy and modify is granted without warranty of any kind
#Last revised 09/08/18 - LDF
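#Example invocation (run from the directory that holds the fastq files): ./type_pipe_2.3.sh -l 4600000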
#Set all the variables that need to be set
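# -l <bp>: expected genome length used for read-coverage estimates (default 5000000)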
while test $# -gt 0
do
case $1 in
-l)
SEQUENCE_LEN=$2
shift
;;
esac
shift
done
#This function will check if the file exists before trying to remove it
remove_file() {
if [ -e "$1" ]; then
rm -rf "$1"
fi
}
#This function will check to make sure the directory doesn't already exist before trying to create it
make_directory() {
if [ -e "$1" ]; then
echo "Directory $1 already exists"
else
mkdir "$1"
echo "Directory $1 has been created"
fi
}
if [ -z "$SEQUENCE_LEN" ]; then
SEQUENCE_LEN=5000000
fi
echo "Sequence Length: $SEQUENCE_LEN"
##### Move all fastq files from fastq_files directory up one directory, remove fastq_files folder #####
#if [[ -n "$(find ./fastq_files)" ]]; then
if [[ -e ./fastq_files ]]; then
echo "Moving fastq files from ./fastq_files to ./ (top-level DIR)"
mv ./fastq_files/* .
rm -rf ./fastq_files
fi
declare -a srr=() #PASTE IN ANY SRR NUMBERS INTO FILE named: SRR
if [ -e ./SRR ]; then
while IFS= read -r line; do
srr+=("$line")
done < ./SRR
fi
#find . -maxdepth 1 -name '*fastq*' |cut -d '-' -f 1|cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '-' and '_')
find . -maxdepth 1 -name '*fastq*' |cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '_')
declare -a tmp=()
while IFS= read -r line; do
tmp+=("$line")
done < tmp1
id=("${tmp[@]}" "${srr[@]}") #Combine the automatically generated list with the SRR list
id=($(printf "%s\n" "${id[@]}"|sort -u)) #Get all unique values and output them to an array
echo ${id[@]}
remove_file tmp1
##### Fetch and fastq-dump all reads from NCBI identified by "SRR" #####
for i in ${srr[@]}; do
echo $i
if [[ -n "$(find *$i* 2>/dev/null)" ]]; then
echo "Files are here."
else
echo 'prefetching '$i'...'
prefetch $i
echo 'Creating read files for '$i'...'
fastq-dump --gzip --skip-technical --dumpbase --split-files --clip $i
fi
done
##### These are the QC trimming scripts as input to trimClean #####
make_directory clean
echo "cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning"
for i in *R1_001.fastq.gz; do
b=`basename ${i} _R1_001.fastq.gz`;
if [[ -n "$(find -path ./clean/${b}.cleaned.fastq.gz)" ]]; then
continue
else
run_assembly_shuffleReads.pl ${b}"_R1_001.fastq.gz" ${b}"_R2_001.fastq.gz" > clean/${b}.fastq;
echo ${b};
run_assembly_trimClean.pl -o clean/${b}.cleaned.fastq.gz -i clean/${b}.fastq --nosingletons;
remove_file clean/${b}.fastq;
fi
done
if [ -s "SRR" ]; then
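# SRA downloads are named <accession>_1/_2 rather than *_R1_001/_R2_001, so clean those with a second pass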
for j in *_1.fastq.gz; do
c=`basename ${j} _1.fastq.gz`;
if [[ -n "$(find -path ./clean/${c}.cleaned.fastq.gz)" ]]; then
continue
else
run_assembly_shuffleReads.pl ${c}"_1.fastq.gz" ${c}"_2.fastq.gz" > clean/${c}.fastq;
echo ${c};
run_assembly_trimClean.pl -i clean/${c}.fastq -o clean/${c}.cleaned.fastq.gz --nosingletons;
remove_file clean/${c}.fastq;
fi
done
else
echo "There are no SRR numbers in this run"
fi
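# Remove any stray files in ./clean whose names begin with a literal '*' (these can be left behind when a read-pair glob above fails to match)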
rm ./clean/\** 2>/dev/null
##### Kraken is called here as a contamination check #####
#Mini kraken is used here, since it takes up less space. If results are inconclusive, can run kraken with the larger database
#praise cthulhu.
make_directory kraken_output
echo 'SETTING KRAKEN DATABASE PATH'
KRAKEN_DB=$(find /home/$USER/ -mount -path "*/kraken/minikraken_CURRENT")
echo 'KRAKEN DATABASE PATH SET TO:' $KRAKEN_DB
remove_file kraken_output/top_kraken_species_results
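# For each isolate: classify the cleaned reads, build a species-level report, and append the top 10 species hits to kraken_output/top_kraken_species_results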
for i in ${id[@]}; do
if [[ -n "$(find . -path ./kraken_output/${i}/kraken_species.results 2>/dev/null)" ]]; then
echo "Isolate "${i}" has been krakked."
else
echo "RELEASE THE (mini)KRAKEN on isolate ${i}!!!"
make_directory ./kraken_output/${i}/;
kraken --preload --db $KRAKEN_DB --gzip-compressed --fastq-input ./clean/*${i}*.cleaned.fastq.gz > ./kraken_output/${i}/kraken.output;
kraken-report --db $KRAKEN_DB --show-zeros ./kraken_output/${i}/kraken.output > ./kraken_output/${i}/kraken.results;
awk '$4 == "S" {print $0}' ./kraken_output/${i}/kraken.results | sort -s -r -n -k 1,1 > ./kraken_output/${i}/kraken_species.results;
echo ${i} >> ./kraken_output/top_kraken_species_results;
head -10 ./kraken_output/$i/kraken_species.results >> ./kraken_output/top_kraken_species_results;
fi
done
##### Run Quality and Coverage Metrics #####
## check to see if the run quality and coverage metrics have already been completed or not
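# run_assembly_readMetrics.pl reports coverage against the expected genome size passed with -e (SEQUENCE_LEN)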
if [[ -n "$(find -path ./clean/readMetrics.tsv 2>/dev/null)" ]]; then
echo 'Run quality and coverage metrics have been generated'
else
echo 'Running run_assembly_readMetrics.pl and generating readMetrics.tsv'
run_assembly_readMetrics.pl ./clean/*.fastq.gz --fast --numcpus 12 -e "$SEQUENCE_LEN" | sort -k3,3n > ./clean/readMetrics.tsv
fi
##### Run SPAdes de novo genome assembler on all cleaned, trimmed, fastq files #####
make_directory ./spades_assembly_trim
for i in ${id[@]}; do
if [[ -n "$(find -path ./spades_assembly_trim/$i/contigs.fasta 2>/dev/null)" ]]; then #This will print out the size of the spades assembly if it already exists
size=$(du -hs ./spades_assembly_trim/$i/contigs.fasta | awk '{print $1}');
echo 'File exists and is '$size' big.'
else
echo 'constructing assemblies for '$i', could take some time...'
spades.py --pe1-12 ./clean/*${i}*.cleaned.fastq.gz -o spades_assembly_trim/${i}/ --careful
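# Drop SPAdes intermediate output (error-corrected reads, per-k working dirs, tmp dirs) to save disk space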
rm -rf ./spades_assembly_trim/$i/corrected \
./spades_assembly_trim/$i/K21 \
./spades_assembly_trim/$i/K33 \
./spades_assembly_trim/$i/K55 \
./spades_assembly_trim/$i/K77 \
./spades_assembly_trim/$i/K99 \
./spades_assembly_trim/$i/K127 \
./spades_assembly_trim/$i/misc \
./spades_assembly_trim/$i/mismatch_corrector \
./spades_assembly_trim/$i/split_input \
./spades_assembly_trim/$i/tmp
fi
done
##### Run quast assembly statistics for verification that the assemblies worked #####
make_directory quast
for i in ${id[@]}; do
quast.py spades_assembly_trim/$i/contigs.fasta -o quast/$i
done
##### QUAST quality check ######
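# Pull the headline metrics (contig counts, total length, N50) out of each quast report into a small per-isolate summary file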
for i in ${id[@]}; do
remove_file quast/${i}_output_file
tail -n +3 ./quast/$i/report.txt | grep "contigs (>= 0 bp)" >> quast/${i}_output_file
tail -n +3 ./quast/$i/report.txt | grep "Total length (>= 0 bp)" >> quast/${i}_output_file
tail -n +10 ./quast/$i/report.txt | grep "contigs" >> quast/${i}_output_file
tail -n +10 ./quast/$i/report.txt | grep "N50" >> quast/${i}_output_file
#May need to add in some files which explain the limits for different organisms, eg acceptable lengths of genome for e coli, acceptable N50 values, etc.....
done
make_directory mash
echo "Mashing files now! MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH MASH "
for i in ${id[@]}; do
if [[ -n "$(find -path ./mash/${i}_distance.tab 2>/dev/null)" ]]; then
echo "Skipping "$i". It has already been monster MASHed."
else
echo ${i}
mash sketch ./clean/*${i}*.cleaned.fastq.gz
mv ./clean/*${i}*.cleaned.fastq.gz.msh ./mash/
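# NOTE: the RefSeq sketch path below is hard-coded; adjust it if your mash database lives elsewhere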
mash dist /home/staphb/databases/mash/refseq_CURRENT/*.msh ./mash/*${i}*.fastq.gz.msh > mash/${i}_distance.tab
sort -gk3 mash/${i}_distance.tab -o mash/${i}_distance.tab
echo $i >> ./mash/top_mash_results;
head -10 mash/${i}_distance.tab >> ./mash/top_mash_results;
fi
done
##### Create list of samples and how they were identified by MASH (E. coli or Salmonella) ##### #PROBLEM: The classifications aren't always the species name, they are sometimes just identifiers...
#Thanks to Kevin Libuit of DGS Virginia for the framework of the following code for MASH, serotypeFinder and SeqSero.
make_directory ./mash/sample_id
echo "sep=;" > ./mash/sample_id/raw.csv && echo "Sample; MASH-ID; P-value" >> ./mash/sample_id/raw.csv
for i in ${id[@]}; do
echo "${i}; $(head -1 ./mash/${i}_distance.tab | sed 's/.*-\.-//' | grep -o '^\S*' | sed -e 's/\(.fna\)*$//g'); $(head -1 ./mash/${i}_distance.tab | awk '{ print $3 }')"
echo "${i}; $(head -1 ./mash/${i}_distance.tab | sed 's/.*-\.-//' | grep -o '^\S*' | sed -e 's/\(.fna\)*$//g'); $(head -1 ./mash/${i}_distance.tab | awk '{ print $3 }')" >> ./mash/sample_id/raw.csv
done
# database of H and O type genes
database="$(find /home/$USER/ -maxdepth 3 -mount -path "*/serotypefinder/database")"
# serotypeFinder requires legacy blast
blast="/opt/blast-2.2.26/"
##### SerotypeFinder, if mash pointed to E. coli, this will tell us the serotype details #####
ecoli_isolates="$(awk -F ';' '$2 ~ /Escherichia_coli/' ./mash/sample_id/raw.csv | awk -F ';' '{print$1}')"
make_directory serotypeFinder_output
echo "E. coli "$ecoli_isolates
remove_file ./serotypeFinder_output/serotype_results
for i in $ecoli_isolates; do
# Check if serotypeFinder output exists for each E.coli isolate; skip if so
if ls ./serotypeFinder_output/${i}/results_table.txt 1> /dev/null 2>&1; then
echo "Skipping ${i}. ${i} has already been serotyped with serotypeFinder."
else
# Run serotypeFinder for all Ecoli isolates and output to serotypeFinder_output/<sample_name>
serotypefinder.pl -d $database -i ./spades_assembly_trim/${i}/contigs.fasta -b $blast -o ./serotypeFinder_output/${i} -s ecoli -k 95.00 -l 0.60
fi
# Extract the predicted O and H antigen types from fixed rows of serotypeFinder's results_table.txt
o_type="$(awk -F $'\t' 'FNR == 7 {print $6}' ./serotypeFinder_output/${i}/results_table.txt)"
h_type="$(awk -F $'\t' 'FNR == 3 {print $6}' ./serotypeFinder_output/${i}/results_table.txt)"
ecoli_serotype="${o_type}:${h_type}"
# Append the combined O:H serotype to serotypeFinder_output/serotype_results
echo ${i}';' $ecoli_serotype >> ./serotypeFinder_output/serotype_results
done
##### SeqSero, and SISTR, if mash pointed to a salmonella species, this will tell us the serotype details #####
sal_isolates="$(awk -F ';' '$2 ~ /Salmonella/' ./mash/sample_id/raw.csv | awk -F ';' '{print$1}')"
make_directory SeqSero_output
make_directory sistr
echo "Salmonella "$sal_isolates
oddity="See comments below*"
remove_file ./SeqSero_output/all_serotype_results
sistr_header="header"
remove_file ./sistr/sistr_summary_temp
remove_file ./sistr/sistr_summary
for i in $sal_isolates; do
#for i in ${id[@]}; do #In case of emergency (classification acting up), uncomment this line to serotype every isolate using SeqSero, comment out line above
# Check if SeqSero output exists for each Salmonella spp. isolate; skip if so
if [[ -n "$(find -path ./SeqSero_output/${i}/Seqsero_result.txt 2>/dev/null)" ]]; then
echo "Skipping ${i}. ${i} has already been serotyped with SeqSero."
else
# Run SeqSero for all Salmonella spp. isolates and output to SeqSero_output/<sample_name>
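# Reads may be named <id>_1/_2 (SRA style) or <id>_R1/_R2 (Illumina style); try both patterns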
if ls ./*${i}*_[12]*fastq.gz 1> /dev/null 2>&1; then
SeqSero.py -m2 -i ./*${i}*_[12]*fastq.gz
make_directory ./SeqSero_output/$i
mv ./SeqSero_result*/*.txt ./SeqSero_output/$i
remove_file ./SeqSero_result*
elif ls ./*${i}*R[12]*fastq.gz 1> /dev/null 2>&1; then
SeqSero.py -m2 -i ./*${i}*R[12]*fastq.gz
make_directory ./SeqSero_output/$i
mv ./SeqSero_result*/*.txt ./SeqSero_output/$i
remove_file ./SeqSero_result*
fi
fi
echo ${i} >> ./SeqSero_output/all_serotype_results
head -10 ./SeqSero_output/${i}/Seqsero_result.txt >> ./SeqSero_output/all_serotype_results
echo >> ./SeqSero_output/all_serotype_results
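# SISTR provides a second, assembly-based serotype prediction from the SPAdes contigs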
if [[ -n "$(find -path ./sistr/${i}_sistr-results.tab)" ]]; then
echo "${i} has a SISTR (file)."
else
sistr -i ./spades_assembly_trim/$i/contigs.fasta ${i} -f tab -o sistr/${i}_sistr-results
fi
sistr_header="$(head -1 ./sistr/${i}_sistr-results.tab)"
tail -n +2 ./sistr/${i}_sistr-results.tab >> ./sistr/sistr_summary_temp
done
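# Collapse the per-isolate SISTR tables into a single summary with one header row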
echo $sistr_header
echo $sistr_header >> ./sistr/sistr_summary
grep -v "fasta_filepath" ./sistr/sistr_summary_temp >> ./sistr/sistr_summary
rm ./sistr/sistr_summary_temp
##### This section provides data on the virulence and antibiotic resistance profiles for each isolate, from the databases that make up abricate #####
echo 'Setting abricate db PATH'
abricate_db_path=$(find /home/$USER/ -mount -path "*/abricate/db")
echo 'ABRICATE DB PATH SET'
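# Build the list of abricate database names from the subdirectories of the db path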
declare -a databases=()
for i in $abricate_db_path/*; do
b=$(basename "$i")
databases+=("$b")
done
echo ${databases[@]}
make_directory abricate
make_directory abricate/summary
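# Screen every assembly against each abricate database, then collapse the per-isolate hits into one summary table per database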
for y in ${databases[@]}; do
if [[ -n "$(find -path ./abricate/summary/${y}_summary)" ]]; then
echo "Abricate kadabricate! ${y} has been run"
continue
else
for i in ${id[@]}; do
abricate -db ${y} spades_assembly_trim/${i}/contigs.fasta > abricate/${i}_${y}.tab
done
fi
abricate --summary ./abricate/*_${y}.tab > ./abricate/summary/${y}_summary
done
echo 'FINISHED RUNNING ABRICATE'
##### Create a file with all the relevant run info #####
remove_file isolate_info_file.tsv
qc_metric_head=$(head -1 ./clean/readMetrics.tsv)
echo -e "$qc_metric_head\tcontigs\tlargest_contig\ttotal_length\tN50\tL50" >> isolate_info_file.tsv
for i in ${id[@]}; do
qc_metric=$(grep "${i}" ./clean/readMetrics.tsv)
contigs=$(tail -9 ./quast/${i}/report.txt |grep "contigs" |tr -s ' '|cut -d' ' -f3)
largest_contig=$(tail -9 ./quast/${i}/report.txt |grep "Largest contig" |tr -s ' '|cut -d' ' -f3)
total_length=$(tail -9 ./quast/${i}/report.txt |grep "Total length" |tr -s ' '|cut -d' ' -f3)
N50=$(tail -9 ./quast/${i}/report.txt |grep "N50" |tr -s ' '|cut -d' ' -f2)
L50=$(tail -9 ./quast/${i}/report.txt |grep "L50" |tr -s ' '|cut -d' ' -f2)
if [ -z "$contigs" ]; then
contigs="N/A"
fi
if [ -z "$largest_contig" ]; then
largest_contig="N/A"
fi
if [ -z "$total_length" ]; then
total_length="N/A"
fi
if [ -z "$N50" ]; then
N50="N/A"
fi
if [ -z "$L50" ]; then
L50="N/A"
fi
echo -e "$qc_metric\t$contigs\t$largest_contig\t$total_length\t$N50\t$L50" >> isolate_info_file.tsv
echo -e "$qc_metric\t$contigs\t$largest_contig\t$total_length\t$N50\t$L50"
done
#rename the summary file; SEQ_NUM and species are expected to be set in the calling environment, so only rename when both are present
if [ -n "$SEQ_NUM" ] && [ -n "$species" ]; then
mv isolate_info_file.tsv isolate_info_${SEQ_NUM}_${species}.tsv
fi
#### Remove the tmp1 file that lingers #####
remove_file tmp1
##### Move all of the fastq.gz files into a folder #####
make_directory fastq_files
for i in ${id[@]}; do
if [[ -n "$(find *$i*fastq.gz)" ]]; then
mv *$i*fastq.gz fastq_files
fi
done
echo "********************************"
echo "type_pipe pipeline has finished."
echo "********************************"