-
Notifications
You must be signed in to change notification settings - Fork 4
/
pipeline_non-ref_tree_build_1.3.sh
executable file
·150 lines (136 loc) · 5.58 KB
/
pipeline_non-ref_tree_build_1.3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
#Authors: Logan Fink, Curtis Kapsak
#Usage: script to create a reference free phylogenetic tree from a set of fastq files
#Permission to copy and modify is granted without warranty of any kind
#This function will check if the file exists before trying to remove it
remove_file() {
if [ -e $1 ];then
rm -rf $1
fi
}
#This function will check to make sure the directory doesn't already exist before trying to create it
make_directory() {
if [ -e $1 ]; then
echo "Directory "$1" already exists"
else
mkdir $1
echo "Directory "$1" has been created"
fi
}
##### Move all fastq files from fastq_files directory up one directory, remove fastq_files folder #####
if [[ -n "$(find ./fastq_files)" ]]; then
mv ./fastq_files/* .
rm -rf ./fastq_files
fi
declare -a srr=() #PASTE IN ANY SRR NUMBERS INTO FILE named: SRR
while IFS= read -r line; do
srr+=("$line")
done < ./SRR
#find . -maxdepth 1 -name "*fastq*" |cut -d '-' -f 1|cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '-' and '_')
find . -maxdepth 1 -name "*fastq*" |cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '_')
declare -a tmp=()
tmp1='tmp1'
tmpfile=`cat $tmp1`
for line in $tmpfile; do
tmp=("${tmp[@]}" "$line");
done
id=($(printf "%s\n" "${tmp[@]}"|sort -u)) #Get all unique values and output them to array
id=("${id[@]}" "${srr[@]}") #Combine the automatically generated list with the SRR list
remove_file tmp
##### Fetch and fastq-dump all reads from NCBI identified by "SRR" #####
for i in ${srr[@]}; do
if (find ./*$i*); then
echo "Files are here."
else
echo 'prefetching '$i'...'
prefetch $i
echo 'dumping reads '$i'...'
fastq-dump --gzip --skip-technical --readids --dumpbase --split-files --clip $i
fi
done
##### These are the QC trimming scripts as input to trimClean #####
make_directory clean
echo "cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning"
for i in *R1_001.fastq.gz; do
b=`basename ${i} _R1_001.fastq.gz`;
if [[ -n "$(find . -maxdepth 2 -name ${b}.cleaned.fastq.gz)" ]]; then
continue
else
run_assembly_shuffleReads.pl ${b}"_R1_001.fastq.gz" ${b}"_R2_001.fastq.gz" > clean/${b}.fastq;
echo ${b};
run_assembly_trimClean.pl -i clean/${b}.fastq -o clean/${b}.cleaned.fastq.gz --nosingletons;
remove_file clean/${b}.fastq;
fi
done
for i in *_1.fastq.gz; do
b=`basename ${i} _1.fastq.gz`;
if find ./clean/${b}.cleaned.fastq.gz; then
continue
else
run_assembly_shuffleReads.pl ${b}"_1.fastq.gz" ${b}"_2.fastq.gz" > clean/${b}.fastq;
echo ${b};
run_assembly_trimClean.pl -i clean/${b}.fastq -o clean/${b}.cleaned.fastq.gz --nosingletons;
remove_file clean/${b}.fastq;
fi
done
remove_file ./clean/\**
##### Run SPAdes de novo genome assembler on all cleaned, trimmed, fastq files #####
make_directory ./spades_assembly_trim
for i in ${id[@]}; do
if [[ -n "$(find -path ./spades_assembly_trim/$i/contigs.fasta 2>/dev/null)" ]]; then #This will print out the size of the spades assembly if it already exists
size=$(du -hs ./spades_assembly_trim/$i/contigs.fasta | awk '{print $1}');
echo 'File exists and is '$size' big.'
else
echo 'constructing assemblies for '$i', could take some time...'
echo "spades.py --pe1-12 ./clean/*"${i}"*.cleaned.fastq.gz -o spades_assembly_trim/"${i}"/"
spades.py --pe1-12 ./clean/*${i}*.cleaned.fastq.gz -o spades_assembly_trim/${i}/
fi
done
make_directory new_temp
make_directory new_temp/spades_assemblies
for i in ${id[@]}; do
if [[ -n "$(find -path ./spades_assembly_trim/$i/contigs.fasta 2>/dev/null)" ]]; then
continue
else
cp ./clean/${i}*.cleaned.fastq.gz new_temp/
spades_file=./clean/${i}*.cleaned.fastq.gz
echo $spades_file
echo 'constructing assemblies for '$i', second try...'
spades.py -o new_temp/spades_assemblies/${i}/ --pe1-12 ./new_temp/${i}_*.cleaned.fastq.gz
fi
done
mv new_temp/spades_assemblies/* spades_assembly_trim/
remove_file new_temp
##### Run quast assembly statistics for verification that the assemblies worked #####
make_directory quast
for i in ${id[@]}; do
quast.py spades_assembly_trim/$i/contigs.fasta -o quast/$i
done
##### Run prokka on all the isolates to get the core genomes and generate .gff files #####
make_directory ./prokka
for i in ${id[@]}; do
if [[ -n "$(find -path ./prokka/$i)" ]]; then
echo "Prokka has been run on this isolate."
else
echo "Prokka will now be run on "$i
prokka spades_assembly_trim/$i/contigs.fasta --outdir prokka/$i --prefix $i
fi
done
remove_file -r ./gff_files
make_directory ./gff_files #make directory to hold the .gff files output by prokka
cp prokka/*/*.gff gff_files/ #copy over the .gff files from prokka
##### Run roary using the .gff file folder #####
rm -rf roary
roary -p 8 -e -n -v -f ./roary ./gff_files/*.gff
##### Run raxml on the roary alignment to generate a tree #####
raxmlHPC -m GTRGAMMA -p 12345 -x 12345 -s roary/core_gene_alignment.aln -# 100 -n phylo_output -f a
rm -rf raxml/
make_directory raxml
mv RAxML* raxml/
##### Move all of the fastq.gz files into a folder #####
make_directory fastq_files
for i in ${id[@]}; do
if [[ -n "$(find *$i*fastq.gz)" ]]; then
mv *$i*fastq.gz fastq_files
fi
done