forked from kircherlab/CADD-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CADD.sh
executable file
·208 lines (177 loc) · 5.44 KB
/
CADD.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/bin/bash
usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile> -- CADD version 1.5
where:
-h show this help text
-o out tsv.gz file (generated from input file name if not set)
-g genome build (supported are GRCh37 and GRCh38 [default: GRCh38])
-v CADD version (either v1.4 or v1.5 [default: v1.5])
-a include annotation in output
input vcf of vcf.gz file (required)"
unset OPTARG
unset OPTIND
GENOMEBUILD="GRCh38"
ANNOTATION=false
OUTFILE=""
VERSION="v1.5"
while getopts ':ho:g:v:a' option; do
case "$option" in
h) echo "$usage"
exit
;;
o) OUTFILE=$OPTARG
;;
g) GENOMEBUILD=$OPTARG
;;
v) VERSION=$OPTARG
;;
a) ANNOTATION=true
;;
\?) printf "illegal option: -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
;;
esac
done
shift $((OPTIND-1))
INFILE=$1
echo "CADD-v1.5 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved."
set -ueo pipefail
### Configuring all the paths
FILENAME=$(basename $INFILE)
NAME=${FILENAME%\.vcf*}
FILEDIR=$(dirname $INFILE)
FILEFORMAT=${FILENAME#$NAME\.}
SCRIPT=$(readlink -f "$0")
export CADD=$(dirname "$SCRIPT")
if [ "$FILEFORMAT" != "vcf" ] && [ "$FILEFORMAT" != "vcf.gz" ]
then
echo "Unknown file format $FILEFORMAT. Make sure you provide a *.vcf or *.vcf.gz file."
exit 1
fi
if [ "$OUTFILE" == "" ]
then
OUTFILE=$FILEDIR/$NAME.tsv.gz
fi
if [ "$GENOMEBUILD" != "GRCh38" ] && [ "$GENOMEBUILD" != "GRCh37" ]
then
echo "Unknown/Unsupported genome build $GENOMEBUILD. CADD currently only supports GRCh37 and GRCh38."
exit 1
fi
if [ "$VERSION" != "v1.4" ] && [ "$VERSION" != "v1.5" ]
then
echo "Unknown/Unsupported CADD version $VERSION. This script currently only supports v1.4 and v1.5."
exit 1
fi
if [ "$VERSION" == "v1.5" ] && [ "$GENOMEBUILD" == "GRCh37" ]
then
echo "Please note that CADD scores for GRCh37 version v1.5 are the same as in v1.4."
VERSION="v1.4"
fi
if [ "$ANNOTATION" = 'true' ]
then
ANNO_FOLDER="incl_anno"
else
ANNO_FOLDER="no_anno"
fi
# Pipeline configuration
PRESCORED_FOLDER=$CADD/data/prescored/${GENOMEBUILD}_${VERSION}/$ANNO_FOLDER
REFERENCE_CONFIG=$CADD/config/references_${GENOMEBUILD}_${VERSION}.cfg
IMPUTE_CONFIG=$CADD/config/impute_${GENOMEBUILD}_${VERSION}.cfg
MODEL=$CADD/data/models/$GENOMEBUILD/CADD${VERSION}-$GENOMEBUILD.mod
CONVERSION_TABLE=$CADD/data/models/$GENOMEBUILD/conversionTable_CADD${VERSION}-$GENOMEBUILD.txt
# determine VEP database version
DBVERSION=92
if [ "$GENOMEBUILD" == "GRCh38" ] && [ "$VERSION" == "v1.5" ]
then
DBVERSION=95
fi
# Temp files
TMP_FOLDER=$CADD/data/tmp
TMP_PRE=$TMP_FOLDER/$NAME.pre.tsv.gz
TMP_VCF=$TMP_FOLDER/$NAME.vcf
TMP_ANNO=$TMP_FOLDER/$NAME.anno.tsv.gz
TMP_IMP=$TMP_FOLDER/$NAME.csv.gz
TMP_NOV=$TMP_FOLDER/$NAME.nov.tsv.gz
mkdir -p $TMP_FOLDER
### Pipeline
# Loading the environment
if [ "$VERSION" == "v1.4" ]
then
source activate cadd-env
else
source activate cadd-env-v1.5
fi
# File preparation
if [ "$FILEFORMAT" == "vcf" ]
then
cat $INFILE \
| python $CADD/src/scripts/VCF2vepVCF.py \
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
| uniq > $TMP_VCF
else
zcat $INFILE \
| python $CADD/src/scripts/VCF2vepVCF.py \
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
| uniq > $TMP_VCF
fi
# Prescoring
echo '## Prescored variant file' | gzip -c > $TMP_PRE;
if [ -d $PRESCORED_FOLDER ]
then
for PRESCORED in $(ls $PRESCORED_FOLDER/*.tsv.gz)
do
cat $TMP_VCF \
| python $CADD/src/scripts/extract_scored.py --header \
-p $PRESCORED --found_out=$TMP_PRE.tmp \
> $TMP_VCF.tmp;
gzip -c $TMP_PRE.tmp >> $TMP_PRE
mv $TMP_VCF.tmp $TMP_VCF;
done;
rm $TMP_PRE.tmp
fi
# Variant annotation
cat $TMP_VCF \
| vep --quiet --cache --buffer 1000 --no_stats --offline --vcf \
--dir $CADD/data/annotations/${GENOMEBUILD}_${VERSION}/vep \
--species homo_sapiens --db_version=$DBVERSION \
--assembly $GENOMEBUILD --regulatory --sift b \
--polyphen b --per_gene --ccds --domains --numbers --canonical \
--total_length --force_overwrite --format vcf --output_file STDOUT \
--warning_file STDERR \
| python $CADD/src/scripts/annotateVEPvcf.py -c $REFERENCE_CONFIG \
| gzip -c > $TMP_ANNO
rm $TMP_VCF
# Imputation
zcat $TMP_ANNO \
| python $CADD/src/scripts/trackTransformation.py -b \
-c $IMPUTE_CONFIG -o $TMP_IMP --noheader;
# Score prediction
python $CADD/src/scripts/predictSKmodel.py \
-i $TMP_IMP -m $MODEL -a $TMP_ANNO \
| python $CADD/src/scripts/max_line_hierarchy.py --all \
| python $CADD/src/scripts/appendPHREDscore.py -t $CONVERSION_TABLE \
| gzip -c > $TMP_NOV;
rm $TMP_ANNO
rm $TMP_IMP
if [ "$ANNOTATION" = 'false' ]
then
if [ "$GENOMEBUILD" == "GRCh38" ]
then
COLUMNS="1-4,124,125"
else
COLUMNS="1-4,106,107"
fi
zcat $TMP_NOV | cut -f $COLUMNS | uniq | gzip -c > $TMP_NOV.tmp
mv $TMP_NOV.tmp $TMP_NOV
fi
# Join pre and novel scored variants
{
echo "##CADD $GENOMEBUILD-$VERSION (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.";
head -n 1 < <(zcat $TMP_NOV);
zcat $TMP_PRE $TMP_NOV | grep -v "^#" | sort -k1,1 -k2,2n -k3,3 -k4,4 || true;
} | bgzip -c > $OUTFILE;
rm $TMP_NOV
rm $TMP_PRE
OUTFILE=$(echo $OUTFILE | sed 's/^\.\///')
echo -e "\nCADD scored variants written to file: $OUTFILE"
exit 0