-
Notifications
You must be signed in to change notification settings - Fork 0
/
smallseq.sh
64 lines (44 loc) · 2.99 KB
/
smallseq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -----Clean & Concatenate FASTQ files-------------
# Quality check
fastqc *.fq
multiqc .
# Identify adapters | bbmerge.sh is part of the BBtools package | dnapi.py is part of DNApi
bbmerge.sh in=Reads_1.fq in2=Reads_2.fq mininsert=17 outa=adapterst.fa
dnapi.py --show-all Reads.fq
# Trim the adapters | Paired-end | Single-end
cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG -A GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT -o Reads_1.trim.fq -p Reads_2.trim.fq Reads_1.fq Reads_2.fq
cutadapt -a TGGAATTCTCGGGTGCCAAGG -o Read_1.trim.fq Read_1.fq
umi_tools extract --stdin=Read_1.fq --log=output.log --stdout=Read_1.trim.fq --extract-method=regex --bc-pattern='.+(?P<discard_1>AACTGTAGGCACCATCAAT){s<=2}(?P<umi_1>.{12})(?P<discard_2>.+)'
# Paired-end | Generate reverse complement of Reads_2.trim.fq and concatenate
seqkit seq Reads_2.trim.fq -r -p -o Reads_2.trimrc.fq
cat Reads_1.trim.fq Reads_2.trimrc.fq > merged.fq
# Create config txt file as follows | list.txt
#path to file three letter code
Merged1.fastq S01
Merged2.fastq S02
Merged3.fastq S03
Merged4.fastq S04
# ----Extracting human mirBase mature and precursor sequences-----------
# Download
wget ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz
wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz
# Downloaded files have white spaces that need to be removed (for miRDeep2)
zcat hairpin.fa.gz | cut -f1 -d" ">hairpin_noSpace.fa
zcat mature.fa.gz | cut -f1 -d" " >mature_noSpace.fa
# Remove non-canonical nucleotides and convert to SL | fastaparse.pl is part of the miRDeep 2 package | fasta_formatter is part of FASTX-Toolkit
fastaparse.pl hairpin_noSpace.fa -b | fasta_formatter -w 0 >hairpin_cleaned.fa
fastaparse.pl mature_noSpace.fa -b | fasta_formatter -w 0 >mature_cleaned.fa
# grep out the human miRNAscd ..
grep -A1 "hsa" mature_cleaned.fa | grep -v -- "^--$" >hsa_matureMirnas.fa
grep -A1 "hsa" hairpin_cleaned.fa | grep -v -- "^--$" >hsa_hairpinMirnas.fa
# ----Indexing human genome-----------
# Remove whitespace and indexing | remove_white_space_in_id.pl is part of the miRDeep 2 package | bowtie-build is part of the bowtie package
remove_white_space_in_id.pl genome.fa > genome_rs.fa
bowtie-build genome_rs.fa /bowtieindex/removedspaces
# ----miRDeep2-------------------
#Begin miRDeep process with mapper.pl (with config file)
mapper.pl list.txt -d -e -m -l 18 -v -q -h -j -p /bowtieindex/removedspaces -s reads_collapsed.fa -t reads_collapsed_vs_genome.arf
# Output | FASTA file with processed reads | ARF file with with mapped reads
# Identify conserved miRNAs and predict novel miRNAs with miRDeep2
miRDeep2.pl /genome/genome_rs.fa reads_collapsed_vs_genome.arf hsa_matureMirnas.fa none hsa_hairpinMirnas.fa -t hsa -r results -P -v -c 2>report.log
# Output | A spreadsheet and an HTML file with an overview of all detected miRNAs in the deep sequencing input data.