forked from joybio/multiPrime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultiPrime.yaml
164 lines (143 loc) · 8.78 KB
/
multiPrime.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
---
################################################################################
# directories
################################################################################
# Directory in which the input virus fasta file(s) are located.
# Use an absolute path.
input_dir: /share/data3/yangjunbo/git_storage/multiPrime/test_data
# Directory in which the pipeline writes all output files
# (this also includes the summary files).
# NOTE(review): this was described as "relative to current directory", but an
# absolute path is configured here - confirm which forms the pipeline accepts.
results_dir: /share/data3/yangjunbo/git_storage/multiPrime/test_data/results
# Directory to which the log files created during processing are written.
log_dir: /share/data3/yangjunbo/git_storage/multiPrime/test_data/logs
# Directory of the scripts.
scripts_dir: /share/data3/yangjunbo/git_storage/multiPrime/scripts
################################################################################
# sample(s) information
################################################################################
# Name(s) of the input file(s), one list entry per sample.
# Please type here: sample1
# NOTE(review): this comment originally said "fastq", while the directories
# section describes the inputs as fasta files - confirm the expected format
# and whether the file extension must be omitted.
# This name is used throughout the entire pipeline as the name for the output samples.
virus:
- CDS_20727
################################################################################
# preprocessing
################################################################################
#Model selection: [fast] or [normal].
#The degeneracy of the fast model would be higher than normal. However, the advantage of the
#fast model is that it has a shorter running time compared to the normal model.
#Model: normal
#-------------------------------------------------------------------------------
# cluster by cd-hit
#-------------------------------------------------------------------------------
# identity: This parameter is linked to the "-c" option in CD-HIT, with a range of 0.65 to 1.
# We recommend setting it to 0.7 or 0.8.
# If there is considerable similarity among the sequences in your input file,
# you may consider increasing this parameter to 0.9.
# However, note that setting it to 1 will generate primers for each individual sequence.
identity: 0.7
# seq_number_ANI: The minimum sequence number in a cluster.
# multiPrime aims to merge clusters with a size smaller than {seq_number_ANI} into larger clusters.
# This parameter helps in reducing the total number of clusters.
# If seq_number_ANI is set to 0, all clusters will be processed.
# Conversely, if seq_number_ANI is set to 1, no clusters will undergo processing.
# NOTE(review): under the "smaller than" rule above, 0 looks like a special value
# rather than a size threshold - confirm the 0/1 semantics against the merging script.
seq_number_ANI: 1
# ani: This parameter corresponds to the whole-genome Average Nucleotide Identity (ANI).
# It defines the threshold for merging clusters:
# only clusters with an ANI value greater than {ani} will be merged.
# The minimum value for the ANI threshold is 0.7.
ani: 0.8
# drop: This parameter determines whether to merge or drop clusters with rare sequences
# that exhibit high ANI with other clusters. If set to "T", the clusters will be dropped.
# If set to "F", the clusters will be merged into others.
drop: "F"
# max_seq: This parameter specifies the maximum number of sequences to be used
# from each cluster for multi-alignment using tools such as MUSCLE or MAFFT.
# In this step, {max_seq} sequences are randomly selected from each cluster.
# It is important to note that this selection does not impact the
# calculation of primer coverage in the final primer set.
# The coverage calculation considers all sequences within the cluster to ensure accurate results.
max_seq: 500
#-------------------------------------------------------------------------------
## primer design by multiPrime
##-------------------------------------------------------------------------------
# dege_number: This parameter, represented as "-n {}" in multiPrime-core,
# specifies the maximum number of degenerate nucleotides allowed in a primer.
dege_number: 4
# degeneracy: This parameter, represented as "-d {}" in multiPrime-core,
# denotes the maximum degeneracy allowed in a primer.
degeneracy: 10
# primer_len: This parameter, represented as "-l {}" in multiPrime-core,
# determines the length of the degenerate primers.
primer_len: 18
# variation: This parameter, represented as "-v {}" in multiPrime-core,
# indicates the maximum number of mismatches allowed during the calculation of mis-coverage.
variation: 1
# nproc: This parameter, represented as "-p {}" in multiPrime-core,
# defines the number of processes to be launched.
nproc: 1
# entropy: This parameter, represented as "-e {}" in multiPrime-core,
# measures the level of disorder or variability.
# It is used to evaluate whether a window is conserved.
# Any primer-length window with an entropy less than
# the specified value of {entropy} will be processed. The default value is 3.6.
entropy: 3.6
# coordinate: This parameter, represented as "-c {}" in multiPrime-core,
# ensures that mismatch positions are not allowed at specific sites
# within a primer during the calculation of mis-coverage.
# Given as a comma-separated list; quoted so it is always read as a single string.
coordinate: "2,3,-1"
#-------------------------------------------------------------------------------
## get candidate primers from multiPrime output
##-------------------------------------------------------------------------------
# coverage: This parameter filters primers based on their match rate,
# which is calculated as the number of sequences that match the selected primer
# divided by the number of sequences that span the selected primer.
# Only primers with a match rate greater than the specified fraction (default: 0.6) will be retained.
# However, if the sequence number in some clusters is less than 10,
# the default coverage threshold may not be suitable.
# In such cases, you can reset the threshold.
# Additionally, if the sequence number in some clusters is very large (greater than 100,000),
# only a random subset of 500 sequences will be used to generate the output.
coverage: 0.7
# PRODUCT_size: This parameter filters primers based on the desired PCR product size.
# Primers with a product size outside the specified range (default: [250, 700]) will be filtered out.
# Given as "min,max"; quoted so the pair is always read as a single string
# and never as a number with a digit-group separator.
PRODUCT_size: "150,1200"
# gc_content: This parameter filters primers based on their GC content.
# Primers with a GC content outside the specified range (default: [0.45, 0.65]) will be filtered out.
# Given as "min,max"; quoted so the pair is always read as a single string.
gc_content: "0.2,0.7"
# distance: This parameter is a filter for hairpin structures and
# represents the minimum distance between the paired bases.
# The default value is 4, and it is used to detect hairpin structures in the primer sequence.
# For example, (number of X) AGCT[XXXX]AGCT.
distance: 4
# end: This parameter filters primers based on the presence of degenerate bases at the end.
# For example, setting it to "-t 4" means that degenerate bases should not appear at the last four positions
# during the primer pre-filtering step. The default value is 4.
end: 4
# adaptor: This parameter represents the adaptor sequence used for NGS (Next-Generation Sequencing) purposes.
# It is used for hairpin or dimer detection between the adaptor and primer.
# Multiple adaptor sequences can be specified, separated by commas.
# If the adaptor sequence is unknown but an adaptor is needed for subsequent sequencing,
# the provided example sequence can be used. If no adaptor is needed, the parameter should be set as ",".
adaptor: "TCTTTCCCTACACGACGCTCTTCCGATCT,TGGAGTTCAGACGTGTGCTCTTCCGATCT"
###-------------------------------------------------------------------------------
#### get core primer set
###-------------------------------------------------------------------------------
# core_number: This parameter specifies the minimum number of sequences required in each cluster.
# NOTE(review): presumably clusters below this size are left out of the core primer set - confirm.
core_number: 10
#-------------------------------------------------------------------------------
## get max primer set from get_multiPrime
##-------------------------------------------------------------------------------
# Parameters of get_Maxprimerset_V4.
# This step does not consider dimer formation at the 5' end of the primers, because it is designed for NGS.
# It won't form dimers between the NGS adaptors and the primers.
# step: The distance between primers, i.e. the column offset from primer1_F to primer2_F
# in rows like the example below. Do not change this parameter.
# GTGTGCTCGTGACCTTGA CCACAATTGCCACGTTAG 159 3 1.0 GTGTGCTCGTGACCTTGA GGTGTCTTGTTGGAAGGG 181 3
# primer1_F Primer1_R Product_len number_match coverage primer2_F primer2_R Product_len number_match
# 1 2 3 4 5 next(1) 2 3 4
step: 5
# method: This parameter determines the method used for primer selection.
# If set to "T" (true), the greedy method will be used for maximal primer selection.
# If set to "F" (false), the maximum method will be used for primer selection.
# Quoted like the "drop" flag so that both T/F flags are written the same way.
method: "T"