-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmix2.py
executable file
·108 lines (95 loc) · 4.4 KB
/
mix2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re, regex
dico_comp = {'A':'T','C':'G',"G":"C","T":"A","M":"K","R":"Y","W":"W","S":"S","Y":"R","K":"M","V":"B","H":"D","D":"H","B":"V","X":"X","N":"X",".":".","|":"|"}
def inverComp (seq) : #return the inversed complementary sequence
seq = seq.upper() #acgt -> ACGT
seq_comp = ""
for nuc in seq : #nuc stand for nucleotide
seq_comp += dico_comp[nuc] #use of the dictionnary to have the complementary nucleotide
seq_comp = "".join(reversed(seq_comp)) #we reverse the sequence
return (seq_comp)
def positionsOfMatches (result,seq) : #get the matches positions in the fasta sequence
pos = []
for res in result :
pos.append([seq.find(res),res])
return (pos)
def search_matches(nbmismatch,primer, seq) : #return all match(es) in the fasta sequence
return (regex.findall("("+primer+"){e<="+str(nbmismatch)+"}",seq,overlapped=True))
def mix (nb,primer,seq) : #function to find match(s) with a mismatch
listFind =[]
if nb==1 :
invP=inverComp(primer)
#use of findtiter first for one mismatch (faster)
for i in range(len(primer)): #for each nucleotide of the primer
reg1=primer[:i]
reg2=primer[i+1:]
reg = str(reg1) + "["+"".join(dico_comp.keys()).replace("|","").replace(".","")+"]" + str(reg2)
searchseq = re.compile(reg) #search request for finditer (Primer with a mismatch)
tmpfinditer = searchseq.finditer(seq) #tmpfinder : objects list with start() end() and group(0) functions
tmp=[]
for m in tmpfinditer : #for each result
tmp.append([m.start(),"norm"]) #add the result in the tmp list
if tmp != [] : #if results, we add the first result in listFind
listFind.append(tmp[0])
else : #same search with the complementary reversed primer
reg1=invP[:i]
reg2=invP[i+1:]
reg = str(reg1) + "["+"".join(dico_comp.keys()).replace("|","").replace(".","")+"]" + str(reg2)
searchseq = re.compile(reg)
tmpfinditer = searchseq.finditer(seq)
for m in tmpfinditer :
tmp.append([m.start(),"inv"])
if tmp != [] :
listFind.append(tmp[0])
#if no match with one mismatch, use of regex.findall for two mismatch
if listFind == [] :
tmpfind_forward = search_matches(2,primer,seq) #get all matches (two mismatches)
positions_f = positionsOfMatches(tmpfind_forward,seq) #get positions of matches
if tmpfind_forward == [] :
tmpfind_reverse = search_matches(2,invP,seq)
positions_r = positionsOfMatches(tmpfind_reverse,seq)
if tmpfind_reverse != [] :
for res in positions_r :
listFind.append([res[0],"inv"])
else :
for res in positions_f :
listFind.append([res[0],"norm"])
elif nb==2 : #search only in the forward sense
for i in range(len(primer)):
reg1=primer[:i]
reg2=primer[i+1:]
reg = str(reg1) + "["+"".join(dico_comp.keys()).replace("|","").replace(".","")+"]" + str(reg2)
searchseq = re.compile(reg)
tmpfinditer = searchseq.finditer(seq)
if tmpfinditer !=[] :
tmp=[]
for m in tmpfinditer :
tmp.append([m.start(),"norm"])
if tmp != [] :
listFind.append(tmp[0])
#if no match with one mismatch
if listFind == [] :
tmpfind_forward = search_matches(2,primer,seq) #get all matches
positions_f = positionsOfMatches(tmpfind_forward,seq) #get positions of matches
if tmpfind_forward != [] :
for res in positions_f :
listFind.append([res[0],"norm"])
return(listFind)
#Brucella_suis_NZ_CP009096.1_NZ_CP009097.1.fa
#Brucella_vulpis_NZ_LN997863.1_NZ_LN997864.1.fa
#Brucella_ovis_ATCC_25840_NC_009505.1_NC_009504.1.fa
#Bruce19-324_3bp_163bp_36u GACGACCCGGACCATGTC ACTTCACCGTAACGTCGTGGA
#Bruce11-211_63bp_257bp_2u CTGTTGATCTGACCTTGCAACC CCAGACAACAACCTACGTCCTG
#Bruce42-424_125bp_539bp_4u CATCGCCTCAACTATACCGTCA ACCGCAAAATTTACGCATCG
#Bruce22-322_8bp_158bp_6u GATGAAGACGGCTATCGACTG TAGGGGAGTATGTTTTGGTTGC
with open("/home/david/Documents/complete_genomes/brucellaceae/Brucella_ovis_ATCC_25840_NC_009505.1_NC_009504.1.fa","r") as myfile : #load of the fasta file
raw_fasta=myfile.read()
myfile.seek(1)
tmp = myfile.read().split(">")
chr1 = tmp[0].replace("\n","")
chr2 = tmp[1].replace("\n","")
fasta = chr1+chr2
print mix(1,"ACTTCACCGTAACGTCGTGGA",chr2)
#print mix2(1,"CCAGACAACAACCTACGTCCTG",fasta)
#print search_matches(2,inverComp("ACCGCAAAATTTACGCATCG"),fasta)