-
Notifications
You must be signed in to change notification settings - Fork 0
/
week4.py
141 lines (116 loc) · 15.3 KB
/
week4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from collections import defaultdict
import json
import os
import sys
import pytest
import math
import random
from lib import *
from util import *
import constants
import cProfile
def test_week4():
print_sep("Randomized Motif Search")
input=(['GAAATGGACCTTTGTAAGCTCCCTTCAAAGTGGTTCAATTTTAAGCCCACGAAACCCCTCCCTTATGTCTAGATAACGTCAACGCCTGGAAACGCAACGCGTTCGCACACAGCCGTGCCCTGGCGCTTGGATCGGCCCCTTATGAGAGATGGAAGAAATGGACCTTTGT','AAGCTCCCTTCAAAGTGGTTCAATTTTAAGCCCACGAAGTCCCATATTTTTGCACCCCTCCCTTATGTCTAGATAACGTCAACGCCTGGAAACGCAACGCGTTCGCACACAGCCGTGCCCTGGCGCTTGGATCGGCCCCTTATGAGAGATGGAAGAAATGGACCTTTGT','ACTGTTGCATGTATTTTTGACAGGGTATTGAGTGGTTAGCAAGGATCCTCGAAAACGCCCACTCTCCAAGGGAATAATCAAGCAATCAGTTGCACATCAGTACAGTCGATAGGTTGTATTCCCTCGTAGAGATTGTGGTAAGTCGCGAGTGATAAGTTATGCTTTCAAC','CAAGGGCGTGGTCTTGTTCAGGTATCACGTATTGACGGATGGAATGTGTACTTACTGCTGCTCGGAAGCTTTCGGCGATAAAGAACGTTGATTCCAGGCGCTCTCGTCGAGTCATGTATCACTGCTGCAAATACGTGGTCCACTATGTCTTTGCAATTCGACGGACTAA','CGAGCTGGATGCCCATAATCGGTCATGTATTCCCGCTGCACGCCGGTGGAGCTTATCCGAACACGGACTCTGCGCCGGGCGTCCCATCAGCATAGAAACTCTCATCATGTAATGACGCGCGAACCTTATCGGTTCGCGTCTAACATGGTGCTGCCTTGCTGTTGTACTG','AAATGATGGTCAGTCACCATGCATGGACCTCATGTATTTTTATAGGCGGACTCAGACTATCCTACGAAAGTGAGGGCCCGGTACGCAGACAGAACGCGCGAGGCGACCGCGTCCCGCGATGGAAAGGGGATGCCGCCGGCCAACTTTCAGTCTGCTGAGTCTACGATTG','ACCCCTGCTCGCGTTTTCCTTTACAGCCGAGTGGTCAGTACGCCTACACCAGTCCCGTTCGGTATCCAGCAAGCCAACCATTTGGAGGGCTGTGGGCAAACGTCATGATATTTTGCCGGACAATTCACCTACGTAAGATAAACAATGGCCGGGTACTCTCCGCTTTCGG','TTCTACAGGAGGTTCACGAAGGGCCGCAGAATTACCTCCAAGCTCATCAAGACGGGGCGAAAACCGTGGACTTTATAATGGAGGAGACATACACCCGTGGCCAACAGAAGCTATACGGGTCCCACTTGATCGCGTTGACTCGGGGGTCATGTTGATTTGCCCCCATCTC','TAAAAATTAGGCTCAAGCATGTAGGACTGAAAGCATTTCCAGCATTCGTTCCTCTCACTCCCTCCACGAGGGGGGATCGCGATCTAGTACCTTCACCACCAAGGGGACTGGTCATGTATTTGCCCGGTCGCTATCCCCTAGGCCCATGCCGGACATTTACAAGCCAACG','TTATTCCACGTGATTTGAGTTCTAACCACCCGCGCCTTCGTATCGCCACGAGGGTTGTATTTTTGCCAGATGATCCCTTAACCGTGGGCAAAATAGAACCCTGAGGATAGACACGGGTTTGGGCGCCTCCCAGAGGTGCTACTCACGCGAGGTTTAACAAGCATCAGTA','TGAAACTTACCAAAGTTCGCCTTGCCTGTCCCGGCCCGCAGCCCTCTGGATCTGGACGGGAGAAAGGCAGTTGCGGGTGACACAAGCTGGAAAAGAACGGTAGTACAAGTTTTAGGGTAGCGGTGCTACGGATCACCCACTAAATATGTATTTTTGCAGGCGACTAGAC','CTAGGGATCGTCCAGTCCAAAAAATGATCGCCGATGGTCAACCATTTTTGCCATCTGATAGAGAGTTTCATCTTATGCAGCTAATCACTACCGCGCTGTTCTTGGGGCCTGAACCGCCTACGTGGGTGGACCTGATTATTCTGCCCGGCTCAACTATCGTTGCTCCAAT','TCCGATGAACGGCAAGGGAGGTGCCAAGACATACGGTTAGACGCTCCCTATGTCATGTATTTTCATATCAGGCGGAGGTCCGTGTCTACCAGCTCGGGTCACCTAGGGGGATAGCGTTGTATAGGATCGCTTGTGTGGTCGCACTGAAGCGGAAAAATTAGGGGCCGTC','ACCGATATACACCCCTGCCTCCTTGTCGTCTTGGTCAGTGAATTGTGCGCCGCGTGTGGCACCTAGTTAACACGAAGCAACTAACTATCGCCGATGTTTGCCCCCAATCGCTTCTAGCTACCCACGTCATGTAAGCTTGCCGTTATGTCCCCGTCAGGCCCGGGTAGTA','AAACATCAAACAGTAAAATATAGTTTCCGTTATAGTCTACCCGGTAGTAGCGTATTTTTGCGATACAGAAGTCCGAGAGCAAGGATTAGTTATGCGTGGGAAAGCACCTGGTGCTTTATCCCTTACACTGCGACGATTTAAGGCCTCTGTGTATATACTCGACTTACAA','AGCTAGTCCCTTATTTTTGCCGATCGAAATAGTGGCATTTGTTGGTTCGAGGGAACGTATGACCGATTGTTGAGTGACCCGATTTATGGGCCGTACCAACTTGCGCGCATACTCCATACTCTACACACACGTTTTACCGAGAGAACTTGAAGCCAATGCGACCTAAACA','CCAAGAGGCTATTAGCGGCACAGGGTGCGTGCCTCGTGATTTCGGAGTGCTTTAGTCCCAAAAAGGAAGCGGCTGAGGTGACTGCGCTTCCTAAGTTTCAACGCGGTGGGTGCGCGCGAAGTCACTAATTTTTGCCGTGGCGGTCTAATAGACAAGCTGGGGTGTGTCG','GGCCGTCGTACATTTGATCTCGGCGGGCCCTCACAGCAGGGTCCTCTGCGTGCACCCCGCATATGAGGACCCCCTAAATTTGTACATGAGTGAGGGTCATAGCTTTTTGCTTACCCCACAACGTAAGTATGATTGATGAGATTCTAGTGCGGGGCTAGTTAATTGTACA','CGCACCAGCCTTGTTCGCTCATCCCGTCATGACATTTTGCTTGTTTGAACCCCTTTTACAGATCAGATACGGCTTAATTTACGACACACAGGCAGTAGATACGCTGGTGGGTGTACAATAAAAAGCACCTTGTACTCAGCCATCGCCGGGCCGCACGATGAAGGTGGCG','ACGCCAAGTGGCTGTGTGGCCGCACGAAGTTTTCATGGACCGAAGCCCAACTACGTCATCAGTTTTTGCAGTGTGCAGCCTTTCGTCCCACGGATCGGTGGTCTAAAGTCGACCTATGCGAGGACGCTTGGTAACATTCGAGCTCTTACCCCTTATAGTCATATGTCCG'],
15)
print(f"k={input[1]}-motif search on {input[0]}")
#soln = randomized_motif_search(*input, iterations=50, debug=False) #takes about 1.5s/10 iters, run 1000 for best soln
#print(soln)
#print_iter(soln[0])
print_sep("")
print("Compute the probability that ten randomly selected 15-mers from the ten 600-nucleotide long strings in the Subtle Motif Problem capture at least one implanted 15-mer. (Allowable error: 0.000001)")
L,n,k = 600,10,15
num_kmers = (L-k+1)
prob_not_capturing = (num_kmers-1)/num_kmers #because we know that it has been implanted once at a particular index in the string
prob_not_capturing_all = math.pow(prob_not_capturing, 10)
prob_capturing_atleast_one = 1 - prob_not_capturing_all
print(f"Probability: {prob_capturing_atleast_one:.8f}")
print("Now, compute the probability of capturing atleast 2 implanted 15-mers")
prob_capturing_exactly_one = (1/num_kmers) * math.pow((num_kmers-1)/num_kmers,9) * 10
prob_capturing_atleast_two = prob_capturing_atleast_one - prob_capturing_exactly_one
print(f"Probability: {prob_capturing_atleast_two:.8f}")
print_sep("Motif search via Gibbs sampling")
input = (['GGAAAAATACTTTGATAGCAGGGCACCCGCGCTCCAAATCTAAGAGTAATTTACAGATAGCGCCGGAAGGGCCGTATTAGTCGCGGACGCATGCCACAATACTGATACCTTCCGGCGTATCCAATGCAGGGTCCACATAAGCTAAATTCCTGCCTACCTCACCGATTTTTTCTCAGATTTGGAAAACAGAAATGGTTGGCCCACAAACGCAGGGGCGTCGAAGAGTGCACGCAACCTCTGATGGAGGAGACTTAAGGCTATAAACCAGGAGACGTAGCCGGAGGTGGGCAAAATCCTCTTAATGTCGATAGTTAGGAAAAATACTTTGA','TAGCAGGGCACCCGCGCTCCAAATCTAAGAGTAATTTACAGATAGCGCCGGAAGGGCCGTATTAGTCGCGGACGCATGCCACAATACTGATACCTTCCGGCGTATCCAATGCAGGGTCCCAGAATGGCTTTTCCACATAAGCTAAATTCCTGCCTACCTCACCGATTTTTTCTCAGATTTGGAAAACAGAAATGGTTGGCCCACAAACGCAGGGGCGTCGAAGAGTGCACGCAACCTCTGATGGAGGAGACTTAAGGCTATAAACCAGGAGACGTAGCCGGAGGTGGGCAAAATCCTCTTAATGTCGATAGTTAGGAAAAATACTTTGA','CTTACCTCCGGCACGTAATCGGCGGGTAGCCATTGTAATAAGTGCCTAAAACTGCCTATCGGATAGGAGTGTCACCCATATGGCTTGCACCGCCGAAATGGGCAACCGATAACATCTGAGCTTAAGAGGCCCAAGGCGTGATGCCCTTTTCGCAGTTAATGATATAGGCCCCTAGGATCGCGGATCGGCGAGATGGAGGCGTTGACACACAAACGAAGCCCCGAATTCTAGGCTTAGAGGTCCTCCCAGCCTATTAGGTCTGTCGCGAAGTGTGGTACCCCAATTGAGTCAACGACAGTTAATGTCGTATTCTCCTTTGAGTGCTAGTG','ATTGCGAAGGTTTTCGCACCCCCAGCAGAGGGACATCATTAAACTCGCAAATATAGTCAGAATGGATGCAATCCCATTACGGAGCACGCACGTGGCATACTGGATATATTGACCGCTGCAGTTCATTGCCTGGTCTACCGTGATCGCTGCAAGCTAGCCAAACTCAGCACCGCGACAGTTGGTCTCGTGGTGTGCGCGCTTTGTGCCGATGTGGTTATAGCGATTCGTGCGTCATCTTGCTCACCCCCATTGTGCTTTTGGCGGCCTCCGATTTCTTGCAGATCGCTTCACTGGCCACGAAGTCCGTAGTAATGATCTTGGCCCATACG','CCGTCAAGGTACAGTCGAAATGAACTCATGGGCGAAAGGCCGATGCCATAAATCCTGCTCATACCAGAACTCGCTTATCCAGACCCCACGAGGCTTTTTGTTTGTGCGGGAGGCGGGTACTATGGACATAATTAGTTCGACACGTCCATGGTGTTGCTCCGAAATACGATACCCACACAGGCGTTGTGTCCTAGGAAGGATAGCCGTTATAAGATACGAAACGCCACGCATGGAAGCTGCCTAGCACCTGGGGTATAGCTACGATTGTGATCGTCATCAGCTAGCCACACCCTTCTTTTGCTCTTGTTCGAAGTAGCCTAAGGTCTAGT','GGCGAAAACAAATCTTCACTTTCGCACTGGAACCGCATCATAATACCGAGCTAGAGGAATAACTCGGTCTAGTAGCATTATGAGTAAGGTGATCTTCGTCCTCGTGCAAAGGCGGTCTCTCTATGCTGTCGGAAACTTGATGTCAGAGGTTCGTCCGAAGTACTGGTATTACTTGATACAATACATACACGCTACCTCTGTTCCCCCATACCACTTTTGCTCTGCATGGCCCTTCATATGTACACCTCCGTTCAAATAAGTGACTACGACCCTCCCGTTGGACGTGAACATACGAGGACGTTCTGTTCCACTAATCGGACAGCTGACAG','GTTCCCTTTGGATTAGGATGCGTTATTCTAAGATTGACTCCGTATCAAGTACTCCGGAATGAGTTCGAGCTTAGAACCGGAATTGACATCTGAATGTCGTTGCTTCCCTTCTGACTCCGGATGAGTTCGGTGCGTTGCTTTAGGGCCAAAGTAATCGCGACCTATGCGATATCCTACAACCTACTTGCGGTGAGGGGGGTAGAATTTCTCAGCTTGCGTTCCCCCATATGGAAGTTTGACGCTCACAATGAACGACATCGCCAGCAAACCAAACACTACGTGTTCGACAAGAGTCTGAAGAGGCGAGCAATATTCACAACCGGGCGGAG','GCTCCATTGTTTGATCGGTCAAGTCAGATCGCCCCTCGTCTGTAATCAACCTCCATGCCCAAGTCATCTGAGGCTCACCTCAATTCTATCGCAGCCACGTTGATTGCGGATGATCCCATTCAACCCATTAAATTTGCGATGACGCTTCGCTAACGTGGGGCCCATGTCTATGTCTCAAACTGTCCAATGATGTGGGATTTAAGCGAAGTCGATACAATCAGCTTGCAGCAGAAGCGGAGGGCGCACTCGTGCATGGTCCAATTGATCACACTTCTCGTCTGCCCCATATACGTTTTGAGAACCACCAACAAGATTTTATAACCCAGGGG','ATGATAACCTTACCCTGCATAACACATATGGTGTGCGCTCCACTCAATTCCACCATTCCGGCACTCACCGAAGGAGCTAGCTACGTCTACGCCCATTACCGAGTAGTTATGACCCCATTAAATTGTTGTTTTCACCTCAGGTCTATTCCTTGAACCTTCACGGGGTTTCTAGGGTATGGGCATCCTGATGACCGTGCACCTAGTATAGACCGTCGTGCGGCGCGGCGAGATAGACCCGTAATGGCTTTTGGACTGAGCAATATACGACCTAGTCTGAGCATTAGTGTAACTGTAGTTTCCGCGCAATTCGTTAGAGGTACCCTTTTAGA','CCCCATATGGCCGATTGTCTCCACGGCAGATGGAAACAAAACAGGCAGTACACTGAGTGGCTCAACGGCTTTGGCAAGACCGAAAAGACATTTGGTACACCTTGTTCTTATCCAAGCGTCGTAGGGCTGTTACAGTGACGATAATTACCATGTGTAGACTGTTAAAAATCTATGTGCCTAGAGCGGCACAGACGAGTTCGAAAGAGGTGTTAATTCGCCATAACTGTCATATACATATTCTCAGAGAGCGTGGCACGCAGGCCCTTCGTTTACCTCTACCGAGATCCGTAACAAGGAGCGAATGCCGAATTGACGGCCAGCAGGGTAAC','TCAATCCGATGCTGTGCCTCCATGCAGACCTAGCTGATGTAGCCGCGGTAATTGCTAATCGGGCTGCTGTTGTTGCAACGATGTGAAGTATTCTTTTGTATACGGAATAACACGCGGGAGCAGTAGACCCGAATACTATATAGACAGAGCTCATGCTCATCATCGTCCTGTCTGGGCCCGTGCACGAAAGTCAGGTCCATTGACTGTTAAACTGTTGCCATCAATGCAGAACATATGGCTTTTCCCGGTCACCAGGGCCCGACAAAGGGACCGCGATCATTCTGGCCCAGCCTGTTTTCAACATGGATATATATTATAGTTCGCCGCGG','TGAACTTATGCTCCTCTGGATCAAGCCTAACCGTAATATGACTATCATAAGAGGACGTACCGATTATGGACTATTCGCAAATACACCGTTTCGAGTATCAGCAATCACTGTTGTCACCTAGAAAAAGCCTCGGCCCTCTTACGAGGAGTGGCCCCAAAAACACCCCACAGCAGGGGCTGTCCACTTGCCTACGTTTGAGAAGTTCTACACGTCTACACTAGGCGAGTTGTTTTAGTTTGGCCCAGAGCATCCTAGCTGAATATGGCTTTTCAAGACTACCTAGGAGTTTGTGAAGGGCGGAAGGGATGGGGAGACAGCGGCCGCCGAGC','CGTCCCCAGTCGGCTTTTGGGCTGCCCGCGCGGAGGTCCATAAATGCTCGAGGTAAATTTATACACCTCCGCGTGACAAGTGAAGAAGTTGGGCGTGCTCATGTAGATCTCAAAGTCCCAAGGCGCGTCGGATTATGCGGCCTCGATCTATCCCAGATGGGGACCAGTGCTTCAAATGAGAAAGTGATGAGCTTGACGCGCCAGTCCTCACATCGACATAACCGAGGTAAAGGATGAGGTTAAGCCGCATTGTTCCATACAGTCGGCCGTGTGGCCGTTCGATCTTGACCATGAAGCCCGGAATATTGTATCCGCGGACGCTTAAAGTA','ACATGTTGAAGCCTCGAGGATTTGAACAGCACACCGTCAATTGGAGGGGAATGGCGTTGGGTGATATTAGTTGTCAAGTTAGAATGACACTATGTTTCTGGAGAGCGCGAGCTTACATTGGCATTTGTCCACAGAAACGCATGCTAATAACACCACCTCTAATAAATTCAACTCCGGTTTTTGAGACACCGTATAATGGCACCGTAGGTCTTCTGAAGGGCCTCTGGACATGTTGCCGGTGGGCCGTTTGTTCTAGCCACGATTACCGTCCGGCTTCCCCCCCATATGCACTTTTGAGCGTTTGGGAGGCCCCGCGGGTGACAAAGAGT','ACCATACAGTACAGCCTCCTATCTCATTCAGAATAGGGGGTACGCCGCCACTAAAGCATCTAATAGCGTATCGGGGATGTGTGGATGCAGGTCACCCTCTAACAGCGGTAGCTAACAATCCAGTTCTCTGGGCACTAATACTGGTGAAGATTCATCCCCATTCGTACGTTGTCCGTTCATCTCCGGTAGACTCTGCATGGTCAACTTATAGGTCTTGGCCCTAGAGCAGATTGACGCAAATGGCGTTCTGGCCGATGCGCTCGACTGTGTCTCACCGATGATATCGGATGTCGCAATCGCTCCAAGCCCCATGCAGCTTTTGAAGGTGT','CGCGCTAGAAGCTTTTAGTGTTGCCGCGCACCCAACAACCTGCAAGTCTTAACACTTATGGAATCACGTTCCGAATCCATGACCAATCCGCGCAATATCTTGAAGCCGTTGCGCAAAAAAGTCAGGATAATAAGGCCTGTCTCAACCTGCACACAAGATTTGAATATGGCCGGCAAATCTGCTTCTGCAACGATGAGTCGATCACACCACGAGACACCGGAGCCCCCATATGGCTGACTAGACATTAAAATCTCGCCCTGGTTCAGTTTGTCGCATGTCAAAATCATATTCGAATTATAATAAATCTGCGCTGCATTAGACCAAGTTCT','GCTGGCGTCGACCTCACCGGAGACCTGGTTGTAAACTATCTGAGTGCAAGTCTGATGGGCGCGCGACTCAGGAACGGAGCGATTGGAGCCCGAGCAGCTTGTCCCCGATTGGCTTTTGGAGTAGCGACGAATTACACTTAAGCTTTAGGACGCTGATACGCAAGCCTCGCTTTGGATGGAACGAGGCAGTACGGTTTTTCCAGGCGGACAGAGATACCTTCTTAGGCGCTAGCGCTCATAGCTCTTGCTCTAACGGTTACTCGAACGCAGCCGCGATTGCTTTTCTGAAGTAGTAAATATAGCTATGACGATCTTGTTGATGCTGCCGC','GTCGGAGCGCAAGTCGGCCGATGAACCGTTGCCCTACGGGCATCCATAGATTGCTTAGTGTAACACGCGAGGATGATAGTGCAGATGGCACGAGTCTCACGACTGAATTTCTAACTTACCGCTTCGTCACAGGACTTTTAAGCTATCGAGACTCACTATCAAGTAACACATTAGGGGTCGGTGAGAGCGGTTAAACTGACTGGGGTTAAAAGGGCACCCTCGCGAGACATATCAGTCGGCCTACTATGGCTTTTGAAGCAGAACGGGCTGATTAAGCTGCCGTCGATGCCTCCCGTCGAGAACCGCTTTGAATCGCAGCAATGAAAGCG','CTAGTTGATCGCGAGTGTATGTGACAACATGTAGAAAAGCGTCACGTTACGCGAGATCATCAATGGCAATTGCAGGGGTCGCCGGCTAATGCCCCGCTTGGCTTTTACTTCCTTCCTATGTGCTTGCGTAAGACCGTTTCAAAGCTCTTTTGAAATAGCCTCTTATAAGTACAATTGATCTTTATGCAATAAAAATAATCTCCGGCTACACCTACAGAAGGTCGGTCGTGCCGATAGATAGCCCTTCCTAGGCCTGTCTGGAAATAGCTTACATTCACATCCGGAATCCTGGCGAAACAAGCGCGTACCCAGTGCCCGGAGGTTATGTA','GAGGATTAGTAGAGGGAAACCGTTCCCGCGCAATAAGCGGTATTTTTTAGCTGTCTAAGGGATACCACTGTAACGTTATACTGAGTCATCCCCTTTGCGGACCCATCAGTAGGCGATTGCGCACAATAAGCGCCCCCTACGAATATTGTGCGGCAGGTGAGGATGATGTGGCGAATCTGTTCCGTACCGTAACGGGCCATATGGCTTTGGTGTGGGGTCCGTGTATCATCCTGAAGCATAGGTGATTGTGGTGTTCTGATGGTCGAACGGTCCCACGGCTGGGGAGTTATGCGCGTTCACGCAAAATCTGTTCATCCGGAGGTTATCGG'],
15, 2000)
#soln = gibbs_sampling_motif_search(*input,pseudo_counts=True)
#print(soln)
#print_iter(soln[0])
print("Greedy solution (gibbs score should be lower):")
#gsoln = greedy_motif_search(input[0],input[1])
#print(gsoln)
#print_iter(gsoln[0])
print_sep("Find the binding sites of the DosR (Dormancy Survival Regulator) transcription factor in TB")
dosr = constants.dataset('tb_dosr')
print(dosr)
dosr_results = {}
rewrite = False
if rewrite or not os.path.exists('dosr_results.json'):
cache_results(dosr_results)
elif os.path.exists('dosr_results.json'):
print("reading")
with open('dosr_results.json','r') as f:
dosr_results = json.loads(f.read())
def cache_results(results):
with open('dosr_results.json','w') as f:
return f.write(json.dumps(results))
print(dosr_results)
regen = False
for k in range(8,13):
print_sep(f"k={k}")
iters = 4000
if not regen and str(k) in dosr_results['gibbs']:
result = dosr_results['gibbs'][str(k)][0]
gibbs = (result['kmers'], result['score'])
else:
gibbs = gibbs_sampling_motif_search(texts=dosr,k=k,iters=iters)
dosr_results['gibbs'].setdefault(str(k), []).append({'kmers': gibbs[0], 'score': gibbs[1]})
cache_results(dosr_results)
print(f"\tGibbs soln: {gibbs}")
iters = 2000
if not regen and str(k) in dosr_results['random']:
result = dosr_results['random'][str(k)][0]
randomr = (result['kmers'], result['score'])
else:
randomr = randomized_motif_search(texts=dosr,k=k,iterations=iters)
dosr_results['random'].setdefault(str(k), []).append({'kmers': randomr[0], 'score': randomr[1]})
cache_results(dosr_results)
print(f"\tRandom soln: {randomr}")
if not regen and str(k) in dosr_results['median']:
result = dosr_results['median'][str(k)][0]
median = (result['kmers'], result['score'])
else:
median = median_strings(texts=dosr, k=k)
dosr_results['median'].setdefault(str(k), []).append({'kmers': median[0], 'score': None})
cache_results(dosr_results)
print(f"\tMedian soln: {median}")
print_sep("Quiz")
texts = ['AAGCCAAA',
'AATCCTGG',
'GCTACTTG',
'ATGTTTTG']
kmers = ['CCA',
'CCT',
'CTT',
'TTG']
kmers = np.array([list(s) for s in kmers])
print(kmers)
profile = profile_motif_matrix(kmers, True)
print(profile)
new_motifs = np.array([list(profile_most_probable_kmer(text, profile, 3)) for text in texts])
print(new_motifs)
print_sep("Application Challenge")
print("Mycobacterium tuberculosis (MTB) can persist in a latent state in humans for many years before causing disease. Latency has been found to be linked to hypoxia (lack of oxygen) in the host. You suspect that genes that are activated in hypoxia are regulated by a common transcription factor, so you collect the upstream sequences for all of the MTB genes that are upregulated in hypoxia, looking for the motif that corresponds to the binding site for the transcription factor regulating these genes. Your biologist colleague tells you that you should look at the 250 bp upstream region of each gene (which have been conveniently compiled for you in a FASTA file named upstream250.txt -- right click and download this file). Your colleague also tells you that the motif is probably about 20 bp long.")
starting_positions="57 139 107 172 114 136 159 143 155 186 178 200 118 137 173 201 160 62 216 165 45 204"
background_probs = [0.188,0.312,0.312,0.188] #A C G T
def estimate_prob(background_probs, seq, ntrials=10000):
l = len(seq)
bases = ['A','C','G','T']
occurrences = 0
for i in range(ntrials):
random_seq = ''.join([bases[i] for i in random.choices(range(len(background_probs)), weights=background_probs, k=l)])
if random_seq == seq:
occurrences += 1
return occurrences/ntrials
print(f"The probability of ACGT occurring by chance is: {estimate_prob(background_probs, 'ACGT')*100:.6f}%")
if __name__ == '__main__':
sys.exit(pytest.main(["-s", "week4.py::test_week4"])) #-s to not suppress prints