-
Notifications
You must be signed in to change notification settings - Fork 0
/
Needleman.hs
127 lines (106 loc) · 5.36 KB
/
Needleman.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
module Needleman where
import Test.HUnit
import Data.Array
import Text.Printf
type Matrix = Array (Int,Int) Double
type Sequences = (String, String)
type Alignments = [Sequences]
match_score = 1
mismatch_score = -1
gap = -1
{- scoringMatrix string1 string2
Creates matrix of partial alignment scores for two strings with Needleman-Wunsch.
RETURNS: Array of size (length string1 * length string2).
Disclaimer: The general structure of the function was obtained from
http://jelv.is/blog/Lazy-Dynamic-Programming/
-}
scoringMatrix :: [String] -> [String] -> Matrix
scoringMatrix seqsA seqsB = matrix where
m = length (seqsA!!0)
n = length (seqsB!!0)
score :: Int -> Int -> Double
score 0 0 = 0
score i 0 = (fromIntegral i)*(fromIntegral gap)
score 0 j = (fromIntegral j)*(fromIntegral gap)
score i j = maximum [(matrix ! (i-1, j-1)) + sumPairScore seqsA (i-1) seqsB (j-1),
(matrix ! (i, j-1)) + (fromIntegral gap), -- Cost of gaps in seqsB
(matrix ! (i-1, j)) + (fromIntegral gap)] -- Cost of gaps in seqsA
matrix = array ((0,0),(m,n)) [((x,y), score x y) | x<-[0..m], y<-[0..n]]
{- sumPairScore seqsA posA seqsB posB
Calculates the sum-of-pairs score of aligning sequences or alignments with each other.
PRE: posA is a valid index for (seqsA!!0) and posB for (seqsB!!0). seqsA and seqsB are non-empty.
RETURNS: sum-of-pair score of aligning symbols at posA in seqsA with symbols at posB in seqsB.
EXAMPLES: sumPairScore ["TEST"] 0 ["TEST"] 0 = 1 (given match_score = 1)
sumPairScore ["TEST","TEST"] 0 ["TEST"] 0 = 2
sumPairScore ["TEST","TEST"] 0 ["TEST","TEST"] 0 = 4
sumPairScore ["TEST"] 0 ["TEST"] 1 = -1 (given mismatch_score = -1)
sumPairScore ["-EST"] 0 ["-EST"] 0 = 0 (gap to gap not punished)
-}
sumPairScore :: [String] -> Int -> [String] -> Int -> Double
sumPairScore seqsA posA seqsB posB = pair_score/comparisons where
pair_score = fromIntegral (sum [spAux a posA b posB | a <- seqsA, b <- seqsB])
comparisons = fromIntegral ((length seqsA)*(length seqsB))
{- spAux seqA posA seqB posB
Find score of aligning two symbols.
PRE: seqA and seqB are non-empty.
RETURNS: score of aligning symbol (seqA!!posA) with (seqB!!posB)
EXAMPLES: spAux "test" 0 "test" 0 = 1
-}
spAux :: String -> Int -> String -> Int -> Int
spAux a posA b posB
| a!!posA == '-' && b!!posB == '-' = 0
| a!!posA == '-' || b!! posB == '-' = gap
| a!!posA == b!!posB = match_score
| otherwise = mismatch_score
{- traceBack sequencesA sequencesB
Finds the sequence of operations for the best path through the score matrix.
PRE: seqsA and seqsB are either single sequences or multiple alignments
(so all strings in seqA and seqB will be of the same length).
RETURNS: Alignment of sequencesA and sequencesB (can be alignments or single sequences).
-}
traceBack :: [String] -> [String] -> [String]
traceBack seqsA seqsB = traceAux seqsA i seqsB j m outA outB where
m = scoringMatrix seqsA seqsB
i = length (seqsA!!0)
j = length (seqsB!!0)
outA = ["" | x <- [1..(length seqsA)]]
outB = ["" | y <- [1..(length seqsB)]]
{- traceAux a i b j matrix x y
Generates alignment by checking how each position was reached, gap or match.
RETURNS: Alignment x and y of sequences a and b generated by recursively tracing the matrix m at indices i and j.
VARIANT: i + j
-}
traceAux :: [String] -> Int -> [String] -> Int -> Matrix-> [String] -> [String] -> [String]
traceAux _ 0 _ 0 _ outA outB = outA ++ outB
traceAux seqsA i seqsB j m outA outB
| i > 0 && j > 0 && m!(i,j) == m!(i-1,j-1) + (sumPairScore seqsA (i-1) seqsB (j-1)) = traceAux seqsA (i-1) seqsB (j-1) m (alignSeqs seqsA outA (i-1)) (alignSeqs seqsB outB (j-1))
| i > 0 && m!(i,j) == m!(i-1,j) + (fromIntegral gap) = traceAux seqsA (i-1) seqsB j m (alignSeqs seqsA outA (i-1)) (insertGaps outB)
| otherwise = traceAux seqsA i seqsB (j-1) m (insertGaps outA) (alignSeqs seqsB outB (j-1))
{- alignSeqs inseqs outseqs i
- Add aligned symbols to alignment being generated.
- PRE: inseqs and outseqs are non-empty and of equal length.
- RETURNS: symbols at position i in inseqs aligned in outseqs.
- EXAMPLES: alignSeqs ["aww","aww"] ["",""] 0 = ["a","a"]
- alignSeqs ["ses", "hos", "des"] ["nel","est","or-"] 0 = ["snel","hest","dor-"]
-}
alignSeqs :: [String] -> [String] -> Int -> [String]
alignSeqs inseqs outseqs i = zipWith (++) [[inseq!!i] | inseq <- inseqs] outseqs
{- insertGaps outseqs
- Add gaps to alignments being generated.
- RETURNS: outseqs with one gap (-) added to the beginning of each seq in outseqs.
- EXAMPLES: insertGaps ["","", ""] = ["-","-","-"]
- insertGaps ["te","he", "ha"] = ["-te","-he","-ha"]
-}
insertGaps :: [String] -> [String]
insertGaps outseqs = ["-" ++ outseq | outseq <- outseqs]
------------ Functions to inspect matrices ------------
printMatrix :: PrintfArg a => Array (Int,Int) a -> IO ()
printMatrix matrix = printMatrixAux matrix 0 0
printMatrixAux :: PrintfArg a => Array (Int,Int) a -> Int -> Int -> IO()
printMatrixAux matrix x y
| x /= m = do printf "%7.3f" (matrix ! (x,y))
printMatrixAux matrix (x+1) y
| y < n = do printf "%7.3f\n" (matrix ! (x,y))
printMatrixAux matrix 0 (y+1)
| otherwise = printf "%7.3f\n" (matrix ! (x,y))
where (m,n) = snd (bounds matrix)