-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmsp_mul32_fios.s43
215 lines (163 loc) · 4.54 KB
/
msp_mul32_fios.s43
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#include "msp430.h"
NAME msp_mul32_fios
EXTERN mp_freeze
EXTERN mp_cprop
PUBLIC mp_mulmod32_fios
RSEG CODE
;-----------------------------------------------------------------------------
; mp_mulmod32_fios
;
; Montgomery modular multiplication T = A * B * R^-1 mod (2^255 - 19) using
; the Finely Integrated Operand Scanning (FIOS) method and the 32-bit
; hardware multiplier (MPY32). Operands are s = 8 little-endian 32-bit limbs,
; each limb stored as two 16-bit words (low word at the lower address).
;
; In:    R12 = T   result buffer. It MUST be 40 bytes long: limbs t[0..7]
;                  plus the two scratch limbs t[s] and t[s+1] (the code
;                  touches offsets 0 .. 39).
;                  T is read before it is written (t[0] is loaded in the very
;                  first outer iteration), so the routine accumulates onto
;                  whatever T holds on entry.
;                  NOTE(review): presumably the caller zeroes all 40 bytes
;                  and T does not alias A or B - confirm against callers.
;        R13 = A   first operand  (32 bytes, Montgomery representation)
;        R14 = B   second operand (32 bytes, Montgomery representation)
;        R15       is added to R14 to form the end-of-B pointer, so it must
;                  hold the length of B in bytes.
;                  NOTE(review): presumably always 32, since every other
;                  bound in this routine is hard-coded for s = 8 - confirm.
; Out:   T = reduced product; the final call to mp_freeze is made with
;        R12 = T and R13 = 0.
; Saves: R4-R11 (PUSHM/POPM). R13, R14, R15 are modified.
; Calls: mp_cprop, mp_freeze (external; contracts not visible here).
;        NOTE(review): this code relies on mp_cprop leaving R4-R9 and
;        R12-R15 intact - confirm.
;        NOTE(review): the MPY32 state is not protected; an interrupt handler
;        using the multiplier would presumably corrupt it - confirm.
;
; Register roles inside the loops:
;        R4:R7   m  (low:high), reused as scratch in the post-processing
;        R5:R6   S  (low:high) for the first column, then R5 = &t[j]
;        R6:R11  carry C (low:high) at the start of each column
;        R8      &a[j]          R9   &a[s-1] (inner loop bound)
;        R10     &t[s]          R14  &b[i]     R15  end of B
;
; The MPY32 result registers are read with absolute addressing (&RESx), the
; same mode used for all the operand writes. Symbolic (PC-relative) reads
; cannot reach the peripheral when this code is linked far away from it.
;-----------------------------------------------------------------------------

; n'[0] = -n^-1 mod 2^32 = 0x286BCA1B   (0x286BCA1B * 19 = 3*2^32 + 1)
FIOS_NINV_LO    EQU     0xCA1B
FIOS_NINV_HI    EQU     0x286B
; n = 2^255 - 19: n[0] = 0xFFFFFFED, n[1..6] = 0xFFFFFFFF, n[7] = 0x7FFFFFFF
FIOS_N0_LO      EQU     0xFFED
FIOS_N0_HI      EQU     0xFFFF
FIOS_NMID_LO    EQU     0xFFFF
FIOS_NMID_HI    EQU     0xFFFF
FIOS_NTOP_LO    EQU     0xFFFF
FIOS_NTOP_HI    EQU     0x7FFF
; Byte offsets
FIOS_OFF_TS     EQU     32              ; t[s] relative to T
FIOS_OFF_CEND   EQU     34              ; limit handed to mp_cprop in R11
FIOS_OFF_ALAST  EQU     28              ; a[s-1] relative to A

mp_mulmod32_fios
        PUSHM.W #8,R11                  ; Save R4-R11

        ; ---------------------------------------------------------------
        ; Outer loop: i = 0 .. s-1, R14 walks over b[i]
        ; ---------------------------------------------------------------
        ADD     R14,R15                 ; R15 = B + length: end-of-B sentinel
        MOV     R12,R10
        ADD     #FIOS_OFF_TS,R10        ; R10 -> t[s] (t[s+1] follows it)

outer_for2
        ; (C,S) = t[0]*1 + a[0]*b[i]
        MOV     0x0(R12),&MPY32L        ; t[0]
        MOV     0x2(R12),&MPY32H
        MOV     #1,&OP2L                ; * 1 loads t[0] into the result
        MOV     #0,&OP2H
        MOV     0x0(R13),&MAC32L        ; a[0]
        MOV     0x2(R13),&MAC32H
        MOV     0x0(R14),&OP2L          ; b[i]
        MOV     0x2(R14),&OP2H
        MOV     &RES0,R5                ; S -> R5 (low)
        MOV     &RES1,R6                ;      R6 (high)

        ; ADD(t[1],C): add the upper half of the result into t[1] and let
        ; mp_cprop carry on from there
        PUSH.W  R10                     ; R10 is needed as mp_cprop argument
        MOV     R12,R10
        MOV     R12,R11
        ADD     #6,R10                  ; R10 -> high word of t[1]
        ADD     #FIOS_OFF_CEND,R11      ; R11 = T+34 (propagate into t[s], t[s+1])
        ADD     &RES2,-2(R10)           ; t[1].lo += C.lo
        ADDC    &RES3,0(R10)            ; t[1].hi += C.hi + carry
        CALLA   #mp_cprop               ; Carry flag is still live here
        POP.W   R10                     ; R10 -> t[s] again

        ; m = S * n'[0] mod 2^32
        MOV     R5,&MPY32L              ; S
        MOV     R6,&MPY32H
        MOV     #FIOS_NINV_LO,&OP2L     ; n'[0]
        MOV     #FIOS_NINV_HI,&OP2H
        MOV     &RES0,R4                ; m -> R4 (low)
        MOV     &RES1,R7                ;      R7 (high)

        ; (C,S) = S*1 + m*n[0]   (S is discarded, only C is used)
        MOV     R5,&MPY32L              ; S
        MOV     R6,&MPY32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     R4,&MAC32L              ; m
        MOV     R7,&MAC32H
        MOV     #FIOS_N0_LO,&OP2L       ; n[0]
        MOV     #FIOS_N0_HI,&OP2H

        ; ---------------------------------------------------------------
        ; Inner loop: j = 1 .. s-2 (iteration j = s-1 is unrolled below)
        ; ---------------------------------------------------------------
        MOV     R13,R8
        ADD     #4,R8                   ; R8 -> a[1]
        MOV     R13,R9
        ADD     #FIOS_OFF_ALAST,R9      ; R9 -> a[s-1], loop bound
        MOV     R12,R5
        ADD     #4,R5                   ; R5 -> t[1]

inner_for2
        MOV     &RES2,R6                ; C from the previous column
        MOV     &RES3,R11

        ; (C,S) = t[j]*1 + C*1 + a[j]*b[i]
        MOV     0x0(R5),&MPY32L         ; t[j]
        MOV     0x2(R5),&MPY32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     R6,&MAC32L              ; C
        MOV     R11,&MAC32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     0x0(R8),&MAC32L         ; a[j]
        MOV     0x2(R8),&MAC32H
        MOV     0x0(R14),&OP2L          ; b[i]
        MOV     0x2(R14),&OP2H

        ; ADD(t[j+1],C)
        PUSH.W  R10                     ; R10 is needed as mp_cprop argument
        MOV     R5,R10
        MOV     R12,R11
        ADD     #FIOS_OFF_CEND,R11      ; R11 = T+34 (propagate into t[s], t[s+1])
        ADD     #6,R10                  ; R10 -> high word of t[j+1]
        ADD     &RES2,-2(R10)           ; t[j+1].lo += C.lo
        ADDC    &RES3,0(R10)            ; t[j+1].hi += C.hi + carry
        CALLA   #mp_cprop
        POP.W   R10                     ; R10 -> t[s] again

        ; (C,S) = S*1 + m*n[j]
        MOV     &RES0,&MPY32L           ; S (RES0/RES1 still hold it)
        MOV     &RES1,&MPY32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     R4,&MAC32L              ; m
        MOV     R7,&MAC32H
        MOV     #FIOS_NMID_LO,&OP2L     ; n[j] = 0xFFFFFFFF for j = 1 .. s-2
        MOV     #FIOS_NMID_HI,&OP2H

        ; t[j-1] = S; low word goes to the lower address, matching the way
        ; t[] is loaded into MPY32L/MPY32H above
        MOV     &RES0,-4(R5)
        MOV     &RES1,-2(R5)

        ADD     #4,R8                   ; R8 -> a[++j]
        ADD     #4,R5                   ; R5 -> t[j]
        CMP     R8,R9                   ; Leave the loop once j = s-1
        JNZ     inner_for2

        ; ---------------------------------------------------------------
        ; Unrolled last column, j = s-1
        ; ---------------------------------------------------------------
        MOV     &RES2,R6                ; C from the previous column
        MOV     &RES3,R11

        ; (C,S) = t[s-1]*1 + C*1 + a[s-1]*b[i]
        MOV     0x0(R5),&MPY32L         ; t[s-1]
        MOV     0x2(R5),&MPY32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     R6,&MAC32L              ; C
        MOV     R11,&MAC32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     0x0(R8),&MAC32L         ; a[s-1]
        MOV     0x2(R8),&MAC32H
        MOV     0x0(R14),&OP2L          ; b[i]
        MOV     0x2(R14),&OP2H

        ; ADD(t[s],C), carried by hand into t[s+1]
        ADD     &RES2,0x0(R10)
        ADDC    &RES3,0x2(R10)
        ADC     0x4(R10)
        ADC     0x6(R10)

        ; (C,S) = S*1 + m*n[s-1]
        MOV     &RES0,&MPY32L           ; S
        MOV     &RES1,&MPY32H
        MOV     #1,&OP2L
        MOV     #0,&OP2H
        MOV     R4,&MAC32L              ; m
        MOV     R7,&MAC32H
        MOV     #FIOS_NTOP_LO,&OP2L     ; n[s-1] = 0x7FFFFFFF
        MOV     #FIOS_NTOP_HI,&OP2H

        ; t[s-2] = S
        MOV     &RES0,-4(R5)
        MOV     &RES1,-2(R5)

        ; ---------------------------------------------------------------
        ; Post-processing of the two scratch limbs
        ; ---------------------------------------------------------------
        ; (C,S) = t[s] + C
        MOV     0x0(R10),R4             ; m is dead, reuse R4/R7 for S
        MOV     0x2(R10),R7
        ADD     &RES2,R4
        ADDC    &RES3,R7                ; Carry out of this add is consumed
                                        ; by the ADC pair below; the MOVs in
                                        ; between do not touch the flags
        MOV     R4,0x0(R5)              ; t[s-1] = S
        MOV     R7,0x2(R5)
        MOV     0x4(R10),0x0(R10)       ; t[s] = t[s+1] ...
        MOV     0x6(R10),0x2(R10)
        ADC     0x0(R10)                ; ... + C
        ADC     0x2(R10)
        CLR     0x4(R10)                ; t[s+1] = 0
        CLR     0x6(R10)

        ADD     #4,R14                  ; R14 -> b[++i]
        CMP     R15,R14                 ; Done once the end of B is reached
        JNZ     outer_for2

        POPM.W  #8,R11                  ; Restore R4-R11
        CLR     R13                     ; R13 = 0 is handed to mp_freeze
                                        ; NOTE(review): meaning of this
                                        ; argument is defined by mp_freeze
        CALLA   #mp_freeze              ; Per the original author's note:
                                        ; conditionally subtracts 2^255-19
                                        ; in constant time
        RETA
END