//
// GHASH.c
//
// Implementation of the NIST SP800-38D GHASH function which is the
// core authentication function for the GCM and GMAC modes.
//
// This implementation was done by Niels Ferguson for the RSA32.lib library in 2008,
// and adapted to the SymCrypt library in 2009.
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "precomp.h"
#include "ghash_definitions.h"
//////////////////////////////////////////////////////////////////////////////
// Platform-independent code
//
//
// GHashExpandKeyC
// Generic GHash key expansion routine, works on all platforms.
// This function computes a table of H, Hx, Hx^2, Hx^3, ..., Hx^127
//
VOID
SYMCRYPT_CALL
SymCryptGHashExpandKeyC(
_Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey,
_In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH )
{
UINT64 H0, H1, t;
UINT32 i;
//
// (H1, H0) form a 128-bit integer, H1 is the upper part, H0 the lower part.
// Convert pH[] to (H1, H0) using MSByte first convention.
//
H1 = SYMCRYPT_LOAD_MSBFIRST64( &pH[0] );
H0 = SYMCRYPT_LOAD_MSBFIRST64( &pH[8] );
for( i=0; i<SYMCRYPT_GF128_FIELD_SIZE; i++ )
{
expandedKey[i].ull[0] = H0;
expandedKey[i].ull[1] = H1;
//
// Multiply (H1,H0) by x in the GF(2^128) field using the field encoding from SP800-38D
//
t = UINT64_NEG(H0 & 1) & ((UINT64)GF128_FIELD_R_BYTE << (8 * ( sizeof( UINT64 ) - 1 )) ) ;
H0 = (H0 >> 1) | (H1 << 63);
H1 = (H1 >> 1) ^ t;
}
}
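//
// Illustrative sketch (not part of the build): the multiply-by-x step above written
// with plain uint64_t types. (h1,h0) hold the element in the same convention as the
// loop above (h1 = first 8 bytes of the block, loaded MSB first), and 0xE1 is assumed
// to be the value of GF128_FIELD_R_BYTE, i.e. the reduction constant R = 11100001 || 0^120
// from SP800-38D.
//
//      #include <stdint.h>
//
//      static void gf128_mul_x( uint64_t *h1, uint64_t *h0 )
//      {
//          // If the bit shifted out (the coefficient of x^127) is set,
//          // xor the reduction byte 0xE1 into the most significant byte.
//          uint64_t r = ((uint64_t)0 - (*h0 & 1)) & ((uint64_t)0xE1 << 56);
//          *h0 = (*h0 >> 1) | (*h1 << 63);
//          *h1 = (*h1 >> 1) ^ r;
//      }
//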
//
// GHashAppendDataC
// Generic GHash routine, works on all platforms.
//
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataC(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
UINT64 R0, R1;
UINT64 mask;
SYMCRYPT_ALIGN UINT32 state32[4];
UINT32 t;
int i,j;
while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
{
R0 = R1 = 0;
//
// We have two nested loops so that we can do most of our operations
// on 32-bit words. 64-bit rotates/shifts can be really slow on a 32-bit CPU.
// On AMD64 we use the XMM version which is much faster.
//
state32[0] = (UINT32)pState->ull[0];
state32[1] = (UINT32)(pState->ull[0] >> 32);
state32[2] = (UINT32)pState->ull[1];
state32[3] = (UINT32)(pState->ull[1] >> 32);
for( i=0; i<4; i++ )
{
t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ state32[3-i];
for( j=31; j>=0; j-- )
{
mask = (UINT64)( -(INT64)(t & 1 ));
R0 ^= expandedKeyTable[32*i+j].ull[0] & mask;
R1 ^= expandedKeyTable[32*i+j].ull[1] & mask;
t >>= 1;
}
}
pState->ull[0] = R0;
pState->ull[1] = R1;
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
}
SymCryptWipeKnownSize( state32, sizeof( state32 ) );
}
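//
// What the nested loops above compute: if the xor of the data block and the previous
// state has polynomial bits a_0, a_1, ..., a_127 (in the SP800-38D bit order), then
//
//      newState = (oldState xor data) * H = xor over all k with a_k == 1 of ( H * x^k )
//
// where H * x^k is exactly expandedKeyTable[k]. The mask turns each bit a_k into an
// all-ones or all-zero 64-bit value, so the table entries are combined without any
// data-dependent branches or memory addresses.
//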
VOID
SYMCRYPT_CALL
SymCryptGHashResult(
_In_ PCSYMCRYPT_GF128_ELEMENT pState,
_Out_writes_( SYMCRYPT_GF128_BLOCK_SIZE ) PBYTE pbResult )
{
SYMCRYPT_STORE_MSBFIRST64( pbResult , pState->ull[1] );
SYMCRYPT_STORE_MSBFIRST64( pbResult + 8, pState->ull[0] );
}
////////////////////////////////////////////////////////////////////////////////////////////
// XMM code
//
VOID
SYMCRYPT_CALL
SymCryptGHashExpandKeyXmm(
_Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey,
_In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH )
{
//
// We use the same layout for XMM code as we did for C code, so we can use the same key
// expansion code.
// Improvement: we can add an expansion routine that uses the XMM registers for speed.
//
SymCryptGHashExpandKeyC( expandedKey, pH );
}
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
//
// The XMM-based GHash append data function, only on AMD64 & X86
//
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataXmm(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
__m128i R;
__m128i cmpValue;
__m128i mask;
__m128i T;
__m128i tmp;
PCSYMCRYPT_GF128_ELEMENT p;
PCSYMCRYPT_GF128_ELEMENT pLimit;
UINT32 t;
int i;
cmpValue = _mm_setzero_si128(); // cmpValue = 0
while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
{
R = _mm_setzero_si128();
//
// The amd64 compiler can't optimize array indices in a loop where
// you use _mm intrinsics,
// so we do all the pointer arithmetic for the compiler.
//
p = &expandedKeyTable[0];
pLimit = &expandedKeyTable[32];
for( i=0; i<4; i++ )
{
//
// Set up our XMM register with 4 identical 32-bit integers so that
// we can generate the mask from the individual bits of the 32-bit value.
// Note the use of tmp; if we assign directly to the fields of T the
// compiler no longer caches T in an XMM register, which is bad.
//
// There are XMM instructions where we can do the duplication in the XMM
// registers, but they require SSE3 support, and this code only requires
// SSE2. As the inner loop consumes most of the time, it isn't worth
// using the SSE3 instructions.
//
// Note that accessing the state as an array of UINT32s depends on the
// endianness of the CPU, but this is XMM code that only runs on
// little endian machines.
//
t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i];
tmp = _mm_set_epi32(t, t, t, t);
T = tmp;
while( p < pLimit )
{
//
// p and pLimit are always at indexes that are multiples of 4 from
// the start of the array.
// We need to explain to prefast that this means that p <= pLimit - 4
//
SYMCRYPT_ASSERT( p <= pLimit - 4 );
mask = _mm_cmpgt_epi32( cmpValue, T );
T = _mm_add_epi32( T, T );
mask = _mm_and_si128( mask, p[0].m128i );
R = _mm_xor_si128( R, mask );
mask = _mm_cmpgt_epi32( cmpValue, T );
T = _mm_add_epi32( T, T );
mask = _mm_and_si128( mask, p[1].m128i );
R = _mm_xor_si128( R, mask );
mask = _mm_cmpgt_epi32( cmpValue, T );
T = _mm_add_epi32( T, T );
mask = _mm_and_si128( mask, p[2].m128i );
R = _mm_xor_si128( R, mask );
mask = _mm_cmpgt_epi32( cmpValue, T );
T = _mm_add_epi32( T, T );
mask = _mm_and_si128( mask, p[3].m128i );
R = _mm_xor_si128( R, mask );
p += 4;
}
pLimit += 32;
}
pState->m128i = R;
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
}
}
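//
// How the mask generation above works: with cmpValue == 0, _mm_cmpgt_epi32( cmpValue, T )
// produces 0xFFFFFFFF in every 32-bit lane whose value is negative, i.e. whose top bit is
// set. All four lanes of T hold the same value t, so the whole 128-bit mask is either all
// ones (current bit of t set) or all zeroes. T = T + T then moves the next lower bit of t
// into the sign position, so the bits of t are consumed from bit 31 down to bit 0, pairing
// the high bit of t with the first table entry of the current group of 32. This selects the
// same table entries as the generic C loop, just in the opposite traversal order.
//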
#endif
#if SYMCRYPT_CPU_ARM | SYMCRYPT_CPU_ARM64
//
// The NEON-based GHash append data function, only on ARM & ARM64
//
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataNeon(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
// Room for improvement: replace the non-crypto NEON code below, which is based on a
// bit-by-bit lookup, with pmull on 8b elements - 8x(8bx8b) -> 8x(16b) pmull has been a NEON instruction since Armv7
//
// When properly unrolled:
// 1 (64bx64b -> 128b) pmull instruction and 1 eor instruction can be replaced by
// 8 (8x(8bx8b) -> 8x(16b)) pmull instructions and 8 eor instructions
// so each 128b of data could be processed by fewer than 64 instructions (using Karatsuba)
// rather than ~512 instructions (bit by bit)
//
// Not a priority; we expect that AES-GCM performance will be dominated by AES on these platforms
__n128 R;
__n128 cmpValue;
__n128 mask;
__n128 T;
PCSYMCRYPT_GF128_ELEMENT p;
PCSYMCRYPT_GF128_ELEMENT pLimit;
UINT32 t;
int i;
cmpValue = vdupq_n_u32(0); // cmpValue = 0
while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
{
R = cmpValue;
//
// Do all the pointer arithmetic for the compiler.
//
p = &expandedKeyTable[0];
pLimit = &expandedKeyTable[32];
for( i=0; i<4; i++ )
{
//
// Set up our NEON register with 4 identical 32-bit integers so that
// we can generate the mask from the individual bits of the 32-bit value.
//
// Note that accessing the state as an array of UINT32s depends on the
// endianness of the CPU, but Arm code is always expected to execute in
// little endian mode.
//
t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i];
T = vdupq_n_u32( t );
while( p < pLimit )
{
//
// p and pLimit are always at indexes that are multiples of 4 from
// the start of the array.
// We need to explain to prefast that this means that p <= pLimit - 4
//
SYMCRYPT_ASSERT( p <= pLimit - 4 );
mask = vcgtq_s32( cmpValue, T );
T = vaddq_u32( T, T );
mask = vandq_u32( mask, p[0].n128 );
R = veorq_u32( R, mask );
mask = vcgtq_s32( cmpValue, T );
T = vaddq_u32( T, T );
mask = vandq_u32( mask, p[1].n128 );
R = veorq_u32( R, mask );
mask = vcgtq_s32( cmpValue, T );
T = vaddq_u32( T, T );
mask = vandq_u32( mask, p[2].n128 );
R = veorq_u32( R, mask );
mask = vcgtq_s32( cmpValue, T );
T = vaddq_u32( T, T );
mask = vandq_u32( mask, p[3].n128 );
R = veorq_u32( R, mask );
p += 4;
}
pLimit += 32;
}
pState->n128 = R;
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
}
}
#endif
//////////////////////////////////////////////////////////////////////////////////////
// Pclmulqdq implementation
//
/*
GHASH GF(2^128) multiplication using PCLMULQDQ
The GF(2^128) field used in GHASH is GF(2)[x]/p(x) where p(x) is the primitive polynomial
x^128 + x^7 + x^2 + x + 1
Notation: We use the standard mathematical notation '+' for the addition in the field,
which corresponds to a xor of the bits.
Multiplication:
Given two field elements A and B (represented as 128-bit values),
we first compute the polynomial product
(C,D) := A * B
where C and D are also 128-bit values.
The PCLMULQDQ instruction performs a 64 x 64 -> 128 bit carryless multiplication.
To multiply 128-bit values we write A = (A1, A0) and B = (B1, B0) in two 64-bit halves.
The schoolbook multiplication is computed by
(C, D) = (A1 * B1)x^128 + (A1 * B0 + A0 * B1)x^64 + (A0 * B0)
This requires four PCLMULQDQ instructions. The middle 128-bit result has to be shifted
left and right, and each half added to the upper and lower 128-bit result to get (C,D).
Alternatively, the middle 128-bit intermediate result can be computed using Karatsuba:
(A1*B0 + A0*B1) = (A1 + A0) * (B1 + B0) + (A1*B1) + (A0*B0)
This requires only one PCLMULQDQ instruction to multiply (A1 + A0) by (B1 + B0)
as the other two products are already computed.
Whether this is faster depends on the relative speed of shift/xor versus PCLMULQDQ.
Both multiplication algorithms produce three 128-bit intermediate results (R1, Rmid, R0),
with the full result defined by R1 x^128 + Rmid x^64 + R0.
If we do Multiply-Accumulate then we can accumulate the three 128-bit intermediate results
directly. As there are no carries, there is no overflow, and the combining of the three
intermediate results into a 256-bit result can be shared amongst all multiplications.
Modulo reduction:
We use << and >> to denote shifts on 128-bit values.
The modulo reduction can now be done as follows:
given a 256-bit value (C,D) representing C x^128 + D we compute
(T1,T0) := C + C*x + C * x^2 + C * x^7
R := D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7)
(T1,T0) is just the value C x^128 reduced one step modulo p(x). The value T1 is at most 7 bits,
so in the next step the reduction, which computes the result R, is easy. The
expression T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) is just T1 * x^128 reduced modulo p(x).
Let's first get rid of the polynomial arithmetic and write this completely using shifts on
128-bit values.
T0 := C + (C << 1) + (C << 2) + (C << 7)
T1 := (C >> 127) + (C >> 126) + (C >> 121)
R := D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7)
We can optimize this by rewriting the equations
T2 := T1 + C
= C + (C>>127) + (C>>126) + (C>>121)
R = D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7)
= D + C + (C << 1) + (C << 2) + (C << 7) + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7)
= D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7)
Thus
T2 = C + (C>>127) + (C>>126) + (C>>121)
R = D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7)
This gets the right result and uses only 6 shifts.
The SSE instruction set does not implement bit-shifts of 128-bit values. Instead, we will
use bit-shifts of the 32-bit subvalues, and byte shifts (shifts by a multiple of 8 bits)
on the full 128-bit values.
We use the <<<< and >>>> operators to denote shifts on 32-bit subwords.
We can now do the modulo reduction by
t1 := (C >> 127) = (C >>>> 31) >> 96
t2 := (C >> 126) = (C >>>> 30) >> 96
t3 := (C >> 121) = (C >>>> 25) >> 96
T2 = C + t1 + t2 + t3
The left-shifts in the computation of R are a bit more involved as we have to move bits from
one subword to the next
u1 := (T2 << 1) = (T2 <<<< 1) + ((T2 >>>> 31) << 32)
u2 := (T2 << 2) = (T2 <<<< 2) + ((T2 >>>> 30) << 32)
u3 := (T2 << 7) = (T2 <<<< 7) + ((T2 >>>> 25) << 32)
R = D + T2 + u1 + u2 + u3
We can eliminate some common subexpressions. For any k we have
(T2 >>>> k) = ((C + r) >>>> k)
where r is a 7-bit value. If k>7 then this is equal to (C >>>> k). This means that
the value (T2 >>>> 31) is equal to (C >>>> 31) so we don't have to compute it again.
So we can rewrite our formulas as
t4 := (C >>>> 31)
t5 := (C >>>> 30)
t6 := (C >>>> 25)
ts = t4 + t5 + t6
T2 = C + (ts >> 96)
Note that ts = (C >>>> 31) + (C >>>> 30) + (C >>>> 25)
which is equal to (T2 >>>> 31) + (T2 >>>> 30) + (T2 >>>> 25)
R = D + T2 + u1 + u2 + u3
= D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32)
All together, we can do the modulo reduction using the following formulas
ts := (C >>>> 31) + (C >>>> 30) + (C >>>> 25)
T2 := C + (ts >> 96)
R = D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32)
Using a total of 16 operations. (6 subword shifts, 2 byte shifts, and 8 additions)
Reversed bit order:
There is one more complication. GHASH uses the bits in the reverse order from normal representation.
The bits b_0, b_1, ..., b_127 represent the polynomial b_0 + b_1 * x + ... + b_127 * x^127.
This means that the most significant bit in each byte is actually the least significant bit in the
polynomial.
SSE CPUs use the LSBFirst convention. This means that the bits b_0, b_1, ..., b_127 of the polynomial
end up at positions 7, 6, 5, ..., 1, 0, 15, 14, ..., 9, 8, 23, 22, ... of our XMM register.
This is obviously not a useful representation to do arithmetic in.
The first step is to BSWAP the value so that the bits appear in pure reverse order.
That is at least algebraically useful.
To compute the multiplication we use the fact that GF(2)[x] multiplication has no carries and
thus no preference for bit order. After the BSWAP we don't have the values A and B, but rather
rev(A) and rev(B) where rev() is a function that reverses the bit order. We can now compute
rev(A) * rev(B) = rev( A*B ) >> 1
where the shift operator is on the 256-bit product.
The modulo reduction remains the same, except that we change all the shifts to be the other direction.
This gives us finally the outline of our multiplication:
- Apply BSWAP to all values loaded from memory.
A := BSWAP( Abytes )
B := BSWAP( Bbytes )
- Compute the 256-bit product, possibly using Karatsuba.
(P1, P0) := A * B // 128x128 carryless multiplication
- Shift the result left one bit.
(Q1, Q0) := (P1, P0) << 1
which is computed as
Q0 = (P0 <<<< 1) + (P0 >>>> 31) << 32
Q1 = (P1 <<<< 1) + (P1 >>>> 31) << 32 + (P0 >>>> 31) >> 96
- Perform the modulo reduction, with reversed bit order
ts := (Q0 <<<< 31) + (Q0 <<<< 30) + (Q0 <<<< 25)
T2 := Q0 + (ts << 96)
R = Q1 + T2 + (T2 >>>> 1) + (T2 >>>> 2) + (T2 >>>> 7) + (ts >> 32)
Future work:
It might be possible to construct a faster solution by merging the leftshift of (P1,P0)
with the modulo reduction.
*/
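//
// Illustrative sketch (not the CLMUL_* / MODREDUCE macros used below, which are defined
// in ghash_definitions.h): the schoolbook 128 x 128 -> 256 bit carryless multiplication
// described above, written with raw intrinsics. A1/A0 and B1/B0 are the high and low
// 64-bit halves of A and B.
//
//      #include <emmintrin.h>
//      #include <wmmintrin.h>      // _mm_clmulepi64_si128 (PCLMULQDQ)
//
//      // (pHi:pLo) := A * B as a 256-bit carryless product
//      static void clmul128( __m128i A, __m128i B, __m128i * pHi, __m128i * pLo )
//      {
//          __m128i a0b0 = _mm_clmulepi64_si128( A, B, 0x00 );   // A0 * B0
//          __m128i a1b1 = _mm_clmulepi64_si128( A, B, 0x11 );   // A1 * B1
//          __m128i a0b1 = _mm_clmulepi64_si128( A, B, 0x10 );   // A0 * B1
//          __m128i a1b0 = _mm_clmulepi64_si128( A, B, 0x01 );   // A1 * B0
//          __m128i mid  = _mm_xor_si128( a0b1, a1b0 );          // middle 128 bits
//
//          // Split the middle term over the upper and lower 128-bit halves
//          *pLo = _mm_xor_si128( a0b0, _mm_slli_si128( mid, 8 ) );
//          *pHi = _mm_xor_si128( a1b1, _mm_srli_si128( mid, 8 ) );
//      }
//
// The Karatsuba variant replaces a0b1 + a1b0 by a single multiplication of (A1 + A0) by
// (B1 + B0) followed by xorring in a1b1 and a0b0; the Hx values stored in the expanded
// key below precompute the (B1 + B0) half of that product.
//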
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SymCryptGHashExpandKeyPclmulqdq(
_Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey,
_In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH )
{
int i;
__m128i H, Hx, H2, H2x;
__m128i t0, t1, t2, t3, t4, t5;
__m128i Hi_even, Hix_even, Hi_odd, Hix_odd;
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
//
// Our expanded key consists of a list of N=SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS
// powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ...
//
// For each power we store two 128-bit values. The first is H^i (Hi) and the second
// contains the two halves of H^i xorred with each other in the lower 64 bits (Hix).
//
// We keep all of the Hi entries together in the first half of the expanded key
// table, and all of the Hix entries together in the second half of the table.
//
// This ordering allows for efficient vectorization with arbitrary vector width, as
// many multiplication constants can be loaded into wider vectors with the correct
// alignment. Not maintaining different layouts for different vector lengths does
// leave a small amount of performance on the table, but experimentally it seems to
// make less than a 1% difference, and using a single layout reduces complexity significantly.
//
C_ASSERT( 2*SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE );
H = _mm_loadu_si128((__m128i *) pH );
H = _mm_shuffle_epi8( H, BYTE_REVERSE_ORDER );
Hx = _mm_xor_si128( H, _mm_srli_si128( H, 8 ) );
_mm_store_si128( &GHASH_H_POWER(expandedKey, 1), H );
_mm_store_si128( &GHASH_Hx_POWER(expandedKey, 1), Hx );
CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 );
CLMUL_3_POST( t0, t1, t2 );
MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 );
H2x = _mm_xor_si128( H2, _mm_srli_si128( H2, 8 ) );
_mm_store_si128( &GHASH_H_POWER(expandedKey, 2), H2 );
_mm_store_si128( &GHASH_Hx_POWER(expandedKey, 2), H2x );
Hi_even = H2;
Hix_even = H2x;
for( i=2; i<SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS; i+=2 )
{
CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 );
CLMUL_3_POST( t0, t1, t2 );
CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 );
CLMUL_3_POST( t3, t4, t5 );
MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd );
MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even );
Hix_odd = _mm_xor_si128( Hi_odd, _mm_srli_si128( Hi_odd, 8 ) );
Hix_even = _mm_xor_si128( Hi_even, _mm_srli_si128( Hi_even, 8 ) );
_mm_store_si128( &GHASH_H_POWER(expandedKey, i + 1), Hi_odd );
_mm_store_si128( &GHASH_H_POWER(expandedKey, i + 2), Hi_even );
_mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 1), Hix_odd );
_mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 2), Hix_even );
}
}
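//
// The loop above fills the table two powers per iteration: with Hi_even holding H^i on
// entry, it computes H^(i+1) = H * H^i and H^(i+2) = H^2 * H^i. The two multiplications
// are independent of each other, which halves the length of the dependency chain compared
// to computing one power at a time.
//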
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataPclmulqdq(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
__m128i state;
__m128i data;
__m128i a0, a1, a2;
__m128i Hi, Hix;
SIZE_T i;
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
SIZE_T todo;
//
// To do a BSWAP we need an __m128i shuffle mask with the byte indices in reverse order
//
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
state = _mm_loadu_si128( (__m128i *) pState );
while( nBlocks > 0 )
{
//
// We process the data in blocks of up to SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS blocks
//
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
//
// The first block is xorred with the state before multiplying it with a power of H
//
data = _mm_loadu_si128( (__m128i *) pbData );
data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER );
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
state = _mm_xor_si128( state, data );
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
//
// Then we just accumulate the inner product of the remaining blocks with decreasing powers of H
//
for( i=1; i<todo; i++ )
{
data = _mm_loadu_si128( (__m128i *) pbData );
data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER );
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
Hi = _mm_load_si128( &GHASH_H_POWER(expandedKeyTable, todo - i) );
Hix = _mm_load_si128( &GHASH_Hx_POWER(expandedKeyTable, todo - i) );
CLMUL_ACC_3( data, Hi, Hix, a0, a1, a2 );
}
CLMUL_3_POST( a0, a1, a2 );
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
nBlocks -= todo;
}
_mm_storeu_si128((__m128i *)pState, state );
}
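//
// What one pass of the outer loop above computes for a chunk of 'todo' blocks d_1 ... d_todo
// (all values in the byte-reversed representation described above):
//
//      newState = (oldState + d_1) * H^todo  +  d_2 * H^(todo-1)  +  ...  +  d_todo * H
//
// which equals the result of 'todo' sequential GHASH steps
//
//      ( ... (( oldState + d_1 ) * H + d_2 ) * H + ... + d_todo ) * H
//
// but with the products accumulated as unreduced 256-bit values and only a single modulo
// reduction (MODREDUCE) at the end of the chunk.
//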
#endif // CPU_X86 || CPU_AMD64
#if SYMCRYPT_CPU_ARM64
VOID
SYMCRYPT_CALL
SymCryptGHashExpandKeyPmull(
_Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey,
_In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH )
{
int i;
__n128 H, Hx, H2, H2x;
__n128 t0, t1, t2, t3, t4, t5;
__n128 Hi_even, Hix_even, Hi_odd, Hix_odd;
const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
//
// Our expanded key consists of a list of N=SYMCRYPT_GHASH_PMULL_HPOWERS
// powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ...
//
// For each power we store two 128-bit values. The first is H^i (Hi) and the second
// contains the two halves of H^i xorred with each other in the lower 64 bits (Hix).
//
// We keep all of the Hi entries together in the first half of the expanded key
// table, and all of the Hix entries together in the second half of the table.
//
// This ordering allows for efficient vectorization with arbitrary vector width, as
// many multiplication constants can be loaded into wider vectors with the correct
// alignment. Not maintaining different layouts for different vector lengths does
// leave a small amount of performance on the table, but experimentally it seems to
// make less than a 1% difference, and using a single layout reduces complexity significantly.
//
C_ASSERT( 2*SYMCRYPT_GHASH_PMULL_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE );
H = *(__n128 *) pH;
Hx = vrev64q_u8( H );
H = vextq_u8( Hx, Hx, 8 );
Hx = veorq_u8( H, Hx );
GHASH_H_POWER(expandedKey, 1) = H;
GHASH_Hx_POWER(expandedKey, 1) = Hx;
CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 );
CLMUL_3_POST( t0, t1, t2 );
MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 );
H2x = veorq_u8( H2, vextq_u8( H2, H2, 8 ) );
GHASH_H_POWER(expandedKey, 2) = H2;
GHASH_Hx_POWER(expandedKey, 2) = H2x;
Hi_even = H2;
Hix_even = H2x;
for( i=2; i<SYMCRYPT_GHASH_PMULL_HPOWERS; i+=2 )
{
CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 );
CLMUL_3_POST( t0, t1, t2 );
CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 );
CLMUL_3_POST( t3, t4, t5 );
MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd );
MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even );
Hix_odd = veorq_u8( Hi_odd, vextq_u8( Hi_odd, Hi_odd, 8 ) );
Hix_even = veorq_u8( Hi_even, vextq_u8( Hi_even, Hi_even, 8 ) );
GHASH_H_POWER(expandedKey, i + 1) = Hi_odd;
GHASH_H_POWER(expandedKey, i + 2) = Hi_even;
GHASH_Hx_POWER(expandedKey, i + 1) = Hix_odd;
GHASH_Hx_POWER(expandedKey, i + 2) = Hix_even;
}
}
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataPmull(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
__n128 state;
__n128 data, datax;
__n128 a0, a1, a2;
__n128 Hi, Hix;
const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
SIZE_T i;
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
SIZE_T todo;
state = *(__n128 *) pState;
while( nBlocks > 0 )
{
//
// We process the data in blocks of up to SYMCRYPT_GHASH_PMULL_HPOWERS blocks
//
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
//
// The first block is xorred with the state before multiplying it with a power of H
//
data = *(__n128 *)pbData;
REVERSE_BYTES( data, data );
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
state = veorq_u8( state, data );
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
//
// Then we just accumulate the inner product of the remaining blocks with decreasing powers of H
//
for( i=1; i<todo; i++ )
{
// we can avoid an EXT here by precomputing datax for CLMUL_ACCX_3
datax = vrev64q_u8( *(__n128 *)pbData );
data = vextq_u8( datax, datax, 8 );
datax = veorq_u8( data, datax );
pbData += SYMCRYPT_GF128_BLOCK_SIZE;
Hi = GHASH_H_POWER(expandedKeyTable, todo - i);
Hix = GHASH_Hx_POWER(expandedKeyTable, todo - i);
CLMUL_ACCX_3( data, datax, Hi, Hix, a0, a1, a2 );
}
CLMUL_3_POST( a0, a1, a2 );
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
nBlocks -= todo;
}
*(__n128 *) pState = state;
}
#endif // CPU_ARM64
//////////////////////////////////////////////////////////////
// Stuff around the core algorithm implementation functions
//
VOID
SYMCRYPT_CALL
SymCryptGHashExpandKey(
_Out_ PSYMCRYPT_GHASH_EXPANDED_KEY expandedKey,
_In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH )
{
#if SYMCRYPT_CPU_X86
PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
SYMCRYPT_EXTENDED_SAVE_DATA SaveData;
//
// Initialize the offset into the table space for 16-byte alignment.
//
expandedKey->tableOffset = (0 -((UINT_PTR) &expandedKey->tableSpace[0])) % sizeof(SYMCRYPT_GF128_ELEMENT);
pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset];
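//
// Worked example (illustrative): if tableSpace happens to start at an address ending in
// 0x8, then tableOffset = (0 - 0x...8) % 16 == 8 and the table starts on the next 16-byte
// boundary; if the address is already 16-byte aligned the offset is 0.
//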
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
{
//
// We can only use the PCLMULQDQ data representation if the SaveXmm never fails.
// This is one of the CPU features required.
// We check anyway...
//
if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR )
{
SymCryptFatal( 'pclm' );
}
SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH );
SymCryptRestoreXmm( &SaveData );
} else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
{
SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH );
SymCryptRestoreXmm( &SaveData );
} else {
SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
}
#elif SYMCRYPT_CPU_AMD64
PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
pExpandedKeyTable = &expandedKey->table[0];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
{
SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH );
} else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) )
{
SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH );
} else {
SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
}
#elif SYMCRYPT_CPU_ARM64
PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
pExpandedKeyTable = &expandedKey->table[0];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) )
{
SymCryptGHashExpandKeyPmull( pExpandedKeyTable, pH );
} else {
SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
}
#else
SymCryptGHashExpandKeyC( &expandedKey->table[0], pH ); // Default expansion (does not need alignment)
#endif
}
VOID
SYMCRYPT_CALL
SymCryptGHashAppendData(
_In_ PCSYMCRYPT_GHASH_EXPANDED_KEY expandedKey,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
#if SYMCRYPT_CPU_X86
PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
SYMCRYPT_EXTENDED_SAVE_DATA SaveData;
pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
{
if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR )
{
SymCryptFatal( 'pclm' );
}
SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData );
SymCryptRestoreXmm( &SaveData );
} else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
{
SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData );
SymCryptRestoreXmm( &SaveData );
} else {
SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
}
#elif SYMCRYPT_CPU_AMD64
PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
pExpandedKeyTable = &expandedKey->table[0];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
{
SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData );
} else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) )
{
SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData );
} else {
SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
}
#elif SYMCRYPT_CPU_ARM
PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
pExpandedKeyTable = &expandedKey->table[0];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) )
{
SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData );
} else {
SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
}
#elif SYMCRYPT_CPU_ARM64
PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
pExpandedKeyTable = &expandedKey->table[0];
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) )
{
SymCryptGHashAppendDataPmull( pExpandedKeyTable, pState, pbData, cbData );
} else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) )
{
SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData );
} else {
SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
}
#else
SymCryptGHashAppendDataC( &expandedKey->table[0], pState, pbData, cbData );
#endif
}
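//
// Usage sketch (illustrative only): computing the GHASH of a buffer whose length is a
// multiple of SYMCRYPT_GF128_BLOCK_SIZE using the dispatching entry points above. The
// 16-byte hash subkey pH and the SP800-38D padding / length-block formatting are the
// caller's responsibility; GCM itself derives pH by encrypting the all-zero block.
//
//      SYMCRYPT_GHASH_EXPANDED_KEY key;
//      SYMCRYPT_GF128_ELEMENT      state;
//      BYTE                        tag[SYMCRYPT_GF128_BLOCK_SIZE];
//
//      SymCryptWipeKnownSize( &state, sizeof( state ) );   // GHASH starts from the zero element
//      SymCryptGHashExpandKey( &key, pH );
//      SymCryptGHashAppendData( &key, &state, pbData, cbData );
//      SymCryptGHashResult( &state, tag );
//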