|
11 | 11 | #include <stdint.h> |
12 | 12 | #include <immintrin.h> |
13 | 13 |
|
14 | | - |
15 | | - |
16 | 14 | namespace { |
17 | 15 |
|
18 | | -const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); |
19 | | -const __m128i INIT0 = _mm_set_epi64x(0x6a09e667bb67ae85ull, 0x510e527f9b05688cull); |
20 | | -const __m128i INIT1 = _mm_set_epi64x(0x3c6ef372a54ff53aull, 0x1f83d9ab5be0cd19ull); |
| 16 | +alignas(__m128i) const uint8_t MASK[16] = {0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c}; |
| 17 | +alignas(__m128i) const uint8_t INIT0[16] = {0x8c, 0x68, 0x05, 0x9b, 0x7f, 0x52, 0x0e, 0x51, 0x85, 0xae, 0x67, 0xbb, 0x67, 0xe6, 0x09, 0x6a}; |
| 18 | +alignas(__m128i) const uint8_t INIT1[16] = {0x19, 0xcd, 0xe0, 0x5b, 0xab, 0xd9, 0x83, 0x1f, 0x3a, 0xf5, 0x4f, 0xa5, 0x72, 0xf3, 0x6e, 0x3c}; |
21 | 19 |
|
22 | 20 | void inline __attribute__((always_inline)) QuadRound(__m128i& state0, __m128i& state1, uint64_t k1, uint64_t k0) |
23 | 21 | { |
@@ -67,12 +65,12 @@ void inline __attribute__((always_inline)) Unshuffle(__m128i& s0, __m128i& s1) |
67 | 65 |
|
68 | 66 | __m128i inline __attribute__((always_inline)) Load(const unsigned char* in) |
69 | 67 | { |
70 | | - return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)in), MASK); |
| 68 | + return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)in), _mm_load_si128((const __m128i*)MASK)); |
71 | 69 | } |
72 | 70 |
|
73 | 71 | void inline __attribute__((always_inline)) Save(unsigned char* out, __m128i s) |
74 | 72 | { |
75 | | - _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(s, MASK)); |
| 73 | + _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(s, _mm_load_si128((const __m128i*)MASK))); |
76 | 74 | } |
77 | 75 | } |
78 | 76 |
|
@@ -149,8 +147,8 @@ void Transform_2way(unsigned char* out, const unsigned char* in) |
149 | 147 | __m128i bm0, bm1, bm2, bm3, bs0, bs1, bso0, bso1; |
150 | 148 |
|
151 | 149 | /* Transform 1 */ |
152 | | - bs0 = as0 = INIT0; |
153 | | - bs1 = as1 = INIT1; |
| 150 | + bs0 = as0 = _mm_load_si128((const __m128i*)INIT0); |
| 151 | + bs1 = as1 = _mm_load_si128((const __m128i*)INIT1); |
154 | 152 | am0 = Load(in); |
155 | 153 | bm0 = Load(in + 64); |
156 | 154 | QuadRound(as0, as1, am0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull); |
@@ -219,10 +217,10 @@ void Transform_2way(unsigned char* out, const unsigned char* in) |
219 | 217 | ShiftMessageC(bm1, bm2, bm3); |
220 | 218 | QuadRound(as0, as1, am3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull); |
221 | 219 | QuadRound(bs0, bs1, bm3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull); |
222 | | - as0 = _mm_add_epi32(as0, INIT0); |
223 | | - bs0 = _mm_add_epi32(bs0, INIT0); |
224 | | - as1 = _mm_add_epi32(as1, INIT1); |
225 | | - bs1 = _mm_add_epi32(bs1, INIT1); |
| 220 | + as0 = _mm_add_epi32(as0, _mm_load_si128((const __m128i*)INIT0)); |
| 221 | + bs0 = _mm_add_epi32(bs0, _mm_load_si128((const __m128i*)INIT0)); |
| 222 | + as1 = _mm_add_epi32(as1, _mm_load_si128((const __m128i*)INIT1)); |
| 223 | + bs1 = _mm_add_epi32(bs1, _mm_load_si128((const __m128i*)INIT1)); |
226 | 224 |
|
227 | 225 | /* Transform 2 */ |
228 | 226 | aso0 = as0; |
@@ -275,8 +273,8 @@ void Transform_2way(unsigned char* out, const unsigned char* in) |
275 | 273 | bm1 = bs1; |
276 | 274 |
|
277 | 275 | /* Transform 3 */ |
278 | | - bs0 = as0 = INIT0; |
279 | | - bs1 = as1 = INIT1; |
| 276 | + bs0 = as0 = _mm_load_si128((const __m128i*)INIT0); |
| 277 | + bs1 = as1 = _mm_load_si128((const __m128i*)INIT1); |
280 | 278 | QuadRound(as0, as1, am0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull); |
281 | 279 | QuadRound(bs0, bs1, bm0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull); |
282 | 280 | QuadRound(as0, as1, am1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull); |
@@ -339,10 +337,10 @@ void Transform_2way(unsigned char* out, const unsigned char* in) |
339 | 337 | ShiftMessageC(bm1, bm2, bm3); |
340 | 338 | QuadRound(as0, as1, am3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull); |
341 | 339 | QuadRound(bs0, bs1, bm3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull); |
342 | | - as0 = _mm_add_epi32(as0, INIT0); |
343 | | - bs0 = _mm_add_epi32(bs0, INIT0); |
344 | | - as1 = _mm_add_epi32(as1, INIT1); |
345 | | - bs1 = _mm_add_epi32(bs1, INIT1); |
| 340 | + as0 = _mm_add_epi32(as0, _mm_load_si128((const __m128i*)INIT0)); |
| 341 | + bs0 = _mm_add_epi32(bs0, _mm_load_si128((const __m128i*)INIT0)); |
| 342 | + as1 = _mm_add_epi32(as1, _mm_load_si128((const __m128i*)INIT1)); |
| 343 | + bs1 = _mm_add_epi32(bs1, _mm_load_si128((const __m128i*)INIT1)); |
346 | 344 |
|
347 | 345 | /* Extract hash into out */ |
348 | 346 | Unshuffle(as0, as1); |
|
0 commit comments