static inline void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
	// The first load is done at s - 0, not at the usual s - 4 offset, so
	// that the read cannot underflow the input buffer:
	__m256i src = _mm256_loadu_si256((__m256i *) *s);

	// Shift by 4 bytes, as required by enc_reshuffle:
	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
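	// The permute copies dword i - 1 into dword i (duplicating dword 0),
	// so the 24 payload bytes now sit at register bytes 4..27, the layout
	// enc_reshuffle expects; bytes 0..3 and 28..31 are ignored.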

	// Reshuffle, translate, store:
	src = enc_reshuffle(src);
	src = enc_translate(src);
	_mm256_storeu_si256((__m256i *) *o, src);

	// Subsequent loads will be done at an effective s - 4: this round
	// consumed 24 bytes, but advance the pointer by only 20 so that the
	// next 32-byte load starts 4 bytes early:
	*s += 20;
	*o += 32;
}

static inline void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
	// Load input. The pointer already carries the -4 byte offset set up
	// by the first round, so the 24 payload bytes land at register bytes
	// 4..27, as enc_reshuffle requires:
	__m256i src = _mm256_loadu_si256((__m256i *) *s);

	// Reshuffle, translate, store:
	src = enc_reshuffle(src);
	src = enc_translate(src);
	_mm256_storeu_si256((__m256i *) *o, src);

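	// Base64 expands every 3 input bytes into 4 output characters, so a
	// 24-byte round produces 32 bytes of output: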
	*s += 24;
	*o += 32;
}

static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bail out if the input is too short for even one full 32-byte load:
	if (*slen < 32) {
		return;
	}

	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
	// bytes at a time at an offset of -4, ensure that at least 4 bytes
	// will remain after the last round, so that the final read does not
	// pass beyond the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 24;

	*slen -= rounds * 24;	// 24 bytes consumed per round
	*olen += rounds * 32;	// 32 bytes produced per round
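
	// Worked example (hypothetical size): with *slen == 100, rounds =
	// (100 - 4) / 24 = 4; 96 input bytes are consumed, 128 output bytes
	// are produced, and 4 bytes remain for the caller's scalar tail.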

	// The first loop iteration requires special handling to ensure that
	// the read, which is done at an offset, does not underflow the buffer:
	enc_loop_avx2_inner_first(s, o);
	rounds--;

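	// Dispatch on the remaining round count, unrolling by 8, 4 and 2 to
	// amortize the loop overhead across several encode rounds: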
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_avx2_inner(s, o);
		break;
	}

	// Add the -4 offset back, so that *s again points at the first
	// unprocessed input byte:
	*s += 4;
}
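
// A minimal usage sketch (hypothetical caller, not part of this file; the
// scalar tail encoder is assumed to exist elsewhere):
//
//	const uint8_t *src  = input;
//	uint8_t       *out  = output;
//	size_t         slen = input_length;
//	size_t         olen = 0;
//
//	enc_loop_avx2(&src, &slen, &out, &olen);
//
//	// At most 31 bytes remain at src; encode them with the scalar
//	// routine and add its output length to olen.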