static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
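    // The lookup tables are the same as in the SSSE3 decoder, duplicated
    // across both 128-bit lanes because _mm256_shuffle_epi8 shuffles within
    // each lane. Every input byte is classified by its low and high nibble:
    // lut_lo and lut_hi assign one bit per character class, and a byte can
    // be a valid Base64 character only if its two class bits do not
    // intersect.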
    const __m256i lut_lo = _mm256_setr_epi8(
        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

    const __m256i lut_hi = _mm256_setr_epi8(
        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

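    // Per-class deltas that turn ASCII into 6-bit values when added to the
    // input byte: '/' (47 + 16 = 63), '+' (43 + 19 = 62), digits (+4),
    // 'A'..'Z' (-65) and 'a'..'z' (-71). The table is indexed by high
    // nibble, with a correction for '/' applied below: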
    const __m256i lut_roll = _mm256_setr_epi8(
        0, 16, 19, 4, -65, -65, -71, -71,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 16, 19, 4, -65, -65, -71, -71,
        0, 0, 0, 0, 0, 0, 0, 0);

    const __m256i mask_2F = _mm256_set1_epi8(0x2F);

    // Load input:
    __m256i str = _mm256_loadu_si256((__m256i *) *s);

    // See the SSSE3 decoder for an explanation of the algorithm.
    const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
    const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
    const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
    const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

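    // For valid input the class bits are disjoint, so the bitwise AND of
    // the two classifications is zero across all 32 bytes. A nonzero
    // intersection flags at least one invalid byte; give up and let the
    // caller deal with it: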
    if (!_mm256_testz_si256(lo, hi)) {
        return 0;
    }

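    // '+' (0x2B) and '/' (0x2F) share high nibble 2 and would pick the same
    // roll delta. eq_2F is all-ones (-1) exactly for '/', nudging its table
    // index from 2 down to 1 so that '/' gets its own delta: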
    const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
    const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

    // Now simply add the delta values to the input:
    str = _mm256_add_epi8(str, roll);

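    // Each byte now holds a 6-bit value in its low bits. dec_reshuffle
    // below packs each group of four values (a, b, c, d) into three output
    // bytes: (a << 2 | b >> 4), (b << 4 | c >> 2), (c << 6 | d).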
    // Reshuffle each 128-bit lane into packed 12-byte form, for 24 bytes
    // of output in total:
    str = dec_reshuffle(str);

    // Store the output. Note that a full 32 bytes are written, of which
    // only the first 24 are valid output:
    _mm256_storeu_si256((__m256i *) *o, str);

    *s += 32;
    *o += 24;
    *rounds -= 1;

    return 1;
}
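
// For reference, a scalar sketch of what the vector code above computes per
// byte: classify via the nibble LUTs, reject when the class bits intersect,
// then add the per-range delta. Illustrative only; this helper and its name
// are hypothetical and not part of the library.
static inline int
dec_single_byte_sketch (const uint8_t c, uint8_t *out)
{
    static const uint8_t lut_lo[16] = {
        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A };
    static const uint8_t lut_hi[16] = {
        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 };
    static const int8_t lut_roll[16] = {
        0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0 };

    const uint8_t hi_nibble = c >> 4;

    // Invalid byte: its low- and high-nibble class bits overlap.
    if (lut_lo[c & 0x0F] & lut_hi[hi_nibble]) {
        return 0;
    }

    // '/' shares high nibble 2 with '+'; use roll index 1 for it instead:
    *out = (uint8_t) (c + lut_roll[(c == 0x2F) ? 1 : hi_nibble]);
    return 1;
}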

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
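    // 45 is the smallest input length for which the division below yields
    // at least one full round: (45 - 13) / 32 == 1.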
    if (*slen < 45) {
        return;
    }

    // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
    // written after the output, ensure that at least 13 bytes of input
    // remain for the tail decoder to cover the gap: 11 data bytes, which
    // decode to the 8 bytes that overwrite it, plus up to two end-of-string
    // markers.
    size_t rounds = (*slen - 13) / 32;

    *slen -= rounds * 32;   // 32 bytes consumed per round
    *olen += rounds * 24;   // 24 bytes produced per round

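    // The loop is manually unrolled by factors of 8, 4 and 2. Each inner
    // call returns 0 on invalid input, which short-circuits the &&-chain;
    // the break then exits with `rounds` still counting the unprocessed
    // blocks: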
    do {
        if (rounds >= 8) {
            if (dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        if (rounds >= 4) {
            if (dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        if (rounds >= 2) {
            if (dec_loop_avx2_inner(s, o, &rounds) &&
                dec_loop_avx2_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        dec_loop_avx2_inner(s, o, &rounds);
        break;

    } while (rounds > 0);

    // Adjust the counts for any rounds that were skipped after the inner
    // loop hit invalid input:
    *slen += rounds * 32;
    *olen -= rounds * 24;
}
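
// Usage sketch: how a caller might drive this fast path. Illustrative only;
// the function below and the scalar tail decoder it mentions are hypothetical
// and not part of this file's API.
static inline size_t
dec_avx2_usage_sketch (const uint8_t *src, size_t srclen, uint8_t *out)
{
    const uint8_t *s = src;
    uint8_t *o = out;
    size_t slen = srclen;
    size_t olen = 0;

    // Fast path: consumes whole 32-byte blocks while the input is valid.
    // Afterwards, s and slen describe the unconsumed tail:
    dec_loop_avx2(&s, &slen, &o, &olen);

    // A scalar tail decoder would finish the remaining slen bytes here,
    // overwriting the 8 garbage bytes noted above, e.g.:
    // dec_loop_generic(&s, &slen, &o, &olen);   // hypothetical

    return olen;
}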