11#if defined(__AVX512VPOPCNTDQ__) && defined(__AVX512F__) && \
13#define PIXIE_AVX512_SUPPORT
17#define PIXIE_AVX2_SUPPORT
21static inline const __m256i lookup_popcount_4 = _mm256_setr_epi8(
34static inline const __m256i mask_first_half = _mm256_setr_epi8(
35 0xFF, 0xFF, 0xFF, 0xFF,
36 0xFF, 0xFF, 0xFF, 0xFF,
37 0xFF, 0xFF, 0xFF, 0xFF,
38 0xFF, 0xFF, 0xFF, 0xFF,
62static inline uint32_t rmm_btree_match_mask_i16x16(
const int16_t* prefix_before,
63 const int16_t* min_excess,
64 const int16_t* max_excess,
66 bool include_zero_boundary) {
67#ifdef PIXIE_AVX2_SUPPORT
68 const __m256i vtarget = _mm256_set1_epi16(target);
69 const __m256i vprefix =
70 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(prefix_before));
72 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(min_excess));
74 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(max_excess));
76 const __m256i lower = _mm256_adds_epi16(vprefix, vmin);
77 const __m256i upper = _mm256_adds_epi16(vprefix, vmax);
78 const __m256i ge_lower = _mm256_or_si256(_mm256_cmpgt_epi16(vtarget, lower),
79 _mm256_cmpeq_epi16(vtarget, lower));
80 const __m256i le_upper = _mm256_or_si256(_mm256_cmpgt_epi16(upper, vtarget),
81 _mm256_cmpeq_epi16(upper, vtarget));
82 __m256i matched = _mm256_and_si256(ge_lower, le_upper);
83 if (include_zero_boundary) {
84 matched = _mm256_or_si256(matched, _mm256_cmpeq_epi16(vtarget, vprefix));
87 const uint32_t byte_mask =
88 static_cast<uint32_t
>(_mm256_movemask_epi8(matched));
90 for (
size_t lane = 0; lane < 16; ++lane) {
91 const uint32_t lane_mask = 0x3u << (lane * 2);
92 if ((byte_mask & lane_mask) == lane_mask) {
93 result |= uint32_t{1} << lane;
99 for (
size_t lane = 0; lane < 16; ++lane) {
100 const int lower = prefix_before[lane] + min_excess[lane];
101 const int upper = prefix_before[lane] + max_excess[lane];
102 const bool found = (lower <= target && target <= upper) ||
103 (include_zero_boundary && target == prefix_before[lane]);
105 result |= uint32_t{1} << lane;
126static inline uint32_t rmm_btree_match_mask_i64x4(
const int64_t* prefix_before,
127 const int64_t* min_excess,
128 const int64_t* max_excess,
130 bool include_zero_boundary) {
131#ifdef PIXIE_AVX2_SUPPORT
132 const __m256i vtarget = _mm256_set1_epi64x(target);
133 const __m256i vprefix =
134 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(prefix_before));
136 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(min_excess));
138 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(max_excess));
140 const __m256i relative = _mm256_sub_epi64(vtarget, vprefix);
141 const __m256i ge_min = _mm256_or_si256(_mm256_cmpgt_epi64(relative, vmin),
142 _mm256_cmpeq_epi64(relative, vmin));
143 const __m256i le_max = _mm256_or_si256(_mm256_cmpgt_epi64(vmax, relative),
144 _mm256_cmpeq_epi64(vmax, relative));
145 __m256i matched = _mm256_and_si256(ge_min, le_max);
146 if (include_zero_boundary) {
147 matched = _mm256_or_si256(matched, _mm256_cmpeq_epi64(vtarget, vprefix));
150 const uint32_t byte_mask =
151 static_cast<uint32_t
>(_mm256_movemask_epi8(matched));
153 for (
size_t lane = 0; lane < 4; ++lane) {
154 const uint32_t lane_mask = 0xffu << (lane * 8);
155 if ((byte_mask & lane_mask) == lane_mask) {
156 result |= uint32_t{1} << lane;
162 for (
size_t lane = 0; lane < 4; ++lane) {
163 const int64_t relative = target - prefix_before[lane];
165 (min_excess[lane] <= relative && relative <= max_excess[lane]) ||
166 (include_zero_boundary && relative == 0);
168 result |= uint32_t{1} << lane;
182static inline uint64_t first_bits_mask(
size_t num) {
183 return num >= 64 ? UINT64_MAX : ((1llu << num) - 1);
206static inline uint64_t rank_512(
const uint64_t* x, uint64_t count) {
207#ifdef PIXIE_AVX512_SUPPORT
209 __m512i a = _mm512_maskz_set1_epi64((1ull << ((count >> 6))) - 1,
210 std::numeric_limits<uint64_t>::max());
211 __m512i b = _mm512_maskz_set1_epi64((1ull << ((count >> 6) + 1)) - 1,
212 std::numeric_limits<uint64_t>::max());
213 __m512i mask = _mm512_shldv_epi64(a, b, _mm512_set1_epi64(count % 64));
215 __m512i res = _mm512_loadu_epi64(x);
216 res = _mm512_and_epi64(res, mask);
217 __m512i cnt = _mm512_popcnt_epi64(res);
218 return _mm512_reduce_add_epi64(cnt);
222 uint64_t last_uint = count < 512 ? count >> 6 : 8;
224 uint64_t pop_val = 0;
226 for (
int i = 0; i < last_uint; i++) {
227 pop_val += std::popcount(x[i]);
230 pop_val += count < 512
231 ? std::popcount(x[last_uint] & first_bits_mask(count & 63))
241static inline uint64_t select_64(uint64_t x, uint64_t rank) {
242 return _tzcnt_u64(_pdep_u64(1ull << rank, x));
262static inline uint64_t select_512(
const uint64_t* x, uint64_t rank) {
263#ifdef PIXIE_AVX512_SUPPORT
265 __m512i res = _mm512_loadu_epi64(x);
266 __m512i counts = _mm512_popcnt_epi64(res);
267 __m512i prefix = counts;
269 const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0);
270 const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0);
271 const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0);
273 __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix);
274 prefix = _mm512_add_epi64(prefix, tmp);
275 tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix);
276 prefix = _mm512_add_epi64(prefix, tmp);
277 tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix);
278 prefix = _mm512_add_epi64(prefix, tmp);
280 __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank));
281 uint32_t i = _tzcnt_u32(
static_cast<uint32_t
>(mask));
284 __m512i idx_prev = _mm512_set1_epi64(
static_cast<int64_t
>(i - 1));
285 __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix);
286 prev =
static_cast<uint64_t
>(
287 _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec)));
289 return i * 64 + select_64(x[i], rank - prev);
294 int popcount = std::popcount(x[0]);
295 while (i < 7 && popcount <= rank) {
297 popcount = std::popcount(x[++i]);
299 return i * 64 + select_64(x[i], rank);
308static inline uint64_t select0_512(
const uint64_t* x, uint64_t rank0) {
309#ifdef PIXIE_AVX512_SUPPORT
311 __m512i res = _mm512_loadu_epi64(x);
312 res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1));
313 __m512i counts = _mm512_popcnt_epi64(res);
314 __m512i prefix = counts;
316 const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0);
317 const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0);
318 const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0);
320 __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix);
321 prefix = _mm512_add_epi64(prefix, tmp);
322 tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix);
323 prefix = _mm512_add_epi64(prefix, tmp);
324 tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix);
325 prefix = _mm512_add_epi64(prefix, tmp);
327 __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank0));
328 uint32_t i = _tzcnt_u32(
static_cast<uint32_t
>(mask));
331 __m512i idx_prev = _mm512_set1_epi64(
static_cast<int64_t
>(i - 1));
332 __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix);
333 prev =
static_cast<uint64_t
>(
334 _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec)));
336 return i * 64 + select_64(~x[i], rank0 - prev);
341 int popcount = std::popcount(~x[0]);
342 while (i < 7 && popcount <= rank0) {
344 popcount = std::popcount(~x[++i]);
346 return i * 64 + select_64(~x[i], rank0);
355static inline uint16_t lower_bound_4x64(
const uint64_t* x, uint64_t y) {
356#ifdef PIXIE_AVX512_SUPPORT
358 auto y_4 = _mm256_set1_epi64x(y);
359 auto reg_256 = _mm256_loadu_epi64(x);
360 auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
362 return _tzcnt_u16(cmp);
365#ifdef PIXIE_AVX2_SUPPORT
367 auto y_4 = _mm256_set1_epi64x(y);
368 __m256i reg_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x));
370 const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
371 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
372 __m256i y_offset = _mm256_xor_si256(y_4, offset);
373 auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
374 x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
376 return _tzcnt_u32(mask) >> 3;
380 for (uint16_t i = 0; i < 4; ++i) {
404static inline uint16_t lower_bound_delta_4x64(
const uint64_t* x,
406 const uint64_t* delta_array,
407 uint64_t delta_scalar) {
408#ifdef PIXIE_AVX512_SUPPORT
410 const __m256i dlt_256 = _mm256_loadu_epi64(delta_array);
411 auto x_256 = _mm256_loadu_epi64(x);
412 auto dlt_4 = _mm256_set1_epi64x(delta_scalar);
413 auto y_4 = _mm256_set1_epi64x(y);
415 auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
416 auto reg_256 = _mm256_sub_epi64(tmp, x_256);
417 auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
419 return _tzcnt_u16(cmp);
422#ifdef PIXIE_AVX2_SUPPORT
424 const __m256i dlt_256 =
425 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(delta_array));
426 auto x_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x));
427 auto dlt_4 = _mm256_set1_epi64x(delta_scalar);
428 auto y_4 = _mm256_set1_epi64x(y);
430 auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
431 auto reg_256 = _mm256_sub_epi64(tmp, x_256);
433 const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
434 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
435 __m256i y_offset = _mm256_xor_si256(y_4, offset);
436 auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
437 x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
439 return _tzcnt_u32(mask) >> 3;
443 for (uint16_t i = 0; i < 4; ++i) {
444 if (delta_array[i] + delta_scalar - x[i] >= y) {
458static inline uint16_t lower_bound_8x64(
const uint64_t* x, uint64_t y) {
459#ifdef PIXIE_AVX512_SUPPORT
461 auto y_8 = _mm512_set1_epi64(y);
462 auto reg_512 = _mm512_loadu_epi64(x);
463 auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
465 return _tzcnt_u16(cmp);
468#ifdef PIXIE_AVX2_SUPPORT
470 uint16_t len = lower_bound_4x64(x, y);
476 return len + lower_bound_4x64(x + 4, y);
480 for (uint16_t i = 0; i < 8; ++i) {
504static inline uint16_t lower_bound_delta_8x64(
const uint64_t* x,
506 const uint64_t* delta_array,
507 uint64_t delta_scalar) {
508#ifdef PIXIE_AVX512_SUPPORT
510 const __m512i dlt_512 = _mm512_loadu_epi64(delta_array);
511 auto x_512 = _mm512_loadu_epi64(x);
512 auto dlt_8 = _mm512_set1_epi64(delta_scalar);
513 auto y_8 = _mm512_set1_epi64(y);
515 auto tmp = _mm512_add_epi64(dlt_8, dlt_512);
516 auto reg_512 = _mm512_sub_epi64(tmp, x_512);
517 auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
519 return _tzcnt_u16(cmp);
522#ifdef PIXIE_AVX2_SUPPORT
524 uint16_t len = lower_bound_delta_4x64(x, y, delta_array, delta_scalar);
530 return len + lower_bound_delta_4x64(x + 4, y, delta_array + 4, delta_scalar);
534 for (uint16_t i = 0; i < 8; ++i) {
535 if (delta_array[i] + delta_scalar - x[i] >= y) {
549static inline uint16_t lower_bound_32x16(
const uint16_t* x, uint16_t y) {
550#ifdef PIXIE_AVX512_SUPPORT
552 auto y_32 = _mm512_set1_epi16(y);
553 auto reg_512 = _mm512_loadu_epi16(x);
554 auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
555 return std::popcount(cmp);
558#ifdef PIXIE_AVX2_SUPPORT
560 auto y_16 = _mm256_set1_epi16(y);
561 __m256i reg_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x));
563 const __m256i offset = _mm256_set1_epi16(0x8000);
564 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
565 __m256i y_offset = _mm256_xor_si256(y_16, offset);
566 uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
568 uint16_t count = std::popcount(mask) >> 1;
570 reg_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x + 16));
572 x_offset = _mm256_xor_si256(reg_256, offset);
573 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
575 return count + (std::popcount(mask) >> 1);
580 for (uint16_t i = 0; i < 32; ++i) {
604static inline uint16_t lower_bound_delta_32x16(
const uint16_t* x,
606 const uint16_t* delta_array,
607 uint16_t delta_scalar) {
608#ifdef PIXIE_AVX512_SUPPORT
610 const __m512i dlt_512 = _mm512_loadu_epi64(delta_array);
611 auto x_512 = _mm512_loadu_epi64(x);
612 auto dlt_32 = _mm512_set1_epi16(delta_scalar);
613 auto y_32 = _mm512_set1_epi16(y);
615 auto tmp = _mm512_add_epi16(dlt_32, dlt_512);
616 auto reg_512 = _mm512_sub_epi16(tmp, x_512);
617 auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
618 return std::popcount(cmp);
621#ifdef PIXIE_AVX2_SUPPORT
624 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(delta_array));
625 auto x_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x));
626 auto dlt_16 = _mm256_set1_epi16(delta_scalar);
627 auto y_16 = _mm256_set1_epi16(y);
629 auto tmp = _mm256_add_epi16(dlt_16, dlt_256);
630 auto reg_256 = _mm256_sub_epi16(tmp, x_256);
632 const __m256i offset = _mm256_set1_epi16(0x8000);
633 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
634 __m256i y_offset = _mm256_xor_si256(y_16, offset);
635 uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
637 uint16_t count = std::popcount(mask) >> 1;
640 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(delta_array + 16));
641 x_256 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(x + 16));
643 tmp = _mm256_add_epi16(dlt_16, dlt_256);
644 reg_256 = _mm256_sub_epi16(tmp, x_256);
646 x_offset = _mm256_xor_si256(reg_256, offset);
647 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
649 return count + (std::popcount(mask) >> 1);
654 for (uint16_t i = 0; i < 32; ++i) {
655 if (delta_array[i] + delta_scalar - x[i] < y) {
676static inline void popcount_64x4(
const uint8_t* x, uint8_t* result) {
677#ifdef PIXIE_AVX512_SUPPORT
678 __m256i data = _mm256_loadu_si256((__m256i
const*)x);
681 const __m256i low_bits_mask = _mm256_set1_epi8(0x0F);
684 __m256i low_bits = _mm256_and_si256(data, low_bits_mask);
685 __m256i low_count = _mm256_shuffle_epi8(lookup_popcount_4, low_bits);
688 __m256i high_bits = _mm256_srli_epi16(data, 4);
689 high_bits = _mm256_and_si256(high_bits, low_bits_mask);
690 __m256i high_count = _mm256_shuffle_epi8(lookup_popcount_4, high_bits);
694 _mm256_or_si256(low_count, _mm256_slli_epi16(high_count, 4));
695 _mm256_storeu_epi8(result, result_vec);
698 for (
size_t i = 0; i < 32; i++) {
700 uint8_t a = x[i] & 0x0F;
701 uint8_t low_count = std::popcount(a);
703 a = (x[i] >> 4) & 0x0F;
704 uint8_t high_count = std::popcount(a);
707 result[i] = low_count | (high_count << 4);
723static inline void popcount_32x8(
const uint8_t* x, uint8_t* result) {
724#ifdef PIXIE_AVX512_SUPPORT
726 __m256i data = _mm256_loadu_si256((__m256i
const*)x);
727 auto popcount_8 = _mm256_popcnt_epi8(data);
728 _mm256_storeu_si256((__m256i*)result, popcount_8);
730#ifdef PIXIE_AVX2_SUPPORT
732 __m256i data = _mm256_loadu_si256((__m256i
const*)x);
735 const __m256i low_bits_mask = _mm256_set1_epi8(0x0F);
738 __m256i low_bits = _mm256_and_si256(data, low_bits_mask);
739 __m256i low_count = _mm256_shuffle_epi8(lookup_popcount_4, low_bits);
742 __m256i high_bits = _mm256_srli_epi16(data, 4);
743 high_bits = _mm256_and_si256(high_bits, low_bits_mask);
744 __m256i high_count = _mm256_shuffle_epi8(lookup_popcount_4, high_bits);
746 __m256i result_vec = _mm256_add_epi8(low_count, high_count);
747 _mm256_storeu_si256((__m256i*)result, result_vec);
750 for (
size_t i = 0; i < 32; i++) {
751 result[i] = std::popcount(x[i]);
757#ifdef PIXIE_AVX2_SUPPORT
760static inline const __m256i excess_lut_delta = _mm256_setr_epi8(
771static inline const __m256i excess_lut_pos0 = _mm256_setr_epi8(
781static inline const __m256i excess_lut_pos1 = _mm256_setr_epi8(
791static inline const __m256i excess_lut_pos2 = _mm256_setr_epi8(
800static inline const __m256i excess_lut_pack_multiplier =
801 _mm256_set1_epi16(0x1001);
802static inline const __m256i excess_lut_bit0 = _mm256_set1_epi8(1);
803static inline const __m256i excess_lut_bit1 = _mm256_set1_epi8(2);
804static inline const __m256i excess_lut_bit2 = _mm256_set1_epi8(4);
805static inline const __m256i excess_lut_bit3 = _mm256_set1_epi8(8);
806static inline const __m128i excess_lut_nibble_mask = _mm_set1_epi8(0x0F);
823static inline int excess_positions_128(
const uint64_t* s,
825 uint64_t* out)
noexcept {
827 const int block_delta = 2 * (std::popcount(s[0]) + std::popcount(s[1])) - 128;
829 if (target_x < -128 || target_x > 128) {
833#ifdef PIXIE_AVX2_SUPPORT
834 const __m256i vdelta = excess_lut_delta;
835 const __m256i vpos0 = excess_lut_pos0;
836 const __m256i vpos1 = excess_lut_pos1;
837 const __m256i vpos2 = excess_lut_pos2;
838 const __m256i vmult = excess_lut_pack_multiplier;
839 const __m256i vbit0 = excess_lut_bit0;
840 const __m256i vbit1 = excess_lut_bit1;
841 const __m256i vbit2 = excess_lut_bit2;
842 const __m256i vbit3 = excess_lut_bit3;
843 const __m128i vnibble_mask = excess_lut_nibble_mask;
845 const int d = 2 * target_x - block_delta;
846 if (d < -128 || d > 128) {
850 __m128i word_vec = _mm_loadu_si128((
const __m128i*)s);
851 __m128i lo_nibbles = _mm_and_si128(word_vec, vnibble_mask);
852 __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word_vec, 4), vnibble_mask);
854 __m128i unpack_lo = _mm_unpacklo_epi8(lo_nibbles, hi_nibbles);
855 __m128i unpack_hi = _mm_unpackhi_epi8(lo_nibbles, hi_nibbles);
858 _mm256_inserti128_si256(_mm256_castsi128_si256(unpack_lo), unpack_hi, 1);
860 __m256i ps = _mm256_shuffle_epi8(vdelta, nibbles);
861 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 1));
862 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 2));
863 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 4));
864 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 8));
866 __m128i ps_lo = _mm256_castsi256_si128(ps);
867 __m128i ps_hi = _mm256_extracti128_si256(ps, 1);
868 __m128i carry = _mm_set1_epi8((int8_t)_mm_extract_epi8(ps_lo, 15));
869 ps_hi = _mm_add_epi8(ps_hi, carry);
870 ps = _mm256_inserti128_si256(_mm256_castsi128_si256(ps_lo), ps_hi, 1);
872 __m256i b = _mm256_permute2x128_si256(ps, ps, 0x08);
873 __m256i excl_ps = _mm256_alignr_epi8(ps, b, 15);
875 __m256i vtgt = _mm256_set1_epi8((int8_t)target_x);
876 __m256i t = _mm256_sub_epi8(vtgt, excl_ps);
878 __m256i cmp0 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos0, nibbles), t);
879 __m256i cmp1 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos1, nibbles), t);
880 __m256i cmp2 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos2, nibbles), t);
881 __m256i cmp3 = _mm256_cmpeq_epi8(ps, vtgt);
883 __m256i bit0 = _mm256_and_si256(cmp0, vbit0);
884 __m256i bit1 = _mm256_and_si256(cmp1, vbit1);
885 __m256i bit2 = _mm256_and_si256(cmp2, vbit2);
886 __m256i bit3 = _mm256_and_si256(cmp3, vbit3);
888 __m256i total_match =
889 _mm256_or_si256(_mm256_or_si256(bit0, bit1), _mm256_or_si256(bit2, bit3));
891 __m256i res = _mm256_maddubs_epi16(total_match, vmult);
892 __m128i res_lo = _mm256_castsi256_si128(res);
893 __m128i res_hi = _mm256_extracti128_si256(res, 1);
894 __m128i packed = _mm_packus_epi16(res_lo, res_hi);
896 _mm_storeu_si128((__m128i*)out, packed);
899 for (
size_t i = 0; i < 128; ++i) {
900 const uint64_t w = s[i >> 6];
901 const int bit = int((w >> (i & 63)) & 1ull);
902 cur += bit ? +1 : -1;
903 if (cur == target_x) {
904 out[i >> 6] |= (uint64_t{1} << (i & 63));
920static inline int prefix_excess_128(
const uint64_t* s,
921 size_t end_offset)
noexcept {
922 end_offset = end_offset > 128 ? 128 : end_offset;
923 if (end_offset == 0) {
926 if (end_offset <= 64) {
927 const int ones =
static_cast<int>(std::popcount(
928 s[0] & first_bits_mask(
static_cast<uint32_t
>(end_offset))));
929 return 2 * ones -
static_cast<int>(end_offset);
931 const int ones =
static_cast<int>(
932 std::popcount(s[0]) +
934 first_bits_mask(
static_cast<uint32_t
>(end_offset - 64))));
935 return 2 * ones -
static_cast<int>(end_offset);
953static inline size_t forward_search_128(
const uint64_t* s,
956 int* block_excess =
nullptr) noexcept {
958 const int delta = excess_positions_128(s, target_x, out);
959 if (block_excess !=
nullptr) {
960 *block_excess = delta;
962 if (start_offset >= 128) {
966 const size_t first_word = start_offset >> 6;
967 const size_t first_bit = start_offset & 63;
968 for (
size_t word = first_word; word < 2; ++word) {
969 uint64_t mask = out[word];
970 if (word == first_word && first_bit != 0) {
971 mask &= ~first_bits_mask(first_bit);
974 return word * 64 + std::countr_zero(mask);
997static inline size_t backward_search_128(
const uint64_t* s,
1000 int* block_excess =
nullptr) noexcept {
1002 const int delta = excess_positions_128(s, target_x, out);
1003 if (block_excess !=
nullptr) {
1004 *block_excess = delta;
1006 if (end_offset == 0) {
1010 const size_t max_prefix_length = end_offset - 1;
1011 if (max_prefix_length > 0) {
1012 const size_t last_bit_index = max_prefix_length - 1;
1013 size_t word = last_bit_index >> 6;
1014 const size_t bit_in_word = last_bit_index & 63;
1015 uint64_t mask = out[word] & first_bits_mask(bit_in_word + 1);
1018 return word * 64 + (63 - std::countl_zero(mask)) + 1;
1027 return target_x == 0 ? 0 : 128;
1042static inline void excess_positions_512(
const uint64_t* s,
1044 uint64_t* out)
noexcept {
1045 if (target_x < -512 || target_x > 512) {
1046 out[0] = out[1] = out[2] = out[3] = 0;
1047 out[4] = out[5] = out[6] = out[7] = 0;
1051 for (
int k = 0; k < 4; ++k) {
1052 target_x -= excess_positions_128(s + 2 * k, target_x, out + 2 * k);
1067static inline void rank_32x8(
const uint8_t* x, uint8_t* result) {
1068#ifdef PIXIE_AVX512_SUPPORT
1070 popcount_32x8(x, result);
1071 __m256i prefix_sums = _mm256_loadu_si256((__m256i
const*)result);
1072 const __m256i zero = _mm256_setzero_si256();
1074 prefix_sums = _mm256_add_epi8(prefix_sums,
1075 _mm256_alignr_epi8(prefix_sums, zero, 16 - 1));
1076 prefix_sums = _mm256_add_epi8(prefix_sums,
1077 _mm256_alignr_epi8(prefix_sums, zero, 16 - 2));
1078 prefix_sums = _mm256_add_epi8(prefix_sums,
1079 _mm256_alignr_epi8(prefix_sums, zero, 16 - 4));
1080 prefix_sums = _mm256_add_epi8(prefix_sums,
1081 _mm256_alignr_epi8(prefix_sums, zero, 16 - 8));
1085 __m128i low_lane = _mm256_extracti128_si256(prefix_sums, 0);
1086 __m128i high_lane = _mm256_extracti128_si256(prefix_sums, 1);
1087 auto last_val_low = _mm_extract_epi8(low_lane, 15);
1088 __m128i add_to_high = _mm_set1_epi8(last_val_low);
1089 high_lane = _mm_add_epi8(high_lane, add_to_high);
1090 prefix_sums = _mm256_set_m128i(high_lane, low_lane);
1091 _mm256_storeu_epi8(result, prefix_sums);
1094 result[0] = std::popcount(x[0]);
1095 for (
size_t i = 1; i < 32; ++i) {
1096 result[i] = std::popcount(x[i]) + result[i - 1];