Pixie
Loading...
Searching...
No Matches
bits.h
1#pragma once
2
3#include <immintrin.h>
4
5#include <bit>
6#include <cstddef>
7#include <cstdint>
8#include <limits>
9#include <numeric>
10
11#if defined(__AVX512VPOPCNTDQ__) && defined(__AVX512F__) && \
12 defined(__AVX512BW__)
13#define PIXIE_AVX512_SUPPORT
14#endif
15
16#ifdef __AVX2__
17#define PIXIE_AVX2_SUPPORT
18// Lookup table for 4-bit popcount
19// This table maps each 4-bit value (0-15) to its population count
20// clang-format off
21static inline const __m256i lookup_popcount_4 = _mm256_setr_epi8(
22 0, 1, 1, 2, // 0000, 0001, 0010, 0011
23 1, 2, 2, 3, // 0100, 0101, 0110, 0111
24 1, 2, 2, 3, // 1000, 1001, 1010, 1011
25 2, 3, 3, 4, // 1100, 1101, 1110, 1111
26
27 // Same table repeated for high 128 bits
28 0, 1, 1, 2, // 0000, 0001, 0010, 0011
29 1, 2, 2, 3, // 0100, 0101, 0110, 0111
30 1, 2, 2, 3, // 1000, 1001, 1010, 1011
31 2, 3, 3, 4 // 1100, 1101, 1110, 1111
32);
33
34static inline const __m256i mask_first_half = _mm256_setr_epi8(
35 0xFF, 0xFF, 0xFF, 0xFF,
36 0xFF, 0xFF, 0xFF, 0xFF,
37 0xFF, 0xFF, 0xFF, 0xFF,
38 0xFF, 0xFF, 0xFF, 0xFF,
39 0, 0, 0, 0,
40 0, 0, 0, 0,
41 0, 0, 0, 0,
42 0, 0, 0, 0
43);
44
45// clang-format on
46#endif
47
62static inline uint32_t rmm_btree_match_mask_i16x16(const int16_t* prefix_before,
63 const int16_t* min_excess,
64 const int16_t* max_excess,
65 int16_t target,
66 bool include_zero_boundary) {
67#ifdef PIXIE_AVX2_SUPPORT
68 const __m256i vtarget = _mm256_set1_epi16(target);
69 const __m256i vprefix =
70 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(prefix_before));
71 const __m256i vmin =
72 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(min_excess));
73 const __m256i vmax =
74 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(max_excess));
75
76 const __m256i lower = _mm256_adds_epi16(vprefix, vmin);
77 const __m256i upper = _mm256_adds_epi16(vprefix, vmax);
78 const __m256i ge_lower = _mm256_or_si256(_mm256_cmpgt_epi16(vtarget, lower),
79 _mm256_cmpeq_epi16(vtarget, lower));
80 const __m256i le_upper = _mm256_or_si256(_mm256_cmpgt_epi16(upper, vtarget),
81 _mm256_cmpeq_epi16(upper, vtarget));
82 __m256i matched = _mm256_and_si256(ge_lower, le_upper);
83 if (include_zero_boundary) {
84 matched = _mm256_or_si256(matched, _mm256_cmpeq_epi16(vtarget, vprefix));
85 }
86
87 const uint32_t byte_mask =
88 static_cast<uint32_t>(_mm256_movemask_epi8(matched));
89 uint32_t result = 0;
90 for (size_t lane = 0; lane < 16; ++lane) {
91 const uint32_t lane_mask = 0x3u << (lane * 2);
92 if ((byte_mask & lane_mask) == lane_mask) {
93 result |= uint32_t{1} << lane;
94 }
95 }
96 return result;
97#else
98 uint32_t result = 0;
99 for (size_t lane = 0; lane < 16; ++lane) {
100 const int lower = prefix_before[lane] + min_excess[lane];
101 const int upper = prefix_before[lane] + max_excess[lane];
102 const bool found = (lower <= target && target <= upper) ||
103 (include_zero_boundary && target == prefix_before[lane]);
104 if (found) {
105 result |= uint32_t{1} << lane;
106 }
107 }
108 return result;
109#endif
110}
111
126static inline uint32_t rmm_btree_match_mask_i64x4(const int64_t* prefix_before,
127 const int64_t* min_excess,
128 const int64_t* max_excess,
129 int64_t target,
130 bool include_zero_boundary) {
131#ifdef PIXIE_AVX2_SUPPORT
132 const __m256i vtarget = _mm256_set1_epi64x(target);
133 const __m256i vprefix =
134 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(prefix_before));
135 const __m256i vmin =
136 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(min_excess));
137 const __m256i vmax =
138 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(max_excess));
139
140 const __m256i relative = _mm256_sub_epi64(vtarget, vprefix);
141 const __m256i ge_min = _mm256_or_si256(_mm256_cmpgt_epi64(relative, vmin),
142 _mm256_cmpeq_epi64(relative, vmin));
143 const __m256i le_max = _mm256_or_si256(_mm256_cmpgt_epi64(vmax, relative),
144 _mm256_cmpeq_epi64(vmax, relative));
145 __m256i matched = _mm256_and_si256(ge_min, le_max);
146 if (include_zero_boundary) {
147 matched = _mm256_or_si256(matched, _mm256_cmpeq_epi64(vtarget, vprefix));
148 }
149
150 const uint32_t byte_mask =
151 static_cast<uint32_t>(_mm256_movemask_epi8(matched));
152 uint32_t result = 0;
153 for (size_t lane = 0; lane < 4; ++lane) {
154 const uint32_t lane_mask = 0xffu << (lane * 8);
155 if ((byte_mask & lane_mask) == lane_mask) {
156 result |= uint32_t{1} << lane;
157 }
158 }
159 return result;
160#else
161 uint32_t result = 0;
162 for (size_t lane = 0; lane < 4; ++lane) {
163 const int64_t relative = target - prefix_before[lane];
164 const bool found =
165 (min_excess[lane] <= relative && relative <= max_excess[lane]) ||
166 (include_zero_boundary && relative == 0);
167 if (found) {
168 result |= uint32_t{1} << lane;
169 }
170 }
171 return result;
172#endif
173}
174
182static inline uint64_t first_bits_mask(size_t num) {
183 return num >= 64 ? UINT64_MAX : ((1llu << num) - 1);
184}
185
206static inline uint64_t rank_512(const uint64_t* x, uint64_t count) {
207#ifdef PIXIE_AVX512_SUPPORT
208
209 __m512i a = _mm512_maskz_set1_epi64((1ull << ((count >> 6))) - 1,
210 std::numeric_limits<uint64_t>::max());
211 __m512i b = _mm512_maskz_set1_epi64((1ull << ((count >> 6) + 1)) - 1,
212 std::numeric_limits<uint64_t>::max());
213 __m512i mask = _mm512_shldv_epi64(a, b, _mm512_set1_epi64(count % 64));
214
215 __m512i res = _mm512_loadu_epi64(x);
216 res = _mm512_and_epi64(res, mask);
217 __m512i cnt = _mm512_popcnt_epi64(res);
218 return _mm512_reduce_add_epi64(cnt);
219
220#else
221
222 uint64_t last_uint = count < 512 ? count >> 6 : 8;
223
224 uint64_t pop_val = 0;
225
226 for (int i = 0; i < last_uint; i++) {
227 pop_val += std::popcount(x[i]);
228 }
229
230 pop_val += count < 512
231 ? std::popcount(x[last_uint] & first_bits_mask(count & 63))
232 : 0;
233 return pop_val;
234
235#endif
236}
237
241static inline uint64_t select_64(uint64_t x, uint64_t rank) {
242 return _tzcnt_u64(_pdep_u64(1ull << rank, x));
243}
244
262static inline uint64_t select_512(const uint64_t* x, uint64_t rank) {
263#ifdef PIXIE_AVX512_SUPPORT
264
265 __m512i res = _mm512_loadu_epi64(x);
266 __m512i counts = _mm512_popcnt_epi64(res);
267 __m512i prefix = counts;
268
269 const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0);
270 const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0);
271 const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0);
272
273 __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix);
274 prefix = _mm512_add_epi64(prefix, tmp);
275 tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix);
276 prefix = _mm512_add_epi64(prefix, tmp);
277 tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix);
278 prefix = _mm512_add_epi64(prefix, tmp);
279
280 __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank));
281 uint32_t i = _tzcnt_u32(static_cast<uint32_t>(mask));
282 uint64_t prev = 0;
283 if (i != 0) {
284 __m512i idx_prev = _mm512_set1_epi64(static_cast<int64_t>(i - 1));
285 __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix);
286 prev = static_cast<uint64_t>(
287 _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec)));
288 }
289 return i * 64 + select_64(x[i], rank - prev);
290
291#else
292
293 size_t i = 0;
294 int popcount = std::popcount(x[0]);
295 while (i < 7 && popcount <= rank) {
296 rank -= popcount;
297 popcount = std::popcount(x[++i]);
298 }
299 return i * 64 + select_64(x[i], rank);
300
301#endif
302}
303
308static inline uint64_t select0_512(const uint64_t* x, uint64_t rank0) {
309#ifdef PIXIE_AVX512_SUPPORT
310
311 __m512i res = _mm512_loadu_epi64(x);
312 res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1));
313 __m512i counts = _mm512_popcnt_epi64(res);
314 __m512i prefix = counts;
315
316 const __m512i idx_shift1 = _mm512_set_epi64(6, 5, 4, 3, 2, 1, 0, 0);
317 const __m512i idx_shift2 = _mm512_set_epi64(5, 4, 3, 2, 1, 0, 0, 0);
318 const __m512i idx_shift4 = _mm512_set_epi64(3, 2, 1, 0, 0, 0, 0, 0);
319
320 __m512i tmp = _mm512_maskz_permutexvar_epi64(0xFE, idx_shift1, prefix);
321 prefix = _mm512_add_epi64(prefix, tmp);
322 tmp = _mm512_maskz_permutexvar_epi64(0xFC, idx_shift2, prefix);
323 prefix = _mm512_add_epi64(prefix, tmp);
324 tmp = _mm512_maskz_permutexvar_epi64(0xF0, idx_shift4, prefix);
325 prefix = _mm512_add_epi64(prefix, tmp);
326
327 __mmask8 mask = _mm512_cmpgt_epu64_mask(prefix, _mm512_set1_epi64(rank0));
328 uint32_t i = _tzcnt_u32(static_cast<uint32_t>(mask));
329 uint64_t prev = 0;
330 if (i != 0) {
331 __m512i idx_prev = _mm512_set1_epi64(static_cast<int64_t>(i - 1));
332 __m512i prev_vec = _mm512_permutexvar_epi64(idx_prev, prefix);
333 prev = static_cast<uint64_t>(
334 _mm_cvtsi128_si64(_mm512_castsi512_si128(prev_vec)));
335 }
336 return i * 64 + select_64(~x[i], rank0 - prev);
337
338#else
339
340 size_t i = 0;
341 int popcount = std::popcount(~x[0]);
342 while (i < 7 && popcount <= rank0) {
343 rank0 -= popcount;
344 popcount = std::popcount(~x[++i]);
345 }
346 return i * 64 + select_64(~x[i], rank0);
347
348#endif
349}
350
355static inline uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
356#ifdef PIXIE_AVX512_SUPPORT
357
358 auto y_4 = _mm256_set1_epi64x(y);
359 auto reg_256 = _mm256_loadu_epi64(x);
360 auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
361
362 return _tzcnt_u16(cmp);
363
364#else
365#ifdef PIXIE_AVX2_SUPPORT
366
367 auto y_4 = _mm256_set1_epi64x(y);
368 __m256i reg_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
369
370 const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
371 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
372 __m256i y_offset = _mm256_xor_si256(y_4, offset);
373 auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
374 x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
375
376 return _tzcnt_u32(mask) >> 3;
377
378#else
379
380 for (uint16_t i = 0; i < 4; ++i) {
381 if (x[i] >= y) {
382 return i;
383 }
384 }
385 return 4;
386
387#endif
388#endif
389}
390
404static inline uint16_t lower_bound_delta_4x64(const uint64_t* x,
405 uint64_t y,
406 const uint64_t* delta_array,
407 uint64_t delta_scalar) {
408#ifdef PIXIE_AVX512_SUPPORT
409
410 const __m256i dlt_256 = _mm256_loadu_epi64(delta_array);
411 auto x_256 = _mm256_loadu_epi64(x);
412 auto dlt_4 = _mm256_set1_epi64x(delta_scalar);
413 auto y_4 = _mm256_set1_epi64x(y);
414
415 auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
416 auto reg_256 = _mm256_sub_epi64(tmp, x_256);
417 auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
418
419 return _tzcnt_u16(cmp);
420
421#else
422#ifdef PIXIE_AVX2_SUPPORT
423
424 const __m256i dlt_256 =
425 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(delta_array));
426 auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
427 auto dlt_4 = _mm256_set1_epi64x(delta_scalar);
428 auto y_4 = _mm256_set1_epi64x(y);
429
430 auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
431 auto reg_256 = _mm256_sub_epi64(tmp, x_256);
432
433 const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
434 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
435 __m256i y_offset = _mm256_xor_si256(y_4, offset);
436 auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
437 x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
438
439 return _tzcnt_u32(mask) >> 3;
440
441#else
442
443 for (uint16_t i = 0; i < 4; ++i) {
444 if (delta_array[i] + delta_scalar - x[i] >= y) {
445 return i;
446 }
447 }
448 return 4;
449
450#endif
451#endif
452}
453
458static inline uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
459#ifdef PIXIE_AVX512_SUPPORT
460
461 auto y_8 = _mm512_set1_epi64(y);
462 auto reg_512 = _mm512_loadu_epi64(x);
463 auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
464
465 return _tzcnt_u16(cmp);
466
467#else
468#ifdef PIXIE_AVX2_SUPPORT
469
470 uint16_t len = lower_bound_4x64(x, y);
471
472 if (len < 4) {
473 return len;
474 }
475
476 return len + lower_bound_4x64(x + 4, y);
477
478#else
479
480 for (uint16_t i = 0; i < 8; ++i) {
481 if (x[i] >= y) {
482 return i;
483 }
484 }
485 return 8;
486
487#endif
488#endif
489}
490
504static inline uint16_t lower_bound_delta_8x64(const uint64_t* x,
505 uint64_t y,
506 const uint64_t* delta_array,
507 uint64_t delta_scalar) {
508#ifdef PIXIE_AVX512_SUPPORT
509
510 const __m512i dlt_512 = _mm512_loadu_epi64(delta_array);
511 auto x_512 = _mm512_loadu_epi64(x);
512 auto dlt_8 = _mm512_set1_epi64(delta_scalar);
513 auto y_8 = _mm512_set1_epi64(y);
514
515 auto tmp = _mm512_add_epi64(dlt_8, dlt_512);
516 auto reg_512 = _mm512_sub_epi64(tmp, x_512);
517 auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
518
519 return _tzcnt_u16(cmp);
520
521#else
522#ifdef PIXIE_AVX2_SUPPORT
523
524 uint16_t len = lower_bound_delta_4x64(x, y, delta_array, delta_scalar);
525
526 if (len < 4) {
527 return len;
528 }
529
530 return len + lower_bound_delta_4x64(x + 4, y, delta_array + 4, delta_scalar);
531
532#else
533
534 for (uint16_t i = 0; i < 8; ++i) {
535 if (delta_array[i] + delta_scalar - x[i] >= y) {
536 return i;
537 }
538 }
539 return 8;
540
541#endif
542#endif
543}
544
549static inline uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) {
550#ifdef PIXIE_AVX512_SUPPORT
551
552 auto y_32 = _mm512_set1_epi16(y);
553 auto reg_512 = _mm512_loadu_epi16(x);
554 auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
555 return std::popcount(cmp);
556
557#else
558#ifdef PIXIE_AVX2_SUPPORT
559
560 auto y_16 = _mm256_set1_epi16(y);
561 __m256i reg_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
562
563 const __m256i offset = _mm256_set1_epi16(0x8000);
564 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
565 __m256i y_offset = _mm256_xor_si256(y_16, offset);
566 uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
567
568 uint16_t count = std::popcount(mask) >> 1;
569
570 reg_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + 16));
571
572 x_offset = _mm256_xor_si256(reg_256, offset);
573 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
574
575 return count + (std::popcount(mask) >> 1);
576
577#else
578
579 uint16_t cnt = 0;
580 for (uint16_t i = 0; i < 32; ++i) {
581 if (x[i] < y) {
582 cnt++;
583 }
584 }
585 return cnt;
586
587#endif
588#endif
589}
590
604static inline uint16_t lower_bound_delta_32x16(const uint16_t* x,
605 uint16_t y,
606 const uint16_t* delta_array,
607 uint16_t delta_scalar) {
608#ifdef PIXIE_AVX512_SUPPORT
609
610 const __m512i dlt_512 = _mm512_loadu_epi64(delta_array);
611 auto x_512 = _mm512_loadu_epi64(x);
612 auto dlt_32 = _mm512_set1_epi16(delta_scalar);
613 auto y_32 = _mm512_set1_epi16(y);
614
615 auto tmp = _mm512_add_epi16(dlt_32, dlt_512);
616 auto reg_512 = _mm512_sub_epi16(tmp, x_512);
617 auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
618 return std::popcount(cmp);
619
620#else
621#ifdef PIXIE_AVX2_SUPPORT
622
623 auto dlt_256 =
624 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(delta_array));
625 auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
626 auto dlt_16 = _mm256_set1_epi16(delta_scalar);
627 auto y_16 = _mm256_set1_epi16(y);
628
629 auto tmp = _mm256_add_epi16(dlt_16, dlt_256);
630 auto reg_256 = _mm256_sub_epi16(tmp, x_256);
631
632 const __m256i offset = _mm256_set1_epi16(0x8000);
633 __m256i x_offset = _mm256_xor_si256(reg_256, offset);
634 __m256i y_offset = _mm256_xor_si256(y_16, offset);
635 uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
636
637 uint16_t count = std::popcount(mask) >> 1;
638
639 dlt_256 =
640 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(delta_array + 16));
641 x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + 16));
642
643 tmp = _mm256_add_epi16(dlt_16, dlt_256);
644 reg_256 = _mm256_sub_epi16(tmp, x_256);
645
646 x_offset = _mm256_xor_si256(reg_256, offset);
647 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
648
649 return count + (std::popcount(mask) >> 1);
650
651#else
652
653 uint16_t cnt = 0;
654 for (uint16_t i = 0; i < 32; ++i) {
655 if (delta_array[i] + delta_scalar - x[i] < y) {
656 cnt++;
657 }
658 }
659 return cnt;
660
661#endif
662#endif
663}
664
676static inline void popcount_64x4(const uint8_t* x, uint8_t* result) {
677#ifdef PIXIE_AVX512_SUPPORT
678 __m256i data = _mm256_loadu_si256((__m256i const*)x);
679
680 // Masks for extracting the lower and upper nibbles
681 const __m256i low_bits_mask = _mm256_set1_epi8(0x0F);
682
683 // Count bits in the lower half
684 __m256i low_bits = _mm256_and_si256(data, low_bits_mask);
685 __m256i low_count = _mm256_shuffle_epi8(lookup_popcount_4, low_bits);
686
687 // Count bits in the upper half
688 __m256i high_bits = _mm256_srli_epi16(data, 4);
689 high_bits = _mm256_and_si256(high_bits, low_bits_mask);
690 __m256i high_count = _mm256_shuffle_epi8(lookup_popcount_4, high_bits);
691
692 // Pack the results into a single output vector
693 __m256i result_vec =
694 _mm256_or_si256(low_count, _mm256_slli_epi16(high_count, 4));
695 _mm256_storeu_epi8(result, result_vec);
696#else
697 // Fallback implementation for non-AVX2 platforms
698 for (size_t i = 0; i < 32; i++) {
699 // Count bits in the lower half
700 uint8_t a = x[i] & 0x0F;
701 uint8_t low_count = std::popcount(a);
702 // Count bits in the upper half
703 a = (x[i] >> 4) & 0x0F;
704 uint8_t high_count = std::popcount(a);
705
706 // Pack the counts into the output byte
707 result[i] = low_count | (high_count << 4);
708 }
709#endif
710}
711
723static inline void popcount_32x8(const uint8_t* x, uint8_t* result) {
724#ifdef PIXIE_AVX512_SUPPORT
725 // Load 64 4-bit integers (256 bits total)
726 __m256i data = _mm256_loadu_si256((__m256i const*)x);
727 auto popcount_8 = _mm256_popcnt_epi8(data);
728 _mm256_storeu_si256((__m256i*)result, popcount_8);
729#else
730#ifdef PIXIE_AVX2_SUPPORT
731 // Load 64 4-bit integers (256 bits total)
732 __m256i data = _mm256_loadu_si256((__m256i const*)x);
733
734 // Masks for extracting the lower and upper nibbles
735 const __m256i low_bits_mask = _mm256_set1_epi8(0x0F);
736
737 // Count bits in lower half
738 __m256i low_bits = _mm256_and_si256(data, low_bits_mask);
739 __m256i low_count = _mm256_shuffle_epi8(lookup_popcount_4, low_bits);
740
741 // Count bits upper half
742 __m256i high_bits = _mm256_srli_epi16(data, 4);
743 high_bits = _mm256_and_si256(high_bits, low_bits_mask);
744 __m256i high_count = _mm256_shuffle_epi8(lookup_popcount_4, high_bits);
745
746 __m256i result_vec = _mm256_add_epi8(low_count, high_count);
747 _mm256_storeu_si256((__m256i*)result, result_vec);
748#else
749 // Fallback implementation for non-AVX2 platforms
750 for (size_t i = 0; i < 32; i++) {
751 result[i] = std::popcount(x[i]);
752 }
753#endif
754#endif
755}
756
757#ifdef PIXIE_AVX2_SUPPORT
758// clang-format off
759// LUT for total excess change across a 4-bit nibble
760static inline const __m256i excess_lut_delta = _mm256_setr_epi8(
761 -4, -2, -2, 0,
762 -2, 0, 0, 2,
763 -2, 0, 0, 2,
764 0, 2, 2, 4,
765 -4, -2, -2, 0,
766 -2, 0, 0, 2,
767 -2, 0, 0, 2,
768 0, 2, 2, 4);
769
770// LUTs for target relative excess positions
771static inline const __m256i excess_lut_pos0 = _mm256_setr_epi8(
772 -1, 1, -1, 1,
773 -1, 1, -1, 1,
774 -1, 1, -1, 1,
775 -1, 1, -1, 1,
776 -1, 1, -1, 1,
777 -1, 1, -1, 1,
778 -1, 1, -1, 1,
779 -1, 1, -1, 1);
780
781static inline const __m256i excess_lut_pos1 = _mm256_setr_epi8(
782 -2, 0, 0, 2,
783 -2, 0, 0, 2,
784 -2, 0, 0, 2,
785 -2, 0, 0, 2,
786 -2, 0, 0, 2,
787 -2, 0, 0, 2,
788 -2, 0, 0, 2,
789 -2, 0, 0, 2);
790
791static inline const __m256i excess_lut_pos2 = _mm256_setr_epi8(
792 -3, -1, -1, 1,
793 -1, 1, 1, 3,
794 -3, -1, -1, 1,
795 -1, 1, 1, 3,
796 -3, -1, -1, 1,
797 -1, 1, 1, 3,
798 -3, -1, -1, 1,
799 -1, 1, 1, 3);
800static inline const __m256i excess_lut_pack_multiplier =
801 _mm256_set1_epi16(0x1001);
802static inline const __m256i excess_lut_bit0 = _mm256_set1_epi8(1);
803static inline const __m256i excess_lut_bit1 = _mm256_set1_epi8(2);
804static inline const __m256i excess_lut_bit2 = _mm256_set1_epi8(4);
805static inline const __m256i excess_lut_bit3 = _mm256_set1_epi8(8);
806static inline const __m128i excess_lut_nibble_mask = _mm_set1_epi8(0x0F);
807// clang-format on
808#endif
809
823static inline int excess_positions_128(const uint64_t* s,
824 int target_x,
825 uint64_t* out) noexcept {
826 out[0] = out[1] = 0;
827 const int block_delta = 2 * (std::popcount(s[0]) + std::popcount(s[1])) - 128;
828
829 if (target_x < -128 || target_x > 128) {
830 return block_delta;
831 }
832
833#ifdef PIXIE_AVX2_SUPPORT
834 const __m256i vdelta = excess_lut_delta;
835 const __m256i vpos0 = excess_lut_pos0;
836 const __m256i vpos1 = excess_lut_pos1;
837 const __m256i vpos2 = excess_lut_pos2;
838 const __m256i vmult = excess_lut_pack_multiplier;
839 const __m256i vbit0 = excess_lut_bit0;
840 const __m256i vbit1 = excess_lut_bit1;
841 const __m256i vbit2 = excess_lut_bit2;
842 const __m256i vbit3 = excess_lut_bit3;
843 const __m128i vnibble_mask = excess_lut_nibble_mask;
844
845 const int d = 2 * target_x - block_delta;
846 if (d < -128 || d > 128) {
847 return block_delta;
848 }
849
850 __m128i word_vec = _mm_loadu_si128((const __m128i*)s);
851 __m128i lo_nibbles = _mm_and_si128(word_vec, vnibble_mask);
852 __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word_vec, 4), vnibble_mask);
853
854 __m128i unpack_lo = _mm_unpacklo_epi8(lo_nibbles, hi_nibbles);
855 __m128i unpack_hi = _mm_unpackhi_epi8(lo_nibbles, hi_nibbles);
856
857 __m256i nibbles =
858 _mm256_inserti128_si256(_mm256_castsi128_si256(unpack_lo), unpack_hi, 1);
859
860 __m256i ps = _mm256_shuffle_epi8(vdelta, nibbles);
861 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 1));
862 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 2));
863 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 4));
864 ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 8));
865
866 __m128i ps_lo = _mm256_castsi256_si128(ps);
867 __m128i ps_hi = _mm256_extracti128_si256(ps, 1);
868 __m128i carry = _mm_set1_epi8((int8_t)_mm_extract_epi8(ps_lo, 15));
869 ps_hi = _mm_add_epi8(ps_hi, carry);
870 ps = _mm256_inserti128_si256(_mm256_castsi128_si256(ps_lo), ps_hi, 1);
871
872 __m256i b = _mm256_permute2x128_si256(ps, ps, 0x08);
873 __m256i excl_ps = _mm256_alignr_epi8(ps, b, 15);
874
875 __m256i vtgt = _mm256_set1_epi8((int8_t)target_x);
876 __m256i t = _mm256_sub_epi8(vtgt, excl_ps);
877
878 __m256i cmp0 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos0, nibbles), t);
879 __m256i cmp1 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos1, nibbles), t);
880 __m256i cmp2 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(vpos2, nibbles), t);
881 __m256i cmp3 = _mm256_cmpeq_epi8(ps, vtgt);
882
883 __m256i bit0 = _mm256_and_si256(cmp0, vbit0);
884 __m256i bit1 = _mm256_and_si256(cmp1, vbit1);
885 __m256i bit2 = _mm256_and_si256(cmp2, vbit2);
886 __m256i bit3 = _mm256_and_si256(cmp3, vbit3);
887
888 __m256i total_match =
889 _mm256_or_si256(_mm256_or_si256(bit0, bit1), _mm256_or_si256(bit2, bit3));
890
891 __m256i res = _mm256_maddubs_epi16(total_match, vmult);
892 __m128i res_lo = _mm256_castsi256_si128(res);
893 __m128i res_hi = _mm256_extracti128_si256(res, 1);
894 __m128i packed = _mm_packus_epi16(res_lo, res_hi);
895
896 _mm_storeu_si128((__m128i*)out, packed);
897#else
898 int cur = 0;
899 for (size_t i = 0; i < 128; ++i) {
900 const uint64_t w = s[i >> 6];
901 const int bit = int((w >> (i & 63)) & 1ull);
902 cur += bit ? +1 : -1;
903 if (cur == target_x) {
904 out[i >> 6] |= (uint64_t{1} << (i & 63));
905 }
906 }
907#endif
908 return block_delta;
909}
910
920static inline int prefix_excess_128(const uint64_t* s,
921 size_t end_offset) noexcept {
922 end_offset = end_offset > 128 ? 128 : end_offset;
923 if (end_offset == 0) {
924 return 0;
925 }
926 if (end_offset <= 64) {
927 const int ones = static_cast<int>(std::popcount(
928 s[0] & first_bits_mask(static_cast<uint32_t>(end_offset))));
929 return 2 * ones - static_cast<int>(end_offset);
930 }
931 const int ones = static_cast<int>(
932 std::popcount(s[0]) +
933 std::popcount(s[1] &
934 first_bits_mask(static_cast<uint32_t>(end_offset - 64))));
935 return 2 * ones - static_cast<int>(end_offset);
936}
937
953static inline size_t forward_search_128(const uint64_t* s,
954 int target_x,
955 size_t start_offset,
956 int* block_excess = nullptr) noexcept {
957 uint64_t out[2];
958 const int delta = excess_positions_128(s, target_x, out);
959 if (block_excess != nullptr) {
960 *block_excess = delta;
961 }
962 if (start_offset >= 128) {
963 return 128;
964 }
965
966 const size_t first_word = start_offset >> 6;
967 const size_t first_bit = start_offset & 63;
968 for (size_t word = first_word; word < 2; ++word) {
969 uint64_t mask = out[word];
970 if (word == first_word && first_bit != 0) {
971 mask &= ~first_bits_mask(first_bit);
972 }
973 if (mask != 0) {
974 return word * 64 + std::countr_zero(mask);
975 }
976 }
977 return 128;
978}
979
997static inline size_t backward_search_128(const uint64_t* s,
998 int target_x,
999 size_t end_offset,
1000 int* block_excess = nullptr) noexcept {
1001 uint64_t out[2];
1002 const int delta = excess_positions_128(s, target_x, out);
1003 if (block_excess != nullptr) {
1004 *block_excess = delta;
1005 }
1006 if (end_offset == 0) {
1007 return 128;
1008 }
1009
1010 const size_t max_prefix_length = end_offset - 1;
1011 if (max_prefix_length > 0) {
1012 const size_t last_bit_index = max_prefix_length - 1;
1013 size_t word = last_bit_index >> 6;
1014 const size_t bit_in_word = last_bit_index & 63;
1015 uint64_t mask = out[word] & first_bits_mask(bit_in_word + 1);
1016 while (true) {
1017 if (mask != 0) {
1018 return word * 64 + (63 - std::countl_zero(mask)) + 1;
1019 }
1020 if (word == 0) {
1021 break;
1022 }
1023 --word;
1024 mask = out[word];
1025 }
1026 }
1027 return target_x == 0 ? 0 : 128;
1028}
1029
1042static inline void excess_positions_512(const uint64_t* s,
1043 int target_x,
1044 uint64_t* out) noexcept {
1045 if (target_x < -512 || target_x > 512) {
1046 out[0] = out[1] = out[2] = out[3] = 0;
1047 out[4] = out[5] = out[6] = out[7] = 0;
1048 return;
1049 }
1050
1051 for (int k = 0; k < 4; ++k) {
1052 target_x -= excess_positions_128(s + 2 * k, target_x, out + 2 * k);
1053 }
1054}
1055
1067static inline void rank_32x8(const uint8_t* x, uint8_t* result) {
1068#ifdef PIXIE_AVX512_SUPPORT
1069 // Step 1: Calculate popcount of each byte
1070 popcount_32x8(x, result);
1071 __m256i prefix_sums = _mm256_loadu_si256((__m256i const*)result);
1072 const __m256i zero = _mm256_setzero_si256();
1073
1074 prefix_sums = _mm256_add_epi8(prefix_sums,
1075 _mm256_alignr_epi8(prefix_sums, zero, 16 - 1));
1076 prefix_sums = _mm256_add_epi8(prefix_sums,
1077 _mm256_alignr_epi8(prefix_sums, zero, 16 - 2));
1078 prefix_sums = _mm256_add_epi8(prefix_sums,
1079 _mm256_alignr_epi8(prefix_sums, zero, 16 - 4));
1080 prefix_sums = _mm256_add_epi8(prefix_sums,
1081 _mm256_alignr_epi8(prefix_sums, zero, 16 - 8));
1082
1083 // At this point we have prefix sums for two halfs, the last step is to
1084 // extract 16-th value and add it to the whole second half
1085 __m128i low_lane = _mm256_extracti128_si256(prefix_sums, 0);
1086 __m128i high_lane = _mm256_extracti128_si256(prefix_sums, 1);
1087 auto last_val_low = _mm_extract_epi8(low_lane, 15);
1088 __m128i add_to_high = _mm_set1_epi8(last_val_low);
1089 high_lane = _mm_add_epi8(high_lane, add_to_high);
1090 prefix_sums = _mm256_set_m128i(high_lane, low_lane);
1091 _mm256_storeu_epi8(result, prefix_sums);
1092#else
1093 // Scalar fallback implementation
1094 result[0] = std::popcount(x[0]);
1095 for (size_t i = 1; i < 32; ++i) {
1096 result[i] = std::popcount(x[i]) + result[i - 1];
1097 }
1098#endif
1099}