Pixie
Loading...
Searching...
No Matches
rmm_tree.h
1#pragma once
2#include <immintrin.h>
3#include <pixie/bits.h>
4
5#include <algorithm>
6#include <array>
7#include <bit>
8#include <climits>
9#include <cstddef>
10#include <cstdint>
11#include <limits>
12#include <string>
13#include <vector>
14
15namespace pixie {
34class RmMTree {
35 // ------------ bitvector ------------
36 std::vector<std::uint64_t> bits; // LSB-first
37 size_t num_bits = 0; // number of bits
38
39 // ------------ blocking ------------
40 size_t block_bits = 64; // block size (bits), leaf covers <= block_bits bits
41 size_t leaf_count = 0; // #leaves = ceil(num_bits/block_bits)
42
43 // ------------ tree arrays (heap order: 1 is root) ------------
44 // size of segment (in bits) covered by node
45 // needed for: rank1/rank0, select1/select0, select10,
46 // excess, fwdsearch/bwdsearch/close/open/enclose,
47 // range_min_query/range_max_query, minselect.
48 std::vector<uint32_t> segment_size_bits;
49
50 // node_total_excess = total excess (+1 for '1', -1 for '0') on the node
51 // needed for: rank1/rank0, select1/select0, excess,
52 // fwdsearch/bwdsearch/close/open/enclose,
53 // range_min_query/range_max_query, mincount/minselect.
54 std::vector<int32_t> node_total_excess;
55
56 // node_min_prefix_excess = minimum pref-excess on the node (from 0)
57 // needed for: fwdsearch/bwdsearch/close/open/enclose, range_min_query,
58 // mincount/minselect.
59 std::vector<int32_t> node_min_prefix_excess;
60
61 // node_max_prefix_excess = maximum pref-excess on the node (from 0)
62 // needed for: fwdsearch/bwdsearch/close/open/enclose, range_max_query.
63 std::vector<int32_t> node_max_prefix_excess;
64
65 // node_min_count = number of positions where the minimum is attained
66 // needed for: mincount/minselect.
67 std::vector<uint32_t> node_min_count;
68
69 // node_pattern10_count = # of "10" pattern occurrences inside the node
70 // needed for: rank10, select10.
71 std::vector<uint32_t> node_pattern10_count;
72
73 // node_first_bit = first bit (0/1), node_last_bit = last bit (0/1) of the
74 // segment (to handle "10" crossing)
75 // both needed for: rank10, select10.
76 std::vector<uint8_t> node_first_bit, node_last_bit;
77
78 public:
82 static constexpr size_t npos = std::numeric_limits<size_t>::max();
83
84#ifdef DEBUG
85 float built_overhead = 0.0;
86#endif
87
88 // --------- construction ----------
89
93 RmMTree() = default;
94
104 explicit RmMTree(const std::string& bit_string,
105 const size_t& leaf_block_bits /*0=auto*/ = 0,
106 const float& max_overhead /*<0=off*/ = -1.0) {
107 build_from_string(bit_string, leaf_block_bits, max_overhead);
108 }
109
120 explicit RmMTree(const std::vector<std::uint64_t>& words,
121 size_t bit_count,
122 const size_t& leaf_block_bits /*0=auto*/ = 0,
123 const float& max_overhead /*<0=off*/ = -1.0) {
124 build_from_words(words, bit_count, leaf_block_bits, max_overhead);
125 }
126
127 // --------- queries: rank/select/excess ----------
128
133 size_t rank1(const size_t& end_position) const {
134 if (end_position == 0) {
135 return 0;
136 }
137 const size_t block_index = block_of(end_position - 1);
138 size_t ones_count = 0;
139 if (block_index > 0) {
140 size_t nodes_buffer[64];
141 const size_t node_count =
142 cover_blocks_collect(0, block_index - 1, nodes_buffer);
143 for (size_t j = 0; j < node_count; ++j) {
144 ones_count += ones_in_node(nodes_buffer[j]);
145 }
146 }
147 const size_t block_begin = block_index * block_bits;
148 const size_t block_end = std::min(num_bits, block_begin + block_bits);
149 ones_count +=
150 rank1_in_block(block_begin, std::min(end_position, block_end));
151 return ones_count;
152 }
153
158 size_t rank0(const size_t& end_position) const {
159 return end_position - rank1(end_position);
160 }
161
166 size_t select1(size_t target_one_rank) const {
167 if (target_one_rank == 0 || num_bits == 0) {
168 return npos;
169 }
170 size_t node_index = 1;
171 if (ones_in_node(node_index) < target_one_rank) {
172 return npos;
173 }
174 size_t segment_base = 0;
175 while (node_index < first_leaf_index) {
176 const size_t left_child = node_index << 1;
177 const size_t right_child = left_child | 1;
178 const uint32_t ones_in_left_child = ones_in_node(left_child);
179 if (ones_in_left_child >= target_one_rank) {
180 node_index = left_child;
181 } else {
182 target_one_rank -= ones_in_left_child;
183 segment_base += segment_size_bits[left_child];
184 node_index = right_child;
185 }
186 }
187 return select1_in_block(
188 segment_base,
189 std::min(segment_base + segment_size_bits[node_index], num_bits),
190 target_one_rank);
191 }
192
197 size_t select0(size_t target_zero_rank) const {
198 if (target_zero_rank == 0 || num_bits == 0) {
199 return npos;
200 }
201 size_t node_index = 1;
202 const auto zeros_in_node = [&](const size_t& node) noexcept {
203 return segment_size_bits[node] - ones_in_node(node);
204 };
205 if (zeros_in_node(node_index) < target_zero_rank) {
206 return npos;
207 }
208 size_t segment_base = 0;
209 while (node_index < first_leaf_index) {
210 const size_t left_child = node_index << 1;
211 const size_t right_child = left_child | 1;
212 const size_t zeros_in_left_child = zeros_in_node(left_child);
213 if (zeros_in_left_child >= target_zero_rank) {
214 node_index = left_child;
215 } else {
216 target_zero_rank -= zeros_in_left_child;
217 segment_base += segment_size_bits[left_child];
218 node_index = right_child;
219 }
220 }
221 return select0_in_block(
222 segment_base,
223 std::min(segment_base + segment_size_bits[node_index], num_bits),
224 target_zero_rank);
225 }
226
232 size_t rank10(const size_t& end_position) const {
233 if (end_position <= 1) {
234 return 0;
235 }
236 const size_t block_index = block_of(end_position - 1);
237 size_t pattern_count = 0;
238 int previous_last_bit = -1;
239
240 if (block_index > 0) {
241 const auto covered_nodes = cover_blocks(0, block_index - 1);
242 for (const size_t& node_index : covered_nodes) {
243 pattern_count += node_pattern10_count[node_index];
244 if (previous_last_bit != -1 && previous_last_bit == 1 &&
245 node_first_bit[node_index] == 0) {
246 ++pattern_count;
247 }
248 previous_last_bit = node_last_bit[node_index];
249 }
250 }
251 const size_t block_begin = block_index * block_bits;
252 pattern_count += rr_in_block(block_begin, end_position);
253 // boundary between the last full node and the leaf tail
254 if (block_index > 0 && end_position > block_begin &&
255 previous_last_bit == 1 && bit(block_begin) == 0) {
256 ++pattern_count;
257 }
258 return pattern_count;
259 }
260
265 size_t select10(size_t target_pattern_rank) const {
266 if (target_pattern_rank == 0 || num_bits == 0) {
267 return npos;
268 }
269 size_t node_index = 1;
270 if (node_pattern10_count[node_index] < target_pattern_rank) {
271 return npos;
272 }
273 const size_t tree_size = segment_size_bits.size() - 1;
274 size_t segment_base = 0;
275 while (node_index < first_leaf_index) {
276 const size_t left_child = node_index << 1;
277 const size_t left_segment_size =
278 (left_child <= tree_size) ? segment_size_bits[left_child] : 0;
279 if (left_segment_size == 0) {
280 return npos;
281 }
282
283 const size_t left_count = node_pattern10_count[left_child];
284 if (left_count >= target_pattern_rank) {
285 node_index = left_child;
286 continue;
287 }
288
289 size_t remaining_rank = target_pattern_rank - left_count;
290 const size_t right_child = left_child | 1;
291 const bool has_right =
292 (right_child <= tree_size) && (segment_size_bits[right_child] != 0);
293 if (!has_right) {
294 return npos;
295 }
296
297 const size_t crossing_pattern =
298 (node_last_bit[left_child] == 1 && node_first_bit[right_child] == 0)
299 ? 1u
300 : 0u;
301 if (crossing_pattern) {
302 if (remaining_rank == 1) {
303 return segment_base + left_segment_size - 1;
304 }
305 --remaining_rank;
306 }
307 segment_base += left_segment_size;
308 node_index = right_child;
309 target_pattern_rank = remaining_rank;
310 }
311 return select10_in_block(
312 segment_base,
313 std::min(segment_base + segment_size_bits[node_index], num_bits),
314 target_pattern_rank);
315 }
316
320 inline int excess(const size_t& end_position) const {
321 return int64_t(rank1(end_position)) * 2 - int64_t(end_position);
322 }
323
330 size_t fwdsearch(const size_t& start_position, const int& delta) const {
331 if (start_position >= num_bits) {
332 return npos;
333 }
334
335 // 1) scan the remainder of the current leaf
336 const size_t leaf_block_index = block_of(start_position);
337 const size_t block_begin = leaf_block_index * block_bits;
338 const size_t block_end = std::min(num_bits, block_begin + block_bits);
339 int leaf_delta = 0;
340 const size_t leaf_result = leaf_fwd_bp_simd(
341 leaf_block_index, block_begin, start_position, delta, leaf_delta);
342 if (leaf_result != npos) {
343 return leaf_result;
344 }
345
346 int remaining_delta = delta - leaf_delta;
347 size_t segment_base = block_end;
348 if (remaining_delta == 0) {
349 return segment_base;
350 }
351
352 // Tree-walk to the right:
353 // go up; whenever we come from a left child, try the right sibling subtree.
354 // If target is inside sibling -> descend; else skip it and continue up.
355 size_t node_index = leaf_index_of(block_begin);
356 const size_t tree_size = segment_size_bits.size() - 1;
357
358 // If we are already at/after the last leaf boundary, there's nothing to
359 // scan.
360 if (segment_base >= num_bits || leaf_block_index + 1 >= leaf_count) {
361 return npos;
362 }
363
364 while (node_index > 1) {
365 const bool is_left_child = ((node_index & 1u) == 0u);
366 if (is_left_child) {
367 const size_t sibling = node_index | 1u; // right sibling
368 if (sibling <= tree_size && segment_size_bits[sibling]) {
369 // Boundary at sibling start already handled above via
370 // remaining_delta==0.
371 if (node_min_prefix_excess[sibling] <= remaining_delta &&
372 remaining_delta <= node_max_prefix_excess[sibling]) {
373 return descend_fwd(sibling, remaining_delta, segment_base);
374 }
375 // Skip whole sibling subtree.
376 remaining_delta -= node_total_excess[sibling];
377 segment_base += segment_size_bits[sibling];
378 if (remaining_delta == 0) {
379 return segment_base;
380 }
381 }
382 }
383 node_index >>= 1;
384 }
385 return npos;
386 }
387
394 size_t bwdsearch(const size_t& start_position, const int& delta) const {
395 if (start_position > num_bits || start_position == 0) {
396 return npos;
397 }
398
399 // 1) scan inside the block
400 const size_t leaf_block_index = block_of(start_position - 1);
401 const size_t block_begin = leaf_block_index * block_bits;
402 int leaf_delta = 0; // excess(start_position) - excess(block_begin)
403 const size_t leaf_result = leaf_bwd_bp_simd(
404 leaf_block_index, block_begin, start_position, delta, leaf_delta);
405 if (leaf_result != npos) {
406 return leaf_result;
407 }
408
409 // need = target - excess(block_begin) = excess(start_position) + delta -
410 // excess(block_begin) = leaf_delta + delta
411 int remaining_delta = leaf_delta + delta;
412 size_t node_index = leaf_index_of(block_begin);
413 size_t segment_base = block_begin;
414 while (node_index > 1) {
415 if (node_index & 1) { // node_index is the right child
416 const size_t sibling_index = node_index ^ 1; // left sibling
417 const size_t sibling_border =
418 segment_base; // right border of the sibling (== start(node_index))
419 const int needed_inside_sibling =
420 remaining_delta +
421 node_total_excess[sibling_index]; // target in coordinates relative
422 // to the start of sibling
423 const bool allow_right_border =
424 (sibling_border != start_position); // j must be < start_position
425
426 // try inside the sibling, but return only if a position is found
427 if (needed_inside_sibling == 0 ||
428 (node_min_prefix_excess[sibling_index] <= needed_inside_sibling &&
429 needed_inside_sibling <= node_max_prefix_excess[sibling_index])) {
430 const size_t result = descend_bwd(
431 sibling_index, sibling_border - segment_size_bits[sibling_index],
432 needed_inside_sibling, sibling_border, allow_right_border);
433 if (result != npos) {
434 return result;
435 }
436 }
437 // junction between children is a separate branch (allowed only if < i)
438 if (needed_inside_sibling == node_total_excess[sibling_index] &&
439 sibling_border < start_position) {
440 return sibling_border;
441 }
442
443 // stepped over the sibling, shifted the zero point of the coordinates
444 remaining_delta += node_total_excess[sibling_index];
445 segment_base -= segment_size_bits[sibling_index];
446 }
447 node_index >>= 1;
448 }
449 return npos;
450 }
451
458 size_t range_min_query_pos(const size_t& range_begin,
459 const size_t& range_end) const {
460 if (range_begin > range_end || range_end >= num_bits) {
461 return npos;
462 }
463
464 const size_t begin_block_index = block_of(range_begin);
465 const size_t begin_block_start = begin_block_index * block_bits;
466 const size_t begin_block_end =
467 std::min(num_bits, begin_block_start + block_bits);
468 const size_t end_block_index = block_of(range_end);
469 const size_t end_block_start = end_block_index * block_bits;
470
471 int best_value = INT_MAX;
472 size_t best_position = npos;
473 size_t chosen_node_index = 0;
474 int prefix_excess = 0, prefix_at_choice = 0;
475
476 // prefix
477 int min_prefix_first_chunk = INT_MAX;
478 size_t first_chunk_position = npos;
479 const size_t end_of_first_chunk = std::min(
480 range_end, (size_t)(begin_block_end ? begin_block_end - 1 : 0));
481 if (range_begin <= end_of_first_chunk) {
482 first_min_value_pos8(range_begin, end_of_first_chunk,
483 min_prefix_first_chunk, first_chunk_position);
484 prefix_excess =
485 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
486 int64_t(end_of_first_chunk + 1 - range_begin);
487 best_value = min_prefix_first_chunk;
488 best_position = first_chunk_position;
489 chosen_node_index = 0;
490 }
491
492 // middle
493 if (begin_block_index + 1 <= end_block_index - 1) {
494 size_t left_index = first_leaf_index + (begin_block_index + 1);
495 size_t right_index = first_leaf_index + (end_block_index - 1);
496 size_t right_nodes[64];
497 int right_nodes_count = 0;
498
499 while (left_index <= right_index) {
500 if (left_index & 1) {
501 const size_t node_index = left_index++;
502 const int candidate =
503 prefix_excess + node_min_prefix_excess[node_index];
504 if (candidate < best_value) {
505 best_value = candidate;
506 best_position = npos;
507 chosen_node_index = node_index;
508 prefix_at_choice = prefix_excess;
509 }
510 prefix_excess += node_total_excess[node_index];
511 }
512 if ((right_index & 1) == 0) {
513 right_nodes[right_nodes_count++] = right_index--;
514 }
515 left_index >>= 1;
516 right_index >>= 1;
517 }
518 while (right_nodes_count--) {
519 const size_t node_index = right_nodes[right_nodes_count];
520 const int candidate =
521 prefix_excess + node_min_prefix_excess[node_index];
522 if (candidate < best_value) {
523 best_value = candidate;
524 best_position = npos;
525 chosen_node_index = node_index;
526 prefix_at_choice = prefix_excess;
527 }
528 prefix_excess += node_total_excess[node_index];
529 }
530 }
531
532 // tail
533 if (end_block_index != begin_block_index) {
534 int min_prefix_last_chunk;
535 size_t last_chunk_position;
536 first_min_value_pos8(end_block_start, range_end, min_prefix_last_chunk,
537 last_chunk_position);
538 const int candidate = prefix_excess + min_prefix_last_chunk;
539 if (candidate < best_value) {
540 best_value = candidate;
541 best_position = last_chunk_position;
542 chosen_node_index = 0;
543 }
544 }
545
546 if (best_position != npos) {
547 return best_position;
548 }
549
550 return descend_first_min(chosen_node_index, best_value - prefix_at_choice,
551 node_base(chosen_node_index));
552 }
553
560 int range_min_query_val(const size_t& range_begin,
561 const size_t& range_end) const {
562 if (range_begin > range_end || range_end >= num_bits) {
563 return 0;
564 }
565 size_t min_position = range_min_query_pos(range_begin, range_end);
566 if (min_position == npos) {
567 return 0;
568 }
569 return excess(min_position + 1) - excess(range_begin);
570 }
571
578 size_t range_max_query_pos(const size_t& range_begin,
579 const size_t& range_end) const {
580 if (range_begin > range_end || range_end >= num_bits) {
581 return npos;
582 }
583
584 const size_t begin_block_index = block_of(range_begin);
585 const size_t begin_block_start = begin_block_index * block_bits;
586 const size_t begin_block_end =
587 std::min(num_bits, begin_block_start + block_bits);
588 const size_t end_block_index = block_of(range_end);
589 const size_t end_block_start = end_block_index * block_bits;
590
591 int best_value = INT_MIN;
592 size_t best_position = npos;
593 size_t chosen_node_index = 0;
594 int prefix_excess = 0, prefix_at_choice = 0;
595
596 // prefix
597 int max_prefix_first_chunk = INT_MIN;
598 size_t first_chunk_position = npos;
599 const size_t end_of_first_chunk = std::min(
600 range_end, (size_t)(begin_block_end ? begin_block_end - 1 : 0));
601 if (range_begin <= end_of_first_chunk) {
602 first_max_value_pos8(range_begin, end_of_first_chunk,
603 max_prefix_first_chunk, first_chunk_position);
604 prefix_excess =
605 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
606 int64_t(end_of_first_chunk + 1 - range_begin);
607 best_value = max_prefix_first_chunk;
608 best_position = first_chunk_position;
609 chosen_node_index = 0;
610 }
611
612 // middle
613 if (begin_block_index + 1 <= end_block_index - 1) {
614 size_t left_index = first_leaf_index + (begin_block_index + 1);
615 size_t right_index = first_leaf_index + (end_block_index - 1);
616 size_t right_nodes[64];
617 int right_nodes_count = 0;
618
619 while (left_index <= right_index) {
620 if (left_index & 1) {
621 const size_t node_index = left_index++;
622 const int candidate =
623 prefix_excess + node_max_prefix_excess[node_index];
624 if (candidate > best_value) {
625 best_value = candidate;
626 best_position = npos;
627 chosen_node_index = node_index;
628 prefix_at_choice = prefix_excess;
629 }
630 prefix_excess += node_total_excess[node_index];
631 }
632 if ((right_index & 1) == 0) {
633 right_nodes[right_nodes_count++] = right_index--;
634 }
635 left_index >>= 1;
636 right_index >>= 1;
637 }
638 while (right_nodes_count--) {
639 const size_t node_index = right_nodes[right_nodes_count];
640 const int candidate =
641 prefix_excess + node_max_prefix_excess[node_index];
642 if (candidate > best_value) {
643 best_value = candidate;
644 best_position = npos;
645 chosen_node_index = node_index;
646 prefix_at_choice = prefix_excess;
647 }
648 prefix_excess += node_total_excess[node_index];
649 }
650 }
651
652 // tail
653 if (end_block_index != begin_block_index) {
654 int max_prefix_last_chunk;
655 size_t last_chunk_position;
656 first_max_value_pos8(end_block_start, range_end, max_prefix_last_chunk,
657 last_chunk_position);
658 const int candidate = prefix_excess + max_prefix_last_chunk;
659 if (candidate > best_value) {
660 best_value = candidate;
661 best_position = last_chunk_position;
662 chosen_node_index = 0;
663 }
664 }
665
666 if (best_position != npos) {
667 return best_position;
668 }
669
670 return descend_first_max(chosen_node_index, best_value - prefix_at_choice,
671 node_base(chosen_node_index));
672 }
673
678 int range_max_query_val(const size_t& range_begin,
679 const size_t& range_end) const {
680 if (range_begin > range_end || range_end >= num_bits) {
681 return 0;
682 }
683 size_t max_position = range_max_query_pos(range_begin, range_end);
684 if (max_position == npos) {
685 return 0;
686 }
687 return excess(max_position + 1) - excess(range_begin);
688 }
689
694 size_t mincount(const size_t& range_begin, const size_t& range_end) const {
695 if (range_begin > range_end || range_end >= num_bits) {
696 return 0;
697 }
698
699 const size_t begin_block_index = block_of(range_begin);
700 const size_t begin_block_start = begin_block_index * block_bits;
701 const size_t begin_block_end =
702 std::min(num_bits, begin_block_start + block_bits);
703 const size_t end_block_index = block_of(range_end);
704 const size_t end_block_start = end_block_index * block_bits;
705
706 int best_value = INT_MAX;
707 size_t min_count = 0;
708 int prefix_excess = 0;
709
710 // first chunk
711 {
712 int current_excess = 0, min_value = INT_MAX, local_count = 0;
713 const size_t end_of_first_chunk =
714 std::min(range_end, begin_block_end - 1);
715 for (size_t position = range_begin; position <= end_of_first_chunk;
716 ++position) {
717 current_excess += bit(position) ? +1 : -1;
718 if (current_excess < min_value) {
719 min_value = current_excess;
720 local_count = 1;
721 } else if (current_excess == min_value) {
722 ++local_count;
723 }
724 }
725 best_value = min_value;
726 min_count = local_count;
727 prefix_excess = current_excess; // offset toward the middle
728 }
729
730 // middle
731 if (begin_block_index + 1 <= end_block_index - 1) {
732 const auto middle_nodes =
733 cover_blocks(begin_block_index + 1, end_block_index - 1);
734 for (const size_t& node_index : middle_nodes) {
735 const int candidate =
736 prefix_excess + node_min_prefix_excess[node_index];
737 if (candidate < best_value) {
738 best_value = candidate;
739 min_count = node_min_count[node_index];
740 } else if (candidate == best_value) {
741 min_count += node_min_count[node_index];
742 }
743 prefix_excess += node_total_excess[node_index];
744 }
745 }
746
747 // last chunk
748 if (end_block_index != begin_block_index) {
749 int current_excess = 0, min_value = INT_MAX, local_count = 0;
750 for (size_t position = end_block_start; position <= range_end;
751 ++position) {
752 current_excess += bit(position) ? +1 : -1;
753 if (current_excess < min_value) {
754 min_value = current_excess;
755 local_count = 1;
756 } else if (current_excess == min_value) {
757 ++local_count;
758 }
759 }
760 const int candidate = prefix_excess + min_value;
761 if (candidate < best_value) {
762 best_value = candidate;
763 min_count = local_count;
764 } else if (candidate == best_value) {
765 min_count += local_count;
766 }
767 }
768 return min_count;
769 }
770
777 size_t minselect(const size_t& range_begin,
778 const size_t& range_end,
779 size_t target_min_rank) const {
780 if (range_begin > range_end || range_end >= num_bits ||
781 target_min_rank == 0) {
782 return npos;
783 }
784
785 const size_t begin_block_index = block_of(range_begin);
786 const size_t begin_block_start = begin_block_index * block_bits;
787 const size_t begin_block_end =
788 std::min(num_bits, begin_block_start + block_bits);
789 const size_t end_block_index = block_of(range_end);
790 const size_t end_block_start = end_block_index * block_bits;
791
792 // prefix
793 const size_t end_of_first_chunk = std::min(range_end, begin_block_end - 1);
794 int current_first_chunk_excess = 0, min_first_chunk = 0;
795 uint32_t count_first_chunk = 0;
796
797 if (range_begin <= end_of_first_chunk) {
798 scan_range_min_count8(range_begin, end_of_first_chunk,
799 current_first_chunk_excess, min_first_chunk,
800 count_first_chunk);
801 } else {
802 current_first_chunk_excess = 0;
803 min_first_chunk = INT_MAX;
804 count_first_chunk = 0;
805 }
806
807 int best_value = (min_first_chunk == INT_MAX ? INT_MAX : min_first_chunk);
808 size_t total_count =
809 (min_first_chunk == INT_MAX ? 0u : (size_t)count_first_chunk);
810 int prefix_excess = current_first_chunk_excess; // offset for middle
811
812 size_t left_index = first_leaf_index + begin_block_index + 1;
813 size_t right_index = first_leaf_index + end_block_index - 1;
814 size_t right_nodes[64];
815 int right_nodes_count = 0;
816
817 // middle
818 if (begin_block_index + 1 <= end_block_index - 1) {
819 while (left_index <= right_index) {
820 if (left_index & 1) {
821 const int candidate =
822 prefix_excess + node_min_prefix_excess[left_index];
823 if (candidate < best_value) {
824 best_value = candidate;
825 total_count = node_min_count[left_index];
826 } else if (candidate == best_value) {
827 total_count += node_min_count[left_index];
828 }
829 prefix_excess += node_total_excess[left_index++];
830 }
831 if ((right_index & 1) == 0) {
832 right_nodes[right_nodes_count++] = right_index--;
833 }
834 left_index >>= 1;
835 right_index >>= 1;
836 }
837 while (right_nodes_count--) {
838 const size_t node_index = right_nodes[right_nodes_count];
839 const int candidate =
840 prefix_excess + node_min_prefix_excess[node_index];
841 if (candidate < best_value) {
842 best_value = candidate;
843 total_count = node_min_count[node_index];
844 } else if (candidate == best_value) {
845 total_count += node_min_count[node_index];
846 }
847 prefix_excess += node_total_excess[node_index];
848 }
849 }
850
851 // tail
852 int current_last_chunk_excess = 0, min_last_chunk = INT_MAX;
853 uint32_t count_last_chunk = 0;
854 if (end_block_index != begin_block_index) {
855 scan_range_min_count8(end_block_start, range_end,
856 current_last_chunk_excess, min_last_chunk,
857 count_last_chunk);
858 const int candidate = prefix_excess + min_last_chunk;
859 if (candidate < best_value) {
860 best_value = candidate;
861 total_count = count_last_chunk;
862 } else if (candidate == best_value) {
863 total_count += count_last_chunk;
864 }
865 }
866
867 if (target_min_rank > total_count) {
868 return npos;
869 }
870
871 // prefix
872 if (min_first_chunk == best_value && count_first_chunk) {
873 if (target_min_rank <= count_first_chunk) {
874 return qth_min_in_block(range_begin, end_of_first_chunk,
875 target_min_rank);
876 }
877 target_min_rank -= count_first_chunk;
878 }
879
880 // middle
881 prefix_excess = current_first_chunk_excess;
882 if (begin_block_index + 1 <= end_block_index - 1) {
883 left_index = first_leaf_index + (begin_block_index + 1);
884 right_index = first_leaf_index + (end_block_index - 1);
885 right_nodes_count = 0;
886 while (left_index <= right_index) {
887 if (left_index & 1) {
888 const size_t node_index = left_index++;
889 const int candidate =
890 prefix_excess + node_min_prefix_excess[node_index];
891 if (candidate == best_value) {
892 if (target_min_rank <= node_min_count[node_index]) {
893 return descend_qth_min(node_index, best_value - prefix_excess,
894 target_min_rank, node_base(node_index));
895 }
896 target_min_rank -= node_min_count[node_index];
897 }
898 prefix_excess += node_total_excess[node_index];
899 }
900 if (!(right_index & 1)) {
901 right_nodes[right_nodes_count++] = right_index--;
902 }
903 left_index >>= 1;
904 right_index >>= 1;
905 }
906 while (right_nodes_count--) {
907 const size_t node_index = right_nodes[right_nodes_count];
908 const int candidate =
909 prefix_excess + node_min_prefix_excess[node_index];
910 if (candidate == best_value) {
911 if (target_min_rank <= node_min_count[node_index]) {
912 return descend_qth_min(node_index, best_value - prefix_excess,
913 target_min_rank, node_base(node_index));
914 }
915 target_min_rank -= node_min_count[node_index];
916 }
917 prefix_excess += node_total_excess[node_index];
918 }
919 }
920
921 // tail
922 if (end_block_index != begin_block_index &&
923 (prefix_excess + min_last_chunk) == best_value) {
924 return qth_min_in_block(end_block_start, range_end, target_min_rank);
925 }
926
927 return npos;
928 }
929
930 // ----- parentheses navigation (BP) -----
931
936 inline size_t close(const size_t& open_position) const {
937 if (open_position >= num_bits) {
938 return npos;
939 }
940 return fwdsearch(open_position, -1);
941 }
942
947 inline size_t open(const size_t& close_position) const {
948 // bwdsearch allows i in [1..num_bits]
949 if (close_position == 0 || close_position > num_bits) {
950 return npos;
951 }
952 const size_t result = bwdsearch(close_position, 0);
953 return (result == npos ? npos : result + 1);
954 }
955
961 inline size_t enclose(const size_t& position) const {
962 if (position == 0 || position > num_bits) {
963 return npos;
964 }
965 const size_t result = bwdsearch(position, -2);
966 return (result == npos ? npos : result + 1);
967 }
968
969 private:
975 static inline size_t pop10_in_slice64(const std::uint64_t& slice,
976 const int& length) noexcept {
977 if (length <= 1) {
978 return 0;
979 }
980 std::uint64_t pattern_mask = slice & ~(slice >> 1); // candidates for "10"
981 if (length < 64) {
982 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
983 } else {
984 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
985 }
986 return (size_t)std::popcount(pattern_mask);
987 }
988
993 size_t rank1_in_block(const size_t& block_begin,
994 const size_t& block_end) const noexcept {
995 if (block_end <= block_begin) {
996 return 0;
997 }
998 size_t left_word_index = block_begin >> 6;
999 const size_t right_word_index = block_end >> 6;
1000 size_t left_offset = block_begin & 63;
1001 const size_t right_offset = block_end & 63;
1002 size_t count = 0;
1003 if (left_word_index == right_word_index) {
1004 const std::uint64_t mask =
1005 ((right_offset == 0) ? 0 : ((std::uint64_t(1) << right_offset) - 1)) &
1006 (~std::uint64_t(0) << left_offset);
1007 return (size_t)std::popcount(bits[left_word_index] & mask);
1008 }
1009 if (left_offset) {
1010 count += (size_t)std::popcount(bits[left_word_index] &
1011 (~std::uint64_t(0) << left_offset));
1012 ++left_word_index;
1013 }
1014 while (left_word_index < right_word_index) {
1015 count += (size_t)std::popcount(bits[left_word_index]);
1016 ++left_word_index;
1017 }
1018 if (right_offset) {
1019 count += (size_t)std::popcount(bits[right_word_index] &
1020 ((std::uint64_t(1) << right_offset) - 1));
1021 }
1022 return count;
1023 }
1024
1029 size_t rr_in_block(const size_t& block_begin,
1030 const size_t& block_end) const noexcept {
1031 if (block_end <= block_begin + 1) {
1032 return 0;
1033 }
1034 size_t left_word_index = block_begin >> 6;
1035 const size_t right_word_index = (block_end - 1) >> 6;
1036 const int left_offset = block_begin & 63;
1037 const int right_offset = (block_end - 1) & 63;
1038 size_t count = 0;
1039
1040 if (left_word_index == right_word_index) {
1041 const int length = right_offset - left_offset + 1;
1042 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1043 return pop10_in_slice64(slice, length);
1044 }
1045
1046 // prefix word
1047 {
1048 const int length = 64 - left_offset;
1049 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1050 count += pop10_in_slice64(slice, length);
1051 }
1052 // full interior words
1053 for (size_t word_index = left_word_index + 1; word_index < right_word_index;
1054 ++word_index) {
1055 const std::uint64_t word = bits[word_index];
1056 count += pop10_in_slice64(word, 64);
1057 }
1058 // suffix word
1059 {
1060 const int length = right_offset + 1;
1061 const std::uint64_t mask = (length == 64)
1062 ? ~std::uint64_t(0)
1063 : ((std::uint64_t(1) << length) - 1);
1064 const std::uint64_t slice = bits[right_word_index] & mask;
1065 count += pop10_in_slice64(slice, length);
1066 }
1067 // cross-word boundaries (bit 63 of w and bit 0 of w+1)
1068 for (size_t word_index = left_word_index; word_index < right_word_index;
1069 ++word_index) {
1070 if (((bits[word_index] >> 63) & 1u) &&
1071 ((bits[word_index + 1] & 1u) == 0)) {
1072 ++count;
1073 }
1074 }
1075 return count;
1076 }
1077
1083 size_t select10_in_block(const size_t& block_begin,
1084 const size_t& block_end,
1085 size_t target_pattern_rank) const noexcept {
1086 if (block_end <= block_begin + 1) {
1087 return npos;
1088 }
1089 size_t left_word_index = block_begin >> 6;
1090 const size_t right_word_index = (block_end - 1) >> 6;
1091 const int left_offset = block_begin & 63;
1092 const int right_offset = (block_end - 1) & 63;
1093
1094 const auto select_in_masked_slice =
1095 [&](const std::uint64_t& slice, const int& length,
1096 const size_t& target_index) noexcept -> int {
1097 if (length <= 1) {
1098 return -1;
1099 }
1100 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1101 if (length < 64) {
1102 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1103 } else {
1104 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
1105 }
1106 return select_in_word(pattern_mask, target_index);
1107 };
1108
1109 if (left_word_index == right_word_index) {
1110 const int length = right_offset - left_offset + 1;
1111 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1112 const int offset =
1113 select_in_masked_slice(slice, length, target_pattern_rank);
1114 return offset >= 0 ? (block_begin + (size_t)offset) : npos;
1115 }
1116
1117 // prefix word
1118 {
1119 const int length = 64 - left_offset;
1120 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1121 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1122 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1123 const int count = std::popcount(pattern_mask);
1124 if (target_pattern_rank <= (size_t)count) {
1125 const int offset =
1126 select_in_masked_slice(slice, length, target_pattern_rank);
1127 return block_begin + (size_t)offset;
1128 }
1129 target_pattern_rank -= count;
1130 }
1131
1132 // walk interior boundaries and words
1133 for (size_t word_index = left_word_index; word_index + 1 < right_word_index;
1134 ++word_index) {
1135 // boundary between w and w+1
1136 if (((bits[word_index] >> 63) & 1u) &&
1137 ((bits[word_index + 1] & 1u) == 0)) {
1138 if (--target_pattern_rank == 0) {
1139 return (word_index << 6) + 63;
1140 }
1141 }
1142 // full word w+1 (positions 0..62)
1143 const std::uint64_t next_word = bits[word_index + 1];
1144 const std::uint64_t pattern_mask =
1145 (next_word & ~(next_word >> 1)) & 0x7FFFFFFFFFFFFFFFull;
1146 const int count = std::popcount(pattern_mask);
1147 if (target_pattern_rank <= (size_t)count) {
1148 const int offset = select_in_word(pattern_mask, target_pattern_rank);
1149 if (offset == -1) {
1150 return npos;
1151 }
1152 return ((word_index + 1) << 6) + (size_t)offset;
1153 }
1154 target_pattern_rank -= count;
1155 }
1156
1157 // boundary (w_r-1, w_r)
1158 if (((bits[right_word_index - 1] >> 63) & 1u) &&
1159 ((bits[right_word_index] & 1u) == 0)) {
1160 if (--target_pattern_rank == 0) {
1161 return ((right_word_index - 1) << 6) + 63;
1162 }
1163 }
1164
1165 // suffix word w_r: [0..off_r]
1166 {
1167 const int length = right_offset + 1;
1168 const std::uint64_t mask = (length == 64)
1169 ? ~std::uint64_t(0)
1170 : ((std::uint64_t(1) << length) - 1);
1171 const std::uint64_t slice = bits[right_word_index] & mask;
1172 const int offset =
1173 select_in_masked_slice(slice, length, target_pattern_rank);
1174 if (offset >= 0) {
1175 return (right_word_index << 6) + (size_t)offset;
1176 }
1177 }
1178 return npos;
1179 }
1180
1181 struct ByteAgg {
1182 int8_t excess_total; // total excess for the byte
1183 int8_t min_prefix; // minimum prefix within the byte (from 0)
1184 int8_t max_prefix; // maximum prefix within the byte (from 0)
1185 uint8_t min_count; // number of positions attaining the minimum in the byte
1186 uint8_t pattern10_count; // number of "10" patterns inside the byte
1187 uint8_t first_bit; // first bit (LSB)
1188 uint8_t last_bit; // last bit (MSB)
1189 uint8_t pos_first_min; // pos of first minimum in this byte
1190 uint8_t pos_first_max; // pos of first maximum in this byte
1191 };
1192
1193 struct LUT8Tables {
1194 std::array<ByteAgg, 256> agg;
1199 std::array<std::array<int8_t, 17>, 256> fwd_pos;
1204 std::array<std::array<int8_t, 17>, 256> bwd_pos;
1205 };
1206
1210 static inline const LUT8Tables& LUT8_ALL() noexcept {
1211 static const LUT8Tables tables = [] {
1212 LUT8Tables lookup_tables{};
1213 for (int byte_value = 0; byte_value < 256; ++byte_value) {
1214 int current_excess = 0, min_prefix = INT_MAX, max_prefix = INT_MIN,
1215 min_count = 0, pattern10_count = 0;
1216 int first_min_position = 0, first_max_position = 0;
1217 int prefixes[8];
1218 const auto bit_at = [&](const int& bit_index) {
1219 return (byte_value >> bit_index) & 1;
1220 }; // LSB-first
1221 for (int bit_index = 0; bit_index < 8; ++bit_index) {
1222 int bit_value = bit_at(bit_index);
1223 if (bit_index + 1 < 8 && bit_value && bit_at(bit_index + 1) == 0) {
1224 ++pattern10_count;
1225 }
1226 current_excess += bit_value ? +1 : -1;
1227 prefixes[bit_index] = current_excess;
1228 if (current_excess < min_prefix) {
1229 min_prefix = current_excess;
1230 min_count = 1;
1231 first_min_position = bit_index;
1232 } else if (current_excess == min_prefix) {
1233 ++min_count;
1234 }
1235 if (current_excess > max_prefix) {
1236 max_prefix = current_excess;
1237 first_max_position = bit_index;
1238 }
1239 }
1240 ByteAgg aggregates{};
1241 aggregates.excess_total = current_excess;
1242 aggregates.min_prefix = (min_prefix == INT_MAX ? 0 : min_prefix);
1243 aggregates.max_prefix = (max_prefix == INT_MIN ? 0 : max_prefix);
1244 aggregates.min_count = min_count;
1245 aggregates.pattern10_count = pattern10_count;
1246 aggregates.first_bit = bit_at(0);
1247 aggregates.last_bit = bit_at(7);
1248 aggregates.pos_first_min = first_min_position;
1249 aggregates.pos_first_max = first_max_position;
1250 lookup_tables.agg[byte_value] = aggregates;
1251 auto& forward_positions = lookup_tables.fwd_pos[byte_value];
1252 auto& backward_positions = lookup_tables.bwd_pos[byte_value];
1253 forward_positions.fill(-1);
1254 backward_positions.fill(-1);
1255 for (int delta = -8; delta <= 8; ++delta) {
1256 for (int bit_index = 0; bit_index < 8; ++bit_index) {
1257 if (prefixes[bit_index] == delta) {
1258 forward_positions[delta + 8] = bit_index;
1259 break;
1260 }
1261 }
1262 for (int bit_index = 7; bit_index >= 0; --bit_index) {
1263 if (prefixes[bit_index] == delta) {
1264 backward_positions[delta + 8] = bit_index;
1265 break;
1266 }
1267 }
1268 }
1269 }
1270 return lookup_tables;
1271 }();
1272 return tables;
1273 }
1274
1278 static inline const std::array<ByteAgg, 256>& LUT8() noexcept {
1279 return LUT8_ALL().agg;
1280 }
1281
1285 static inline const std::array<std::array<int8_t, 17>, 256>&
1286 LUT8_FWD_POS() noexcept {
1287 return LUT8_ALL().fwd_pos;
1288 }
1289
1293 static inline const std::array<std::array<int8_t, 17>, 256>&
1294 LUT8_BWD_POS() noexcept {
1295 return LUT8_ALL().bwd_pos;
1296 }
1297
1301 inline uint16_t get_u16(const size_t& position) const noexcept {
1302 const size_t word_index = position >> 6;
1303 const unsigned offset = unsigned(position & 63);
1304 const std::uint64_t w0 =
1305 (word_index < bits.size()) ? bits[word_index] : 0ULL;
1306 if (offset == 0) {
1307 return uint16_t(w0 & 0xFFFFu);
1308 }
1309 const std::uint64_t w1 =
1310 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0ULL;
1311 const std::uint64_t v = (w0 >> offset) | (w1 << (64u - offset));
1312 return uint16_t(v & 0xFFFFu);
1313 }
1314
1315#if defined(PIXIE_AVX2_SUPPORT)
1316 static inline __m256i bit_masks_16x() noexcept {
1317 // 16 lanes: (1<<0), (1<<1), ... (1<<15)
1318 return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020,
1319 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800,
1320 0x1000, 0x2000, 0x4000, (int16_t)0x8000);
1321 }
1322
1323 static inline __m256i prefix_sum_16x_i16(__m256i v) noexcept {
1324 // Inclusive prefix sum within 128-bit lanes, then fix carry into the high
1325 // lane.
1326 __m256i x = v;
1327 __m256i t = _mm256_slli_si256(x, 2);
1328 x = _mm256_add_epi16(x, t);
1329 t = _mm256_slli_si256(x, 4);
1330 x = _mm256_add_epi16(x, t);
1331 t = _mm256_slli_si256(x, 8);
1332 x = _mm256_add_epi16(x, t);
1333
1334 __m128i lo = _mm256_extracti128_si256(x, 0);
1335 __m128i hi = _mm256_extracti128_si256(x, 1);
1336 const int16_t carry =
1337 (int16_t)_mm_extract_epi16(lo, 7); // sum of first 8 elems
1338 hi = _mm_add_epi16(hi, _mm_set1_epi16(carry));
1339
1340 __m256i out = _mm256_castsi128_si256(lo);
1341 out = _mm256_inserti128_si256(out, hi, 1);
1342 return out;
1343 }
1344
1345 static inline int16_t last_prefix_16x_i16(__m256i pref) noexcept {
1346 __m128i hi = _mm256_extracti128_si256(pref, 1);
1347 return (int16_t)_mm_extract_epi16(hi, 7); // lane 15
1348 }
1349
1356 inline size_t scan_leaf_fwd_simd(const size_t& start,
1357 const size_t& end,
1358 const int& required_delta,
1359 int* out_total) const noexcept {
1360 if (start >= end) {
1361 if (out_total) {
1362 *out_total = 0;
1363 }
1364 return npos;
1365 }
1366 if (required_delta < -32768 || required_delta > 32767) {
1367 if (out_total) {
1368 const int len = int(end - start);
1369 const int ones = int(rank1_in_block(start, end));
1370 *out_total = ones * 2 - len;
1371 }
1372 return npos;
1373 }
1374
1375 static const __m256i masks = bit_masks_16x();
1376 static const __m256i vzero = _mm256_setzero_si256();
1377 static const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1378 static const __m256i vminus1 = _mm256_set1_epi16(-1);
1379 static const __m256i vtwo = _mm256_set1_epi16(2);
1380 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1381
1382 int cur = 0;
1383 size_t pos = start;
1384 while (pos + 16 <= end) {
1385 const uint16_t bits16 = get_u16(pos);
1386 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1387 const __m256i m = _mm256_and_si256(vb, masks);
1388 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1389 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1390 const __m256i steps =
1391 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1392
1393 const __m256i pref_rel = prefix_sum_16x_i16(steps);
1394 const __m256i base = _mm256_set1_epi16((int16_t)cur);
1395 const __m256i pref = _mm256_add_epi16(pref_rel, base);
1396 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1397 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1398 if (mask) {
1399 const int lane = int(std::countr_zero(mask)) >> 1;
1400 return pos + (size_t)lane;
1401 }
1402 cur += (int)last_prefix_16x_i16(pref_rel);
1403 pos += 16;
1404 }
1405 while (pos < end) {
1406 cur += bit(pos) ? +1 : -1;
1407 if (cur == required_delta) {
1408 return pos;
1409 }
1410 ++pos;
1411 }
1412 if (out_total) {
1413 *out_total = cur;
1414 }
1415 return npos;
1416 }
1417#endif // PIXIE_AVX2_SUPPORT
1418
1425 inline size_t scan_leaf_fwd_lut8_fast(const size_t& start,
1426 const size_t& end,
1427 const int& required_delta,
1428 int* out_total) const noexcept {
1429 if (start >= end) {
1430 if (out_total) {
1431 *out_total = 0;
1432 }
1433 return npos;
1434 }
1435 int cur = 0;
1436
1437 size_t pos = start;
1438 while (pos < end && (pos & 7)) {
1439 cur += bit(pos) ? +1 : -1;
1440 if (cur == required_delta) {
1441 if (out_total) {
1442 *out_total = cur;
1443 }
1444 return pos;
1445 }
1446 ++pos;
1447 }
1448
1449 // Byte-aligned fast path: read bytes directly.
1450 // bits are LSB-first; on little-endian x86 the in-memory byte order matches
1451 // bit groups [pos..pos+7].
1452 const uint8_t* bytep =
1453 reinterpret_cast<const uint8_t*>(bits.data()) + (pos >> 3);
1454 const auto& agg = LUT8();
1455 const auto& fwd = LUT8_FWD_POS();
1456
1457 while (pos + 8 <= end) {
1458 const uint8_t bv = *bytep++;
1459 const auto& a = agg[bv];
1460 const int need = required_delta - cur;
1461 // need must be in [-8..8] for a match inside one byte.
1462 if ((unsigned)(need + 8) <= 16u) {
1463 // min/max pruning first, then position lookup.
1464 if (need >= a.min_prefix && need <= a.max_prefix) {
1465 const int8_t off = fwd[bv][need + 8];
1466 if (off >= 0) {
1467 if (out_total) {
1468 *out_total = cur + a.excess_total; // not exact end, but caller
1469 // only uses when not found
1470 }
1471 return pos + (size_t)off;
1472 }
1473 }
1474 }
1475 cur += a.excess_total;
1476 pos += 8;
1477 }
1478
1479 while (pos < end) {
1480 cur += bit(pos) ? +1 : -1;
1481 if (cur == required_delta) {
1482 if (out_total) {
1483 *out_total = cur;
1484 }
1485 return pos;
1486 }
1487 ++pos;
1488 }
1489 if (out_total) {
1490 *out_total = cur;
1491 }
1492 return npos;
1493 }
1494
1502 inline size_t scan_leaf_fwd(const size_t& search_start,
1503 const size_t& search_end,
1504 const int& required_delta) const noexcept {
1505 if (search_start >= search_end) {
1506 return npos;
1507 }
1508 const auto& aggregates_table = LUT8();
1509 const auto& forward_lookup = LUT8_FWD_POS();
1510 int current_excess = 0;
1511 size_t position = search_start;
1512 while (position + 8 <= search_end) {
1513 const uint8_t byte_value = get_byte(position);
1514 const auto& byte_aggregate = aggregates_table[byte_value];
1515 const int local_need = required_delta - current_excess;
1516 if (local_need >= byte_aggregate.min_prefix &&
1517 local_need <= byte_aggregate.max_prefix && local_need >= -8 &&
1518 local_need <= 8) {
1519 const int8_t offset = forward_lookup[byte_value][local_need + 8];
1520 if (offset >= 0) {
1521 return position + size_t(offset);
1522 }
1523 }
1524 current_excess += byte_aggregate.excess_total;
1525 position += 8;
1526 }
1527
1528 while (position < search_end) {
1529 current_excess += bit(position) ? 1 : -1;
1530 if (current_excess == required_delta) {
1531 return position;
1532 }
1533 ++position;
1534 }
1535
1536 return npos;
1537 }
1538
1547 inline size_t scan_leaf_bwd(
1548 const size_t& block_begin,
1549 const size_t& block_end,
1550 const int& required_delta,
1551 const bool& allow_right_boundary,
1552 const size_t& global_right_border,
1553 const int& prefix_at_boundary_max /*=kNoPrefixOverride*/) const noexcept {
1554 // We scan bits in [block_begin, block_end) and look for the LAST boundary
1555 // where prefix == required_delta, with optional exclusion of the right
1556 // boundary.
1557 if (block_begin > block_end) {
1558 return npos;
1559 }
1560
1561 // Maximum allowed boundary (inclusive) inside this scan.
1562 // If right boundary is forbidden, we forbid boundary == block_end.
1563 size_t boundary_max = block_end;
1564 if (!allow_right_boundary && boundary_max > block_begin) {
1565 --boundary_max;
1566 }
1567
1568 // No bits to scan -> only possible answer is the left boundary.
1569 if (block_begin >= boundary_max) {
1570 if ((block_begin < global_right_border || allow_right_boundary) &&
1571 required_delta == 0) {
1572 return block_begin;
1573 }
1574 return npos;
1575 }
1576
1577 if (required_delta < -32768 || required_delta > 32767) {
1578 // Just in case. Should be impossible.
1579 if ((block_begin < global_right_border || allow_right_boundary) &&
1580 required_delta == 0) {
1581 return block_begin;
1582 }
1583 return npos;
1584 }
1585
1586#if defined(PIXIE_AVX2_SUPPORT)
1587 // Fast reverse scan with early exit:
1588 // find the RIGHTMOST boundary j in (block_begin..boundary_max] such that
1589 // prefix(j) == required_delta.
1590
1591 // prefix_end = prefix(boundary_max) relative to block_begin
1592 int prefix_end = prefix_at_boundary_max;
1593 if (prefix_end == kNoPrefixOverride) {
1594 const int len = int(boundary_max - block_begin);
1595 const int ones = int(rank1_in_block(block_begin, boundary_max));
1596 prefix_end = ones * 2 - len;
1597 }
1598
1599 const __m256i masks = bit_masks_16x();
1600 const __m256i vzero = _mm256_setzero_si256();
1601 const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1602 const __m256i vminus1 = _mm256_set1_epi16(-1);
1603 const __m256i vtwo = _mm256_set1_epi16(2);
1604 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1605
1606 size_t pos_end = boundary_max; // boundary (not bit index)
1607 int cur_end = prefix_end; // prefix(pos_end)
1608
1609 // Vector chunks: process 16 bits ending at pos_end.
1610 while (pos_end >= block_begin + 16) {
1611 const size_t pos = pos_end - 16; // bit index of the chunk start
1612 const uint16_t bits16 = get_u16(pos);
1613 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1614 const __m256i m = _mm256_and_si256(vb, masks);
1615 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1616 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1617 const __m256i steps =
1618 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1619
1620 const __m256i pref_rel =
1621 prefix_sum_16x_i16(steps); // prefix after each bit (relative)
1622 const int16_t sum16 =
1623 last_prefix_16x_i16(pref_rel); // total sum on this 16-bit chunk
1624 const int cur_start =
1625 cur_end - (int)sum16; // prefix at boundary pos (chunk start)
1626
1627 const __m256i base = _mm256_set1_epi16((int16_t)cur_start);
1628 const __m256i pref = _mm256_add_epi16(
1629 pref_rel, base); // prefix at boundaries (pos+1..pos+16)
1630 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1631 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1632 if (mask) {
1633 const int bit_i = 31 - int(std::countl_zero(mask));
1634 const int lane = bit_i >> 1;
1635 const size_t boundary = pos + (size_t)lane + 1;
1636 if (boundary < global_right_border || allow_right_boundary) {
1637 return boundary;
1638 }
1639 return npos;
1640 }
1641
1642 cur_end = cur_start;
1643 pos_end = pos;
1644 }
1645
1646 while (pos_end > block_begin) {
1647 // boundary pos_end corresponds to prefix cur_end
1648 if (cur_end == required_delta) {
1649 if (pos_end < global_right_border || allow_right_boundary) {
1650 return pos_end;
1651 }
1652 return npos;
1653 }
1654 const size_t bit_pos = pos_end - 1;
1655 cur_end -= bit(bit_pos) ? +1 : -1; // move one bit to the left
1656 pos_end = bit_pos;
1657 }
1658#else
1659 size_t last_boundary = npos;
1660 int cur = 0;
1661 for (size_t pos = block_begin; pos < boundary_max; ++pos) {
1662 cur += bit(pos) ? +1 : -1;
1663 if (cur == required_delta) {
1664 last_boundary = pos + 1;
1665 }
1666 }
1667 if (last_boundary != npos) {
1668 if (last_boundary < global_right_border || allow_right_boundary) {
1669 return last_boundary;
1670 }
1671 return npos;
1672 }
1673#endif
1674
1675 // Left boundary (prefix == 0) is always the final candidate.
1676 if ((block_begin < global_right_border || allow_right_boundary) &&
1677 required_delta == 0) {
1678 return block_begin;
1679 }
1680 return npos;
1681 }
1682
1686 inline uint8_t get_byte(const size_t& position) const noexcept {
1687 const size_t word_index = position >> 6;
1688 const size_t offset = position & 63;
1689 const std::uint64_t lower_word = bits[word_index] >> offset;
1690 if (offset <= 56) {
1691 return uint8_t(lower_word & 0xFFu);
1692 }
1693 const std::uint64_t higher_word =
1694 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0;
1695 const std::uint64_t byte_value =
1696 (lower_word | (higher_word << (64 - offset))) & 0xFFu;
1697 return uint8_t(byte_value);
1698 }
1699
1708 size_t descend_first_max(size_t node_index,
1709 int target_prefix,
1710 size_t segment_base) const noexcept {
1711 while (node_index < first_leaf_index) {
1712 const size_t left_child = node_index << 1, right_child = left_child | 1;
1713 const int left_max = node_max_prefix_excess[left_child];
1714 const int right_max =
1715 node_total_excess[left_child] + node_max_prefix_excess[right_child];
1716 if (left_max >= right_max && left_max == target_prefix) {
1717 node_index = left_child;
1718 } else if (right_max == target_prefix) {
1719 segment_base += segment_size_bits[left_child];
1720 target_prefix -= node_total_excess[left_child];
1721 node_index = right_child;
1722 } else {
1723 return npos;
1724 }
1725 }
1726
1727 const size_t segment_begin = segment_base;
1728 const size_t segment_end =
1729 std::min(segment_base + segment_size_bits[node_index], num_bits);
1730 int max_value;
1731 size_t position;
1732
1733 first_max_value_pos8(segment_begin,
1734 segment_end ? (segment_end - 1) : segment_begin,
1735 max_value, position);
1736 return (max_value == target_prefix ? position : npos);
1737 }
1738
1742 size_t first_leaf_index = 1;
1743
1748 static constexpr int kNoPrefixOverride = std::numeric_limits<int>::min();
1749
1753 size_t block_of(const size_t& position) const noexcept {
1754 return position / block_bits;
1755 }
1756
1760 size_t leaf_index_of(const size_t& block_start) const noexcept {
1761 return first_leaf_index + block_of(block_start);
1762 }
1763
1768 size_t node_base(size_t node_index) const noexcept {
1769 if (node_index >= first_leaf_index) {
1770 return (node_index - first_leaf_index) * block_bits;
1771 }
1772
1773 size_t base = 0;
1774 for (; node_index > 1; node_index >>= 1) {
1775 if (node_index & 1) {
1776 base += segment_size_bits[node_index - 1];
1777 }
1778 }
1779 return base;
1780 }
1781
1787 std::vector<size_t> cover_blocks(const size_t& block_begin_index,
1788 const size_t& block_end_index) const {
1789 size_t left_index = first_leaf_index + block_begin_index;
1790 size_t right_index = first_leaf_index + block_end_index;
1791 std::vector<size_t> left_nodes, right_nodes;
1792 while (left_index <= right_index) {
1793 if ((left_index & 1) == 1) {
1794 left_nodes.push_back(left_index++);
1795 }
1796 if ((right_index & 1) == 0) {
1797 right_nodes.push_back(right_index--);
1798 }
1799 left_index >>= 1;
1800 right_index >>= 1;
1801 }
1802 std::reverse(right_nodes.begin(), right_nodes.end());
1803 left_nodes.insert(left_nodes.end(), right_nodes.begin(), right_nodes.end());
1804 return left_nodes;
1805 }
1806
1811 size_t descend_fwd(size_t node_index,
1812 int required_delta,
1813 size_t segment_base) const noexcept {
1814 while (node_index < first_leaf_index) {
1815 const size_t left_child = node_index << 1;
1816 const size_t right_child = left_child | 1;
1817 if (node_min_prefix_excess[left_child] <= required_delta &&
1818 required_delta <= node_max_prefix_excess[left_child]) {
1819 node_index = left_child;
1820 } else {
1821 required_delta -= node_total_excess[left_child];
1822 segment_base += segment_size_bits[left_child];
1823 node_index = right_child;
1824 }
1825 }
1826 const size_t seg_end =
1827 std::min(segment_base + segment_size_bits[node_index], num_bits);
1828#if defined(PIXIE_AVX2_SUPPORT)
1829 return scan_leaf_fwd_simd(segment_base, seg_end, required_delta, nullptr);
1830#else
1831 return scan_leaf_fwd(segment_base, seg_end, required_delta);
1832#endif
1833 }
1834
1844 size_t descend_bwd(size_t node_index,
1845 const size_t& segment_base,
1846 const int& required_delta,
1847 const size_t& global_right_border,
1848 const bool& allow_right_boundary) const noexcept {
1849 while (node_index < first_leaf_index) {
1850 const size_t left_child = node_index << 1;
1851 const size_t right_child = left_child | 1;
1852 const int required_in_right =
1853 required_delta - node_total_excess[left_child];
1854
1855 // 1) try the right child first (to capture the rightmost j)
1856 if (node_min_prefix_excess[right_child] <= required_in_right &&
1857 required_in_right <= node_max_prefix_excess[right_child]) {
1858 const size_t result = descend_bwd(
1859 right_child, segment_base + segment_size_bits[left_child],
1860 required_in_right, global_right_border, allow_right_boundary);
1861 if (result != npos) {
1862 return result;
1863 }
1864 }
1865
1866 // 2) junction between children (end of the left child)
1867 const size_t junction = segment_base + segment_size_bits[left_child];
1868 if (required_delta == node_total_excess[left_child] &&
1869 (junction < global_right_border || allow_right_boundary)) {
1870 return junction;
1871 }
1872
1873 // 3) can we move left within the range?
1874 if (node_min_prefix_excess[left_child] <= required_delta &&
1875 required_delta <= node_max_prefix_excess[left_child]) {
1876 node_index = left_child;
1877 continue;
1878 }
1879
1880 // None of (1)-(3) worked. The only possible point is the left border of
1881 // the node.
1882 if (required_delta == 0 &&
1883 (segment_base < global_right_border || allow_right_boundary)) {
1884 return segment_base;
1885 }
1886
1887 return npos;
1888 }
1889
1890 const size_t seg_end =
1891 std::min(segment_base + segment_size_bits[node_index], num_bits);
1892 const size_t block_end = std::min(global_right_border, seg_end);
1893
1894 int prefix_override = kNoPrefixOverride;
1895 // If we scan the full leaf up to its end boundary, we know
1896 // prefix(block_end) from node_total_excess[leaf]. If the right boundary is
1897 // forbidden, we can still derive prefix(block_end-1) cheaply.
1898 if (block_end == seg_end) {
1899 const int total = node_total_excess[node_index];
1900 if (!allow_right_boundary && block_end > segment_base) {
1901 prefix_override = total - (bit(block_end - 1) ? +1 : -1);
1902 } else {
1903 prefix_override = total;
1904 }
1905 }
1906
1907 return scan_leaf_bwd(segment_base, block_end, required_delta,
1908 allow_right_boundary, global_right_border,
1909 prefix_override);
1910 }
1911
1916 size_t descend_first_min(size_t node_index,
1917 int target_prefix,
1918 size_t segment_base) const noexcept {
1919 while (node_index < first_leaf_index) {
1920 const size_t left_child = node_index << 1, right_child = left_child | 1;
1921 const int left_min = node_min_prefix_excess[left_child];
1922 const int right_min =
1923 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1924 if (left_min <= right_min && left_min == target_prefix) {
1925 node_index = left_child;
1926 } else if (right_min == target_prefix) {
1927 segment_base += segment_size_bits[left_child];
1928 target_prefix -= node_total_excess[left_child];
1929 node_index = right_child;
1930 } else {
1931 return npos;
1932 }
1933 }
1934
1935 const size_t segment_begin = segment_base;
1936 const size_t segment_end =
1937 std::min(segment_base + segment_size_bits[node_index], num_bits);
1938 int min_value;
1939 size_t position;
1940
1941 first_min_value_pos8(segment_begin,
1942 segment_end ? (segment_end - 1) : segment_begin,
1943 min_value, position);
1944 return (min_value == target_prefix ? position : npos);
1945 }
1946
1951 size_t descend_qth_min(size_t node_index,
1952 int target_prefix,
1953 size_t target_min_rank,
1954 size_t segment_base) const noexcept {
1955 while (node_index < first_leaf_index) {
1956 const size_t left_child = node_index << 1;
1957 const size_t right_child = left_child | 1;
1958 const int left_min = node_min_prefix_excess[left_child];
1959 const int right_min =
1960 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1961 if (left_min == target_prefix) {
1962 if (node_min_count[left_child] >= target_min_rank) {
1963 node_index = left_child;
1964 continue;
1965 }
1966 target_min_rank -= node_min_count[left_child];
1967 }
1968 if (right_min == target_prefix) {
1969 segment_base += segment_size_bits[left_child];
1970 target_prefix -= node_total_excess[left_child];
1971 node_index = right_child;
1972 continue;
1973 }
1974 return npos;
1975 }
1976 return qth_min_in_block(
1977 segment_base,
1978 std::min(segment_base + segment_size_bits[node_index], num_bits) - 1,
1979 target_min_rank);
1980 }
1981
1986 size_t select1_in_block(const size_t& block_begin,
1987 const size_t& block_end,
1988 size_t target_one_rank) const noexcept {
1989 size_t left_word_index = block_begin >> 6;
1990 const size_t right_word_index = (block_end >> 6);
1991 const size_t left_offset = block_begin & 63;
1992 const std::uint64_t left_mask =
1993 (left_offset ? (~std::uint64_t(0) << left_offset) : ~std::uint64_t(0));
1994 if (left_word_index == right_word_index) {
1995 const std::uint64_t word =
1996 bits[left_word_index] & left_mask &
1997 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
1998 : ~std::uint64_t(0));
1999 return block_begin + select_in_word(word, target_one_rank);
2000 }
2001 // prefix
2002 if (left_offset) {
2003 const std::uint64_t word = bits[left_word_index] & left_mask;
2004 const int count = std::popcount(word);
2005 if (target_one_rank <= (size_t)count) {
2006 return block_begin + select_in_word(word, target_one_rank);
2007 }
2008 target_one_rank -= count;
2009 left_word_index++;
2010 }
2011 // full words
2012 while (left_word_index < right_word_index) {
2013 const std::uint64_t word = bits[left_word_index];
2014 const int count = std::popcount(word);
2015 if (target_one_rank <= (size_t)count) {
2016 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2017 }
2018 target_one_rank -= count;
2019 ++left_word_index;
2020 }
2021 // tail
2022 const size_t right_offset = block_end & 63;
2023 if (right_offset) {
2024 const std::uint64_t word =
2025 bits[left_word_index] & ((std::uint64_t(1) << right_offset) - 1);
2026 const int count = std::popcount(word);
2027 if (target_one_rank <= (size_t)count) {
2028 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2029 }
2030 }
2031 return npos;
2032 }
2033
2038 size_t select0_in_block(const size_t& block_begin,
2039 const size_t& block_end,
2040 size_t target_zero_rank) const noexcept {
2041 if (block_end <= block_begin) {
2042 return npos;
2043 }
2044
2045 size_t left_word_index = block_begin >> 6;
2046 const size_t right_word_index = block_end >> 6;
2047 const size_t left_offset = block_begin & 63;
2048
2049 if (left_word_index == right_word_index) {
2050 const std::uint64_t left_mask =
2051 (left_offset ? (~std::uint64_t(0) << left_offset)
2052 : ~std::uint64_t(0));
2053 const std::uint64_t right_mask =
2054 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
2055 : ~std::uint64_t(0));
2056 const std::uint64_t word =
2057 (~bits[left_word_index]) & left_mask & right_mask;
2058 const int offset = select_in_word(word, target_zero_rank);
2059 return (offset >= 0) ? (block_begin + (size_t)offset) : npos;
2060 }
2061
2062 // prefix
2063 if (left_offset) {
2064 const std::uint64_t word =
2065 (~bits[left_word_index]) & (~std::uint64_t(0) << left_offset);
2066 const int count = std::popcount(word);
2067 if (target_zero_rank <= (size_t)count) {
2068 const int offset = select_in_word(word, target_zero_rank);
2069 return (offset >= 0) ? (block_begin + (size_t)offset) : npos;
2070 }
2071 target_zero_rank -= count;
2072 ++left_word_index;
2073 }
2074
2075 // full words
2076 while (left_word_index < right_word_index) {
2077 const std::uint64_t word = ~bits[left_word_index];
2078 const int count = std::popcount(word);
2079 if (target_zero_rank <= (size_t)count) {
2080 const int offset = select_in_word(word, target_zero_rank);
2081 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) : npos;
2082 }
2083 target_zero_rank -= count;
2084 ++left_word_index;
2085 }
2086
2087 // tail
2088 const size_t right_offset = block_end & 63;
2089 if (right_offset) {
2090 const std::uint64_t word =
2091 (~bits[left_word_index]) & ((std::uint64_t(1) << right_offset) - 1);
2092 const int count = std::popcount(word);
2093 if (target_zero_rank <= (size_t)count) {
2094 const int offset = select_in_word(word, target_zero_rank);
2095 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) : npos;
2096 }
2097 }
2098 return npos;
2099 }
2100
2105 static inline int select_in_word(std::uint64_t word,
2106 size_t target_rank) noexcept {
2107 while (word) {
2108 if (--target_rank == 0) {
2109 return std::countr_zero(word);
2110 }
2111 word &= (word - 1);
2112 }
2113 return -1;
2114 }
2115
2119 static inline size_t ceil_div(const size_t& numerator,
2120 const size_t& denominator) noexcept {
2121 return (numerator + denominator - 1) / denominator;
2122 }
2123
2129 static inline size_t nodeslots_for(const size_t& bit_count,
2130 const size_t& block_size_pow2) noexcept {
2131 if (bit_count == 0) {
2132 return 0;
2133 }
2134 size_t leaf_node_count = ceil_div(bit_count, block_size_pow2);
2135 return std::bit_ceil(std::max<size_t>(1, leaf_node_count)) +
2136 leaf_node_count;
2137 }
2138
2142 static inline float overhead_for(const size_t& bit_count,
2143 const size_t& block_size_pow2) noexcept {
2144 static constexpr size_t AUX_SLOT_BYTES =
2145 sizeof(uint32_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(int32_t) +
2146 sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint8_t);
2147
2148 size_t bitvector_bytes = ceil_div(bit_count, 64) * 8;
2149 if (bitvector_bytes == 0) {
2150 return 0;
2151 }
2152 size_t slot_count = nodeslots_for(bit_count, block_size_pow2);
2153 size_t aux_bytes = slot_count * AUX_SLOT_BYTES;
2154 return ((float)aux_bytes) / ((float)bitvector_bytes);
2155 }
2156
2163 static inline size_t choose_block_bits_for_overhead(
2164 const size_t& bit_count,
2165 const float& overhead_cap) noexcept {
2166 if (overhead_cap < 0.f) {
2167 return 64;
2168 }
2169
2170 const size_t max_block_bits = std::min<size_t>(bit_count, 16384);
2171 size_t candidate_block_bits = 64;
2172 while (candidate_block_bits < max_block_bits) {
2173 if (overhead_for(bit_count, candidate_block_bits) <= overhead_cap) {
2174 break;
2175 }
2176 candidate_block_bits <<= 1;
2177 }
2178 return candidate_block_bits;
2179 }
2180
2186 void build_from_string(const std::string& bit_string_input,
2187 const size_t& leaf_block_bits = 0,
2188 const float& max_overhead = -1.0) {
2189 num_bits = bit_string_input.size();
2190 bits.assign((num_bits + 63) / 64, 0);
2191 for (size_t i = 0; i < num_bits; ++i) {
2192 if (bit_string_input[i] == '1') {
2193 set1(i);
2194 }
2195 }
2196 build(leaf_block_bits, max_overhead);
2197 }
2198
2206 void build_from_words(const std::vector<std::uint64_t>& words,
2207 const size_t& bit_count,
2208 const size_t& leaf_block_bits = 0,
2209 const float& max_overhead = -1.0) {
2210 bits = words;
2211 num_bits = bit_count;
2212 if (bits.size() * 64 < num_bits) {
2213 bits.resize((num_bits + 63) / 64);
2214 }
2215 build(leaf_block_bits, max_overhead);
2216 }
2217
2221 inline int bit(const size_t& position) const noexcept {
2222 return (bits[position >> 6] >> (position & 63)) & 1u;
2223 }
2224
2228 inline void set1(const size_t& position) noexcept {
2229 bits[position >> 6] |= (std::uint64_t(1) << (position & 63));
2230 }
2231
2236 inline uint32_t ones_in_node(const size_t& node_index) const noexcept {
2237 return ((int64_t)segment_size_bits[node_index] +
2238 (int64_t)node_total_excess[node_index]) >>
2239 1;
2240 }
2241
2249 inline void scan_range_min_count8(size_t range_begin,
2250 const size_t& range_end,
2251 int& current_excess,
2252 int& min_value,
2253 uint32_t& count) const noexcept {
2254 current_excess = 0;
2255 min_value = INT_MAX;
2256 count = 0;
2257 if (range_end < range_begin) {
2258 min_value = 0;
2259 return;
2260 }
2261 // to byte alignment
2262 while (range_begin <= range_end && (range_begin & 7)) {
2263 current_excess += bit(range_begin) ? +1 : -1;
2264 if (current_excess < min_value) {
2265 min_value = current_excess;
2266 count = 1;
2267 } else if (current_excess == min_value) {
2268 ++count;
2269 }
2270 ++range_begin;
2271 }
2272 // full bytes
2273 const auto& aggregates_table = LUT8();
2274 while (range_begin + 7 <= range_end) {
2275 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2276 const int candidate = current_excess + byte_aggregate.min_prefix;
2277 if (candidate < min_value) {
2278 min_value = candidate;
2279 count = byte_aggregate.min_count;
2280 } else if (candidate == min_value) {
2281 count += byte_aggregate.min_count;
2282 }
2283 current_excess += byte_aggregate.excess_total;
2284 range_begin += 8;
2285 }
2286 // tail
2287 while (range_begin <= range_end) {
2288 current_excess += bit(range_begin) ? +1 : -1;
2289 if (current_excess < min_value) {
2290 min_value = current_excess;
2291 count = 1;
2292 } else if (current_excess == min_value) {
2293 ++count;
2294 }
2295 ++range_begin;
2296 }
2297 if (min_value == INT_MAX) {
2298 min_value = count = 0;
2299 }
2300 }
2301
2308 inline size_t cover_blocks_collect(const size_t& block_begin_index,
2309 const size_t& block_end_index,
2310 size_t (&out_nodes)[64]) const noexcept {
2311 if (leaf_count == 0 || block_begin_index > block_end_index) {
2312 return 0;
2313 }
2314 size_t left_index = first_leaf_index + block_begin_index;
2315 size_t right_index = first_leaf_index + block_end_index;
2316 size_t left_nodes[32];
2317 size_t right_nodes[32];
2318 size_t left_count = 0, right_count = 0;
2319 while (left_index <= right_index) {
2320 if (left_index & 1) {
2321 left_nodes[left_count++] = left_index++;
2322 }
2323 if ((right_index & 1) == 0) {
2324 right_nodes[right_count++] = right_index--;
2325 }
2326 left_index >>= 1;
2327 right_index >>= 1;
2328 }
2329 size_t out_count = 0;
2330 for (size_t i = 0; i < left_count; ++i) {
2331 out_nodes[out_count++] = left_nodes[i];
2332 }
2333 while (right_count > 0) {
2334 out_nodes[out_count++] = right_nodes[--right_count];
2335 }
2336 return out_count;
2337 }
2338
2345 inline size_t qth_min_in_block(const size_t& range_begin,
2346 const size_t& range_end,
2347 size_t target_min_rank) const noexcept {
2348 if (range_end < range_begin || target_min_rank == 0) {
2349 return npos;
2350 }
2351
2352 const auto& aggregates_table = LUT8();
2353
2354 int current_excess = 0, min_value = INT_MAX;
2355 size_t position = range_begin;
2356
2357 while (position <= range_end && (position & 7)) {
2358 current_excess += bit(position) ? +1 : -1;
2359 if (current_excess < min_value) {
2360 min_value = current_excess;
2361 }
2362 ++position;
2363 }
2364 while (position + 7 <= range_end) {
2365 const auto& byte_aggregate = aggregates_table[get_byte(position)];
2366 min_value =
2367 std::min(min_value, current_excess + byte_aggregate.min_prefix);
2368 current_excess += byte_aggregate.excess_total;
2369 position += 8;
2370 }
2371 while (position <= range_end) {
2372 current_excess += bit(position) ? +1 : -1;
2373 if (current_excess < min_value) {
2374 min_value = current_excess;
2375 }
2376 ++position;
2377 }
2378
2379 current_excess = 0;
2380 position = range_begin;
2381
2382 // to byte alignment
2383 while (position <= range_end && (position & 7)) {
2384 current_excess += bit(position) ? +1 : -1;
2385 if (current_excess == min_value) {
2386 if (--target_min_rank == 0) {
2387 return position;
2388 }
2389 }
2390 ++position;
2391 }
2392
2393 // full bytes
2394 while (position + 7 <= range_end) {
2395 const uint8_t byte_value = get_byte(position);
2396 const auto& byte_aggregate = aggregates_table[byte_value];
2397 const int candidate = current_excess + byte_aggregate.min_prefix;
2398 if (candidate == min_value) {
2399 int prefix_sum = 0;
2400 for (int k = 0; k < 8; ++k) {
2401 prefix_sum += ((byte_value >> k) & 1u) ? +1 : -1;
2402 if (prefix_sum == byte_aggregate.min_prefix) {
2403 if (--target_min_rank == 0) {
2404 return position + k;
2405 }
2406 }
2407 }
2408 }
2409 current_excess += byte_aggregate.excess_total;
2410 position += 8;
2411 }
2412
2413 // tail
2414 while (position <= range_end) {
2415 current_excess += bit(position) ? +1 : -1;
2416 if (current_excess == min_value) {
2417 if (--target_min_rank == 0) {
2418 return position;
2419 }
2420 }
2421 ++position;
2422 }
2423
2424 return npos;
2425 }
2426
2437 inline size_t leaf_fwd_bp_simd(const size_t& leaf_index,
2438 const size_t& leaf_block_begin,
2439 const size_t& start_position,
2440 const int& delta,
2441 int& leaf_delta) const noexcept {
2442 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2443 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2444 if (start_position >= leaf_end) {
2445 leaf_delta = 0;
2446 return npos;
2447 }
2448#if defined(PIXIE_AVX2_SUPPORT)
2449 int total = 0;
2450 // Heuristic: for tiny tails and tiny deltas, LUT8 wins because AVX2 setup
2451 // is heavy...
2452 // At least I think so...
2453 const size_t tail_len = leaf_end - start_position;
2454 size_t res = npos;
2455 if (delta >= -8 && delta <= 8 && tail_len <= 256) {
2456 res = scan_leaf_fwd_lut8_fast(start_position, leaf_end, delta, &total);
2457 } else {
2458 res = scan_leaf_fwd_simd(start_position, leaf_end, delta, &total);
2459 }
2460 if (res != npos) {
2461 return res;
2462 }
2463 leaf_delta = total;
2464 return npos;
2465#else
2466 const size_t res = scan_leaf_fwd(start_position, leaf_end, delta);
2467 if (res != npos) {
2468 return res;
2469 }
2470 const int len = int(leaf_end - start_position);
2471 const int ones = int(rank1_in_block(start_position, leaf_end));
2472 leaf_delta = ones * 2 - len;
2473 return npos;
2474#endif
2475 }
2476
2485 inline size_t leaf_bwd_bp_simd(const size_t& leaf_index,
2486 const size_t& leaf_block_begin,
2487 const size_t& start_position,
2488 const int& delta,
2489 int& leaf_delta) const noexcept {
2490 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2491 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2492 if (start_position < leaf_block_begin || start_position > leaf_end) {
2493 leaf_delta = 0;
2494 return npos;
2495 }
2496
2497 // leaf_delta = excess(start_position) - excess(leaf_block_begin)
2498 // = 2*rank1([leaf_begin, start)) - (start - leaf_begin)
2499 const int len = int(start_position - leaf_block_begin);
2500 const int ones = int(rank1_in_block(leaf_block_begin, start_position));
2501 leaf_delta = ones * 2 - len;
2502
2503 // Must find a boundary strictly < start_position (so
2504 // boundary==start_position forbidden).
2505 if (start_position == leaf_block_begin) {
2506 return npos;
2507 }
2508 const int target_prefix = leaf_delta + delta;
2509 return scan_leaf_bwd(
2510 leaf_block_begin,
2511 start_position, // do not look to the right of start_position
2512 target_prefix,
2513 false, // right boundary (=start_position) forbidden
2514 start_position,
2515 // prefix at boundary_max = start_position-1:
2516 // leaf_delta is prefix at start_position, subtract last step
2517 (start_position > leaf_block_begin
2518 ? (leaf_delta - (bit(start_position - 1) ? +1 : -1))
2519 : 0));
2520 }
2521
2528 inline void first_min_value_pos8(size_t range_begin,
2529 const size_t& range_end,
2530 int& min_value_out,
2531 size_t& first_position) const noexcept {
2532 const auto& aggregates_table = LUT8();
2533 int current_excess = 0;
2534 int min_value = INT_MAX;
2535 first_position = npos;
2536
2537 // to byte allignment
2538 while (range_begin <= range_end && (range_begin & 7)) {
2539 current_excess += bit(range_begin) ? +1 : -1;
2540 if (current_excess < min_value) {
2541 min_value = current_excess;
2542 first_position = range_begin;
2543 }
2544 ++range_begin;
2545 }
2546
2547 // full bytes
2548 while (range_begin + 7 <= range_end) {
2549 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2550 const int candidate = current_excess + byte_aggregate.min_prefix;
2551 if (candidate < min_value) {
2552 min_value = candidate;
2553 first_position = range_begin + byte_aggregate.pos_first_min;
2554 }
2555 current_excess += byte_aggregate.excess_total;
2556 range_begin += 8;
2557 }
2558
2559 // tail
2560 while (range_begin <= range_end) {
2561 current_excess += bit(range_begin) ? +1 : -1;
2562 if (current_excess < min_value) {
2563 min_value = current_excess;
2564 first_position = range_begin;
2565 }
2566 ++range_begin;
2567 }
2568
2569 min_value_out = (min_value == INT_MAX ? 0 : min_value);
2570 }
2571
2578 inline void first_max_value_pos8(size_t range_begin,
2579 const size_t& range_end,
2580 int& max_value_out,
2581 size_t& first_position) const noexcept {
2582 const auto& aggregates_table = LUT8();
2583 int current_excess = 0;
2584 int max_value = INT_MIN;
2585 first_position = npos;
2586
2587 while (range_begin <= range_end && (range_begin & 7)) {
2588 current_excess += bit(range_begin) ? +1 : -1;
2589 if (current_excess > max_value) {
2590 max_value = current_excess;
2591 first_position = range_begin;
2592 }
2593 ++range_begin;
2594 }
2595
2596 while (range_begin + 7 <= range_end) {
2597 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2598 const int candidate = current_excess + byte_aggregate.max_prefix;
2599 if (candidate > max_value) {
2600 max_value = candidate;
2601 first_position = range_begin + byte_aggregate.pos_first_max;
2602 }
2603 current_excess += byte_aggregate.excess_total;
2604 range_begin += 8;
2605 }
2606
2607 while (range_begin <= range_end) {
2608 current_excess += bit(range_begin) ? +1 : -1;
2609 if (current_excess > max_value) {
2610 max_value = current_excess;
2611 first_position = range_begin;
2612 }
2613 ++range_begin;
2614 }
2615
2616 max_value_out = (max_value == INT_MIN ? 0 : max_value);
2617 }
2618
2625 void build(const size_t& leaf_block_bits, const float& max_overhead) {
2626 // the lower clamp depends on the desired overhead fraction; otherwise use
2627 // 64
2628 const size_t clamp_by_overhead =
2629 (max_overhead >= 0.0
2630 ? choose_block_bits_for_overhead(num_bits, max_overhead)
2631 : size_t(64));
2632
2633 // chosen block_bits: honor an explicit request, but not below
2634 // clamp_by_overhead
2635 if (leaf_block_bits == 0) {
2636 block_bits =
2637 std::max(clamp_by_overhead,
2638 std::bit_ceil<size_t>(
2639 (num_bits <= 1) ? 1 : std::bit_width(num_bits - 1)));
2640 } else {
2641 block_bits =
2642 std::max(clamp_by_overhead,
2643 std::bit_ceil(std::max<size_t>(1, leaf_block_bits)));
2644 }
2645
2646#ifdef DEBUG
2647 // finalizes the achieved overhead percentage
2648 built_overhead = overhead_for(num_bits, block_bits);
2649#endif
2650
2651 leaf_count = ceil_div(num_bits, block_bits);
2652 first_leaf_index = std::bit_ceil(std::max<size_t>(1, leaf_count));
2653 const size_t tree_size = first_leaf_index + leaf_count - 1;
2654 segment_size_bits.assign(tree_size + 1, 0);
2655 node_total_excess.assign(tree_size + 1, 0);
2656 node_min_prefix_excess.assign(tree_size + 1, 0);
2657 node_max_prefix_excess.assign(tree_size + 1, 0);
2658 node_min_count.assign(tree_size + 1, 0);
2659 node_pattern10_count.assign(tree_size + 1, 0);
2660 node_first_bit.assign(tree_size + 1, 0);
2661 node_last_bit.assign(tree_size + 1, 0);
2662
2663 // leaves
2664 for (size_t leaf_block_index = 0; leaf_block_index < leaf_count;
2665 ++leaf_block_index) {
2666 const size_t leaf_node_index = first_leaf_index + leaf_block_index;
2667 const size_t segment_begin = leaf_block_index * block_bits;
2668 const size_t segment_end = std::min(num_bits, segment_begin + block_bits);
2669 segment_size_bits[leaf_node_index] = segment_end - segment_begin;
2670
2671 if (segment_begin < segment_end) {
2672 node_first_bit[leaf_node_index] = bit(segment_begin);
2673 }
2674
2675 const auto& aggregates_table = LUT8();
2676
2677 int current_excess = 0, min_value = INT_MAX, max_value = INT_MIN;
2678 uint32_t min_count = 0;
2679 uint32_t pattern10_count = 0;
2680
2681 uint8_t previous_bit = 0;
2682
2683 size_t position = segment_begin;
2684
2685 // Full bytes
2686 while (position + 8 <= segment_end) {
2687 const uint8_t byte_value = get_byte(position);
2688 const auto& byte_aggregate = aggregates_table[byte_value];
2689
2690 // internal "10" inside the byte
2691 pattern10_count += byte_aggregate.pattern10_count;
2692 // stitching across the boundary between the previous and current byte
2693 // (within the segment)
2694 if (previous_bit == 1 && byte_aggregate.first_bit == 0) {
2695 pattern10_count++;
2696 }
2697
2698 // prefix min/max accounting for the current offset
2699 const int candidate_min = current_excess + byte_aggregate.min_prefix;
2700 if (candidate_min < min_value) {
2701 min_value = candidate_min;
2702 min_count = byte_aggregate.min_count;
2703 } else if (candidate_min == min_value) {
2704 min_count += byte_aggregate.min_count;
2705 }
2706
2707 max_value =
2708 std::max(max_value, current_excess + byte_aggregate.max_prefix);
2709 current_excess += byte_aggregate.excess_total;
2710 previous_bit = byte_aggregate.last_bit;
2711 position += 8;
2712 }
2713
2714 // Tail < 8 bits
2715 while (position < segment_end) {
2716 const uint8_t bit_value = bit(position);
2717 if (previous_bit == 1 && bit_value == 0) {
2718 pattern10_count++;
2719 }
2720 const int step = bit_value ? +1 : -1;
2721 current_excess += step;
2722 if (current_excess < min_value) {
2723 min_value = current_excess;
2724 min_count = 1;
2725 } else if (current_excess == min_value) {
2726 ++min_count;
2727 }
2728 if (current_excess > max_value) {
2729 max_value = current_excess;
2730 }
2731
2732 previous_bit = bit_value;
2733 ++position;
2734 }
2735
2736 if (segment_begin < segment_end) {
2737 node_last_bit[leaf_node_index] = previous_bit;
2738 }
2739
2740 node_total_excess[leaf_node_index] = current_excess;
2741 node_min_prefix_excess[leaf_node_index] =
2742 (segment_size_bits[leaf_node_index] == 0 ? 0 : min_value);
2743 node_max_prefix_excess[leaf_node_index] =
2744 (segment_size_bits[leaf_node_index] == 0 ? 0 : max_value);
2745 node_min_count[leaf_node_index] = min_count;
2746 node_pattern10_count[leaf_node_index] = (uint32_t)pattern10_count;
2747 }
2748 // internal nodes
2749 for (size_t node_index = first_leaf_index - 1; node_index >= 1;
2750 --node_index) {
2751 const size_t left_child = node_index << 1;
2752 const size_t right_child = left_child | 1;
2753 const bool has_left =
2754 (left_child <= tree_size) && segment_size_bits[left_child];
2755 const bool has_right =
2756 (right_child <= tree_size) && segment_size_bits[right_child];
2757 if (!has_left && !has_right) {
2758 segment_size_bits[node_index] = 0;
2759 continue;
2760 }
2761 if (has_left && !has_right) {
2762 segment_size_bits[node_index] = segment_size_bits[left_child];
2763 node_total_excess[node_index] = node_total_excess[left_child];
2764 node_min_prefix_excess[node_index] = node_min_prefix_excess[left_child];
2765 node_max_prefix_excess[node_index] = node_max_prefix_excess[left_child];
2766 node_min_count[node_index] = node_min_count[left_child];
2767 node_pattern10_count[node_index] = node_pattern10_count[left_child];
2768 node_first_bit[node_index] = node_first_bit[left_child];
2769 node_last_bit[node_index] = node_last_bit[left_child];
2770 } else if (!has_left && has_right) {
2771 segment_size_bits[node_index] = segment_size_bits[right_child];
2772 node_total_excess[node_index] = node_total_excess[right_child];
2773 node_min_prefix_excess[node_index] =
2774 node_min_prefix_excess[right_child];
2775 node_max_prefix_excess[node_index] =
2776 node_max_prefix_excess[right_child];
2777 node_min_count[node_index] = node_min_count[right_child];
2778 node_pattern10_count[node_index] = node_pattern10_count[right_child];
2779 node_first_bit[node_index] = node_first_bit[right_child];
2780 node_last_bit[node_index] = node_last_bit[right_child];
2781 } else {
2782 segment_size_bits[node_index] =
2783 segment_size_bits[left_child] + segment_size_bits[right_child];
2784 node_total_excess[node_index] =
2785 node_total_excess[left_child] + node_total_excess[right_child];
2786 const int right_min_candidate =
2787 node_total_excess[left_child] + node_min_prefix_excess[right_child];
2788 const int right_max_candidate =
2789 node_total_excess[left_child] + node_max_prefix_excess[right_child];
2790 node_min_prefix_excess[node_index] =
2791 std::min(node_min_prefix_excess[left_child], right_min_candidate);
2792 node_max_prefix_excess[node_index] =
2793 std::max(node_max_prefix_excess[left_child], right_max_candidate);
2794 node_min_count[node_index] =
2795 (node_min_prefix_excess[left_child] ==
2796 node_min_prefix_excess[node_index]
2797 ? node_min_count[left_child]
2798 : 0) +
2799 (right_min_candidate == node_min_prefix_excess[node_index]
2800 ? node_min_count[right_child]
2801 : 0);
2802 node_pattern10_count[node_index] = node_pattern10_count[left_child] +
2803 node_pattern10_count[right_child] +
2804 ((node_last_bit[left_child] == 1 &&
2805 node_first_bit[right_child] == 0)
2806 ? 1u
2807 : 0u);
2808 node_first_bit[node_index] = node_first_bit[left_child];
2809 node_last_bit[node_index] = node_last_bit[right_child];
2810 }
2811 if (node_index == 1) {
2812 break;
2813 }
2814 }
2815 }
2816};
2817
2818} // namespace pixie
int range_max_query_val(const size_t &range_begin, const size_t &range_end) const
Value of the maximum prefix excess on [range_begin, range_end] relative to range_begin.
Definition rmm_tree.h:678
int range_min_query_val(const size_t &range_begin, const size_t &range_end) const
Value of the minimum prefix excess on [range_begin, range_end] relative to range_begin.
Definition rmm_tree.h:560
RmMTree(const std::string &bit_string, const size_t &leaf_block_bits=0, const float &max_overhead=-1.0)
Build from a '0'/'1' string.
Definition rmm_tree.h:104
size_t rank0(const size_t &end_position) const
Number of zeros in prefix [0, end_position).
Definition rmm_tree.h:158
size_t select0(size_t target_zero_rank) const
1-based select of the target_zero_rank-th zero.
Definition rmm_tree.h:197
size_t fwdsearch(const size_t &start_position, const int &delta) const
Forward search: first position p ≥ start_position where excess(p) = excess(start_position) + delta.
Definition rmm_tree.h:330
int excess(const size_t &end_position) const
Prefix excess on [0, end_position): +1 for '1', −1 for '0'.
Definition rmm_tree.h:320
size_t bwdsearch(const size_t &start_position, const int &delta) const
Backward search: last position p ≤ start_position where excess(p) = excess(start_position) + delta.
Definition rmm_tree.h:394
size_t minselect(const size_t &range_begin, const size_t &range_end, size_t target_min_rank) const
Position of the target_min_rank-th (1-based) occurrence of the minimum on [range_begin,...
Definition rmm_tree.h:777
RmMTree()=default
Construct empty structure.
RmMTree(const std::vector< std::uint64_t > &words, size_t bit_count, const size_t &leaf_block_bits=0, const float &max_overhead=-1.0)
Build from 64-bit words (LSB-first).
Definition rmm_tree.h:120
size_t close(const size_t &open_position) const
close(open_position): matching ')' for '(' at open_position.
Definition rmm_tree.h:936
size_t mincount(const size_t &range_begin, const size_t &range_end) const
How many times the minimum prefix excess occurs on [range_begin, range_end].
Definition rmm_tree.h:694
size_t select1(size_t target_one_rank) const
1-based select of the target_one_rank-th one.
Definition rmm_tree.h:166
size_t rank10(const size_t &end_position) const
Rank of the pattern "10" (starts) within [0, end_position).
Definition rmm_tree.h:232
size_t select10(size_t target_pattern_rank) const
1-based select of the target_pattern_rank-th "10" start.
Definition rmm_tree.h:265
size_t rank1(const size_t &end_position) const
Number of ones in prefix [0, end_position).
Definition rmm_tree.h:133
size_t range_max_query_pos(const size_t &range_begin, const size_t &range_end) const
Position of the first maximum of excess on [range_begin, range_end] (inclusive).
Definition rmm_tree.h:578
size_t enclose(const size_t &position) const
enclose(position): opening '(' that strictly encloses position.
Definition rmm_tree.h:961
static constexpr size_t npos
Sentinel for "not found".
Definition rmm_tree.h:82
size_t range_min_query_pos(const size_t &range_begin, const size_t &range_end) const
Position of the first minimum of excess on [range_begin, range_end] (inclusive).
Definition rmm_tree.h:458
size_t open(const size_t &close_position) const
open(close_position): matching '(' for ')' at close_position.
Definition rmm_tree.h:947