40 std::span<const std::uint64_t> bits;
44 size_t block_bits = 64;
45 size_t leaf_count = 0;
52 std::vector<uint32_t> segment_size_bits;
58 std::vector<int32_t> node_total_excess;
63 std::vector<int32_t> node_min_prefix_excess;
67 std::vector<int32_t> node_max_prefix_excess;
71 std::vector<uint32_t> node_min_count;
75 std::vector<uint32_t> node_pattern10_count;
80 std::vector<uint8_t> node_first_bit, node_last_bit;
86 static constexpr size_t npos = std::numeric_limits<size_t>::max();
89 float built_overhead = 0.0;
112 explicit RmMTree(std::span<const std::uint64_t> words,
114 const size_t& leaf_block_bits = 0,
115 const float& max_overhead = -1.0) {
116 build_from_words(words, bit_count, leaf_block_bits, max_overhead);
119 size_t size_impl()
const {
return num_bits; }
128 if (end_position == 0) {
131 const size_t block_index = block_of(end_position - 1);
132 size_t ones_count = 0;
133 if (block_index > 0) {
134 size_t nodes_buffer[64];
135 const size_t node_count =
136 cover_blocks_collect(0, block_index - 1, nodes_buffer);
137 for (
size_t j = 0; j < node_count; ++j) {
138 ones_count += ones_in_node(nodes_buffer[j]);
141 const size_t block_begin = block_index * block_bits;
142 const size_t block_end = std::min(num_bits, block_begin + block_bits);
144 rank1_in_block(block_begin, std::min(end_position, block_end));
153 return end_position -
rank1_impl(end_position);
161 if (target_one_rank == 0 || num_bits == 0) {
164 size_t node_index = 1;
165 if (ones_in_node(node_index) < target_one_rank) {
168 size_t segment_base = 0;
169 while (node_index < first_leaf_index) {
170 const size_t left_child = node_index << 1;
171 const size_t right_child = left_child | 1;
172 const uint32_t ones_in_left_child = ones_in_node(left_child);
173 if (ones_in_left_child >= target_one_rank) {
174 node_index = left_child;
176 target_one_rank -= ones_in_left_child;
177 segment_base += segment_size_bits[left_child];
178 node_index = right_child;
181 return select1_in_block(
183 std::min(segment_base + segment_size_bits[node_index], num_bits),
192 if (target_zero_rank == 0 || num_bits == 0) {
195 size_t node_index = 1;
196 const auto zeros_in_node = [&](
const size_t& node)
noexcept {
197 return segment_size_bits[node] - ones_in_node(node);
199 if (zeros_in_node(node_index) < target_zero_rank) {
202 size_t segment_base = 0;
203 while (node_index < first_leaf_index) {
204 const size_t left_child = node_index << 1;
205 const size_t right_child = left_child | 1;
206 const size_t zeros_in_left_child = zeros_in_node(left_child);
207 if (zeros_in_left_child >= target_zero_rank) {
208 node_index = left_child;
210 target_zero_rank -= zeros_in_left_child;
211 segment_base += segment_size_bits[left_child];
212 node_index = right_child;
215 return select0_in_block(
217 std::min(segment_base + segment_size_bits[node_index], num_bits),
227 if (end_position <= 1) {
230 const size_t block_index = block_of(end_position - 1);
231 size_t pattern_count = 0;
232 int previous_last_bit = -1;
234 if (block_index > 0) {
235 const auto covered_nodes = cover_blocks(0, block_index - 1);
236 for (
const size_t& node_index : covered_nodes) {
237 pattern_count += node_pattern10_count[node_index];
238 if (previous_last_bit != -1 && previous_last_bit == 1 &&
239 node_first_bit[node_index] == 0) {
242 previous_last_bit = node_last_bit[node_index];
245 const size_t block_begin = block_index * block_bits;
246 pattern_count += rr_in_block(block_begin, end_position);
248 if (block_index > 0 && end_position > block_begin &&
249 previous_last_bit == 1 &&
bit(block_begin) == 0) {
252 return pattern_count;
260 if (target_pattern_rank == 0 || num_bits == 0) {
263 size_t node_index = 1;
264 if (node_pattern10_count[node_index] < target_pattern_rank) {
267 const size_t tree_size = segment_size_bits.size() - 1;
268 size_t segment_base = 0;
269 while (node_index < first_leaf_index) {
270 const size_t left_child = node_index << 1;
271 const size_t left_segment_size =
272 (left_child <= tree_size) ? segment_size_bits[left_child] : 0;
273 if (left_segment_size == 0) {
277 const size_t left_count = node_pattern10_count[left_child];
278 if (left_count >= target_pattern_rank) {
279 node_index = left_child;
283 size_t remaining_rank = target_pattern_rank - left_count;
284 const size_t right_child = left_child | 1;
285 const bool has_right =
286 (right_child <= tree_size) && (segment_size_bits[right_child] != 0);
291 const size_t crossing_pattern =
292 (node_last_bit[left_child] == 1 && node_first_bit[right_child] == 0)
295 if (crossing_pattern) {
296 if (remaining_rank == 1) {
297 return segment_base + left_segment_size - 1;
301 segment_base += left_segment_size;
302 node_index = right_child;
303 target_pattern_rank = remaining_rank;
305 return select10_in_block(
307 std::min(segment_base + segment_size_bits[node_index], num_bits),
308 target_pattern_rank);
315 return int64_t(
rank1_impl(end_position)) * 2 - int64_t(end_position);
325 if (start_position >= num_bits) {
330 const size_t leaf_block_index = block_of(start_position);
331 const size_t block_begin = leaf_block_index * block_bits;
332 const size_t block_end = std::min(num_bits, block_begin + block_bits);
334 const size_t leaf_result = leaf_fwd_bp_simd(
335 leaf_block_index, block_begin, start_position, delta, leaf_delta);
336 if (leaf_result !=
npos) {
340 int remaining_delta = delta - leaf_delta;
341 size_t segment_base = block_end;
342 if (remaining_delta == 0) {
349 size_t node_index = leaf_index_of(block_begin);
350 const size_t tree_size = segment_size_bits.size() - 1;
354 if (segment_base >= num_bits || leaf_block_index + 1 >= leaf_count) {
358 while (node_index > 1) {
359 const bool is_left_child = ((node_index & 1u) == 0u);
361 const size_t sibling = node_index | 1u;
362 if (sibling <= tree_size && segment_size_bits[sibling]) {
365 if (node_min_prefix_excess[sibling] <= remaining_delta &&
366 remaining_delta <= node_max_prefix_excess[sibling]) {
367 return descend_fwd(sibling, remaining_delta, segment_base);
370 remaining_delta -= node_total_excess[sibling];
371 segment_base += segment_size_bits[sibling];
372 if (remaining_delta == 0) {
389 if (start_position > num_bits || start_position == 0) {
394 const size_t leaf_block_index = block_of(start_position - 1);
395 const size_t block_begin = leaf_block_index * block_bits;
398 const size_t leaf_result = leaf_bwd_bp_simd(
399 leaf_block_index, block_begin, start_position, delta, leaf_delta);
400 if (leaf_result !=
npos) {
406 int remaining_delta = leaf_delta + delta;
407 size_t node_index = leaf_index_of(block_begin);
408 size_t segment_base = block_begin;
409 while (node_index > 1) {
410 if (node_index & 1) {
411 const size_t sibling_index = node_index ^ 1;
412 const size_t sibling_border =
414 const int needed_inside_sibling =
416 node_total_excess[sibling_index];
418 const bool allow_right_border =
419 (sibling_border != start_position);
422 if (needed_inside_sibling == 0 ||
423 (node_min_prefix_excess[sibling_index] <= needed_inside_sibling &&
424 needed_inside_sibling <= node_max_prefix_excess[sibling_index])) {
425 const size_t result = descend_bwd(
426 sibling_index, sibling_border - segment_size_bits[sibling_index],
427 needed_inside_sibling, sibling_border, allow_right_border);
428 if (result !=
npos) {
433 if (needed_inside_sibling == node_total_excess[sibling_index] &&
434 sibling_border < start_position) {
435 return sibling_border;
439 remaining_delta += node_total_excess[sibling_index];
440 segment_base -= segment_size_bits[sibling_index];
454 const size_t& range_end)
const {
455 if (range_begin > range_end || range_end >= num_bits) {
459 const size_t begin_block_index = block_of(range_begin);
460 const size_t begin_block_start = begin_block_index * block_bits;
461 const size_t begin_block_end =
462 std::min(num_bits, begin_block_start + block_bits);
463 const size_t end_block_index = block_of(range_end);
464 const size_t end_block_start = end_block_index * block_bits;
466 int best_value = INT_MAX;
467 size_t best_position =
npos;
468 size_t chosen_node_index = 0;
469 int prefix_excess = 0, prefix_at_choice = 0;
472 int min_prefix_first_chunk = INT_MAX;
473 size_t first_chunk_position =
npos;
474 const size_t end_of_first_chunk = std::min(
475 range_end, (
size_t)(begin_block_end ? begin_block_end - 1 : 0));
476 if (range_begin <= end_of_first_chunk) {
477 first_min_value_pos8(range_begin, end_of_first_chunk,
478 min_prefix_first_chunk, first_chunk_position);
480 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
481 int64_t(end_of_first_chunk + 1 - range_begin);
482 best_value = min_prefix_first_chunk;
483 best_position = first_chunk_position;
484 chosen_node_index = 0;
488 if (begin_block_index + 1 <= end_block_index - 1) {
489 size_t left_index = first_leaf_index + (begin_block_index + 1);
490 size_t right_index = first_leaf_index + (end_block_index - 1);
491 size_t right_nodes[64];
492 int right_nodes_count = 0;
494 while (left_index <= right_index) {
495 if (left_index & 1) {
496 const size_t node_index = left_index++;
497 const int candidate =
498 prefix_excess + node_min_prefix_excess[node_index];
499 if (candidate < best_value) {
500 best_value = candidate;
501 best_position =
npos;
502 chosen_node_index = node_index;
503 prefix_at_choice = prefix_excess;
505 prefix_excess += node_total_excess[node_index];
507 if ((right_index & 1) == 0) {
508 right_nodes[right_nodes_count++] = right_index--;
513 while (right_nodes_count--) {
514 const size_t node_index = right_nodes[right_nodes_count];
515 const int candidate =
516 prefix_excess + node_min_prefix_excess[node_index];
517 if (candidate < best_value) {
518 best_value = candidate;
519 best_position =
npos;
520 chosen_node_index = node_index;
521 prefix_at_choice = prefix_excess;
523 prefix_excess += node_total_excess[node_index];
528 if (end_block_index != begin_block_index) {
529 int min_prefix_last_chunk;
530 size_t last_chunk_position;
531 first_min_value_pos8(end_block_start, range_end, min_prefix_last_chunk,
532 last_chunk_position);
533 const int candidate = prefix_excess + min_prefix_last_chunk;
534 if (candidate < best_value) {
535 best_value = candidate;
536 best_position = last_chunk_position;
537 chosen_node_index = 0;
541 if (best_position !=
npos) {
542 return best_position;
545 return descend_first_min(chosen_node_index, best_value - prefix_at_choice,
546 node_base(chosen_node_index));
556 const size_t& range_end)
const {
557 if (range_begin > range_end || range_end >= num_bits) {
561 if (min_position ==
npos) {
574 const size_t& range_end)
const {
575 if (range_begin > range_end || range_end >= num_bits) {
579 const size_t begin_block_index = block_of(range_begin);
580 const size_t begin_block_start = begin_block_index * block_bits;
581 const size_t begin_block_end =
582 std::min(num_bits, begin_block_start + block_bits);
583 const size_t end_block_index = block_of(range_end);
584 const size_t end_block_start = end_block_index * block_bits;
586 int best_value = INT_MIN;
587 size_t best_position =
npos;
588 size_t chosen_node_index = 0;
589 int prefix_excess = 0, prefix_at_choice = 0;
592 int max_prefix_first_chunk = INT_MIN;
593 size_t first_chunk_position =
npos;
594 const size_t end_of_first_chunk = std::min(
595 range_end, (
size_t)(begin_block_end ? begin_block_end - 1 : 0));
596 if (range_begin <= end_of_first_chunk) {
597 first_max_value_pos8(range_begin, end_of_first_chunk,
598 max_prefix_first_chunk, first_chunk_position);
600 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
601 int64_t(end_of_first_chunk + 1 - range_begin);
602 best_value = max_prefix_first_chunk;
603 best_position = first_chunk_position;
604 chosen_node_index = 0;
608 if (begin_block_index + 1 <= end_block_index - 1) {
609 size_t left_index = first_leaf_index + (begin_block_index + 1);
610 size_t right_index = first_leaf_index + (end_block_index - 1);
611 size_t right_nodes[64];
612 int right_nodes_count = 0;
614 while (left_index <= right_index) {
615 if (left_index & 1) {
616 const size_t node_index = left_index++;
617 const int candidate =
618 prefix_excess + node_max_prefix_excess[node_index];
619 if (candidate > best_value) {
620 best_value = candidate;
621 best_position =
npos;
622 chosen_node_index = node_index;
623 prefix_at_choice = prefix_excess;
625 prefix_excess += node_total_excess[node_index];
627 if ((right_index & 1) == 0) {
628 right_nodes[right_nodes_count++] = right_index--;
633 while (right_nodes_count--) {
634 const size_t node_index = right_nodes[right_nodes_count];
635 const int candidate =
636 prefix_excess + node_max_prefix_excess[node_index];
637 if (candidate > best_value) {
638 best_value = candidate;
639 best_position =
npos;
640 chosen_node_index = node_index;
641 prefix_at_choice = prefix_excess;
643 prefix_excess += node_total_excess[node_index];
648 if (end_block_index != begin_block_index) {
649 int max_prefix_last_chunk;
650 size_t last_chunk_position;
651 first_max_value_pos8(end_block_start, range_end, max_prefix_last_chunk,
652 last_chunk_position);
653 const int candidate = prefix_excess + max_prefix_last_chunk;
654 if (candidate > best_value) {
655 best_value = candidate;
656 best_position = last_chunk_position;
657 chosen_node_index = 0;
661 if (best_position !=
npos) {
662 return best_position;
665 return descend_first_max(chosen_node_index, best_value - prefix_at_choice,
666 node_base(chosen_node_index));
674 const size_t& range_end)
const {
675 if (range_begin > range_end || range_end >= num_bits) {
679 if (max_position ==
npos) {
690 const size_t& range_end)
const {
691 if (range_begin > range_end || range_end >= num_bits) {
695 const size_t begin_block_index = block_of(range_begin);
696 const size_t begin_block_start = begin_block_index * block_bits;
697 const size_t begin_block_end =
698 std::min(num_bits, begin_block_start + block_bits);
699 const size_t end_block_index = block_of(range_end);
700 const size_t end_block_start = end_block_index * block_bits;
702 int best_value = INT_MAX;
703 size_t min_count = 0;
704 int prefix_excess = 0;
708 int current_excess = 0, min_value = INT_MAX, local_count = 0;
709 const size_t end_of_first_chunk =
710 std::min(range_end, begin_block_end - 1);
711 for (
size_t position = range_begin; position <= end_of_first_chunk;
713 current_excess +=
bit(position) ? +1 : -1;
714 if (current_excess < min_value) {
715 min_value = current_excess;
717 }
else if (current_excess == min_value) {
721 best_value = min_value;
722 min_count = local_count;
723 prefix_excess = current_excess;
727 if (begin_block_index + 1 <= end_block_index - 1) {
728 const auto middle_nodes =
729 cover_blocks(begin_block_index + 1, end_block_index - 1);
730 for (
const size_t& node_index : middle_nodes) {
731 const int candidate =
732 prefix_excess + node_min_prefix_excess[node_index];
733 if (candidate < best_value) {
734 best_value = candidate;
735 min_count = node_min_count[node_index];
736 }
else if (candidate == best_value) {
737 min_count += node_min_count[node_index];
739 prefix_excess += node_total_excess[node_index];
744 if (end_block_index != begin_block_index) {
745 int current_excess = 0, min_value = INT_MAX, local_count = 0;
746 for (
size_t position = end_block_start; position <= range_end;
748 current_excess +=
bit(position) ? +1 : -1;
749 if (current_excess < min_value) {
750 min_value = current_excess;
752 }
else if (current_excess == min_value) {
756 const int candidate = prefix_excess + min_value;
757 if (candidate < best_value) {
758 best_value = candidate;
759 min_count = local_count;
760 }
else if (candidate == best_value) {
761 min_count += local_count;
774 const size_t& range_end,
775 size_t target_min_rank)
const {
776 if (range_begin > range_end || range_end >= num_bits ||
777 target_min_rank == 0) {
781 const size_t begin_block_index = block_of(range_begin);
782 const size_t begin_block_start = begin_block_index * block_bits;
783 const size_t begin_block_end =
784 std::min(num_bits, begin_block_start + block_bits);
785 const size_t end_block_index = block_of(range_end);
786 const size_t end_block_start = end_block_index * block_bits;
789 const size_t end_of_first_chunk = std::min(range_end, begin_block_end - 1);
790 int current_first_chunk_excess = 0, min_first_chunk = 0;
791 uint32_t count_first_chunk = 0;
793 if (range_begin <= end_of_first_chunk) {
794 scan_range_min_count8(range_begin, end_of_first_chunk,
795 current_first_chunk_excess, min_first_chunk,
798 current_first_chunk_excess = 0;
799 min_first_chunk = INT_MAX;
800 count_first_chunk = 0;
803 int best_value = (min_first_chunk == INT_MAX ? INT_MAX : min_first_chunk);
805 (min_first_chunk == INT_MAX ? 0u : (size_t)count_first_chunk);
806 int prefix_excess = current_first_chunk_excess;
808 size_t left_index = first_leaf_index + begin_block_index + 1;
809 size_t right_index = first_leaf_index + end_block_index - 1;
810 size_t right_nodes[64];
811 int right_nodes_count = 0;
814 if (begin_block_index + 1 <= end_block_index - 1) {
815 while (left_index <= right_index) {
816 if (left_index & 1) {
817 const int candidate =
818 prefix_excess + node_min_prefix_excess[left_index];
819 if (candidate < best_value) {
820 best_value = candidate;
821 total_count = node_min_count[left_index];
822 }
else if (candidate == best_value) {
823 total_count += node_min_count[left_index];
825 prefix_excess += node_total_excess[left_index++];
827 if ((right_index & 1) == 0) {
828 right_nodes[right_nodes_count++] = right_index--;
833 while (right_nodes_count--) {
834 const size_t node_index = right_nodes[right_nodes_count];
835 const int candidate =
836 prefix_excess + node_min_prefix_excess[node_index];
837 if (candidate < best_value) {
838 best_value = candidate;
839 total_count = node_min_count[node_index];
840 }
else if (candidate == best_value) {
841 total_count += node_min_count[node_index];
843 prefix_excess += node_total_excess[node_index];
848 int current_last_chunk_excess = 0, min_last_chunk = INT_MAX;
849 uint32_t count_last_chunk = 0;
850 if (end_block_index != begin_block_index) {
851 scan_range_min_count8(end_block_start, range_end,
852 current_last_chunk_excess, min_last_chunk,
854 const int candidate = prefix_excess + min_last_chunk;
855 if (candidate < best_value) {
856 best_value = candidate;
857 total_count = count_last_chunk;
858 }
else if (candidate == best_value) {
859 total_count += count_last_chunk;
863 if (target_min_rank > total_count) {
868 if (min_first_chunk == best_value && count_first_chunk) {
869 if (target_min_rank <= count_first_chunk) {
870 return qth_min_in_block(range_begin, end_of_first_chunk,
873 target_min_rank -= count_first_chunk;
877 prefix_excess = current_first_chunk_excess;
878 if (begin_block_index + 1 <= end_block_index - 1) {
879 left_index = first_leaf_index + (begin_block_index + 1);
880 right_index = first_leaf_index + (end_block_index - 1);
881 right_nodes_count = 0;
882 while (left_index <= right_index) {
883 if (left_index & 1) {
884 const size_t node_index = left_index++;
885 const int candidate =
886 prefix_excess + node_min_prefix_excess[node_index];
887 if (candidate == best_value) {
888 if (target_min_rank <= node_min_count[node_index]) {
889 return descend_qth_min(node_index, best_value - prefix_excess,
890 target_min_rank, node_base(node_index));
892 target_min_rank -= node_min_count[node_index];
894 prefix_excess += node_total_excess[node_index];
896 if (!(right_index & 1)) {
897 right_nodes[right_nodes_count++] = right_index--;
902 while (right_nodes_count--) {
903 const size_t node_index = right_nodes[right_nodes_count];
904 const int candidate =
905 prefix_excess + node_min_prefix_excess[node_index];
906 if (candidate == best_value) {
907 if (target_min_rank <= node_min_count[node_index]) {
908 return descend_qth_min(node_index, best_value - prefix_excess,
909 target_min_rank, node_base(node_index));
911 target_min_rank -= node_min_count[node_index];
913 prefix_excess += node_total_excess[node_index];
918 if (end_block_index != begin_block_index &&
919 (prefix_excess + min_last_chunk) == best_value) {
920 return qth_min_in_block(end_block_start, range_end, target_min_rank);
933 inline size_t close_impl(
const size_t& open_position)
const {
934 if (open_position >= num_bits) {
937 if (!
bit(open_position)) {
938 return open_position;
948 inline size_t open_impl(
const size_t& close_position)
const {
949 if (close_position >= num_bits) {
952 if (
bit(close_position)) {
953 return close_position;
964 if (position >= num_bits) {
967 if (!
bit(position)) {
976 inline int bit(
const size_t& position)
const noexcept {
977 return (bits[position >> 6] >> (position & 63)) & 1u;
986 static inline size_t pop10_in_slice64(
const std::uint64_t& slice,
987 const int& length)
noexcept {
991 std::uint64_t pattern_mask = slice & ~(slice >> 1);
993 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
995 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
997 return (
size_t)std::popcount(pattern_mask);
1004 size_t rank1_in_block(
const size_t& block_begin,
1005 const size_t& block_end)
const noexcept {
1006 if (block_end <= block_begin) {
1009 size_t left_word_index = block_begin >> 6;
1010 const size_t right_word_index = block_end >> 6;
1011 size_t left_offset = block_begin & 63;
1012 const size_t right_offset = block_end & 63;
1014 if (left_word_index == right_word_index) {
1015 const std::uint64_t mask =
1016 ((right_offset == 0) ? 0 : ((std::uint64_t(1) << right_offset) - 1)) &
1017 (~std::uint64_t(0) << left_offset);
1018 return (
size_t)std::popcount(bits[left_word_index] & mask);
1021 count += (size_t)std::popcount(bits[left_word_index] &
1022 (~std::uint64_t(0) << left_offset));
1025 while (left_word_index < right_word_index) {
1026 count += (size_t)std::popcount(bits[left_word_index]);
1030 count += (size_t)std::popcount(bits[right_word_index] &
1031 ((std::uint64_t(1) << right_offset) - 1));
1040 size_t rr_in_block(
const size_t& block_begin,
1041 const size_t& block_end)
const noexcept {
1042 if (block_end <= block_begin + 1) {
1045 size_t left_word_index = block_begin >> 6;
1046 const size_t right_word_index = (block_end - 1) >> 6;
1047 const int left_offset = block_begin & 63;
1048 const int right_offset = (block_end - 1) & 63;
1051 if (left_word_index == right_word_index) {
1052 const int length = right_offset - left_offset + 1;
1053 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1054 return pop10_in_slice64(slice, length);
1059 const int length = 64 - left_offset;
1060 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1061 count += pop10_in_slice64(slice, length);
1064 for (
size_t word_index = left_word_index + 1; word_index < right_word_index;
1066 const std::uint64_t word = bits[word_index];
1067 count += pop10_in_slice64(word, 64);
1071 const int length = right_offset + 1;
1072 const std::uint64_t mask = (length == 64)
1074 : ((std::uint64_t(1) << length) - 1);
1075 const std::uint64_t slice = bits[right_word_index] & mask;
1076 count += pop10_in_slice64(slice, length);
1079 for (
size_t word_index = left_word_index; word_index < right_word_index;
1081 if (((bits[word_index] >> 63) & 1u) &&
1082 ((bits[word_index + 1] & 1u) == 0)) {
1094 size_t select10_in_block(
const size_t& block_begin,
1095 const size_t& block_end,
1096 size_t target_pattern_rank)
const noexcept {
1097 if (block_end <= block_begin + 1) {
1100 size_t left_word_index = block_begin >> 6;
1101 const size_t right_word_index = (block_end - 1) >> 6;
1102 const int left_offset = block_begin & 63;
1103 const int right_offset = (block_end - 1) & 63;
1105 const auto select_in_masked_slice =
1106 [&](
const std::uint64_t& slice,
const int& length,
1107 const size_t& target_index)
noexcept ->
int {
1111 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1113 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1115 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
1117 return select_in_word(pattern_mask, target_index);
1120 if (left_word_index == right_word_index) {
1121 const int length = right_offset - left_offset + 1;
1122 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1124 select_in_masked_slice(slice, length, target_pattern_rank);
1125 return offset >= 0 ? (block_begin + (size_t)offset) :
npos;
1130 const int length = 64 - left_offset;
1131 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1132 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1133 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1134 const int count = std::popcount(pattern_mask);
1135 if (target_pattern_rank <= (
size_t)count) {
1137 select_in_masked_slice(slice, length, target_pattern_rank);
1138 return block_begin + (size_t)offset;
1140 target_pattern_rank -= count;
1144 for (
size_t word_index = left_word_index; word_index + 1 < right_word_index;
1147 if (((bits[word_index] >> 63) & 1u) &&
1148 ((bits[word_index + 1] & 1u) == 0)) {
1149 if (--target_pattern_rank == 0) {
1150 return (word_index << 6) + 63;
1154 const std::uint64_t next_word = bits[word_index + 1];
1155 const std::uint64_t pattern_mask =
1156 (next_word & ~(next_word >> 1)) & 0x7FFFFFFFFFFFFFFFull;
1157 const int count = std::popcount(pattern_mask);
1158 if (target_pattern_rank <= (
size_t)count) {
1159 const int offset = select_in_word(pattern_mask, target_pattern_rank);
1163 return ((word_index + 1) << 6) + (size_t)offset;
1165 target_pattern_rank -= count;
1169 if (((bits[right_word_index - 1] >> 63) & 1u) &&
1170 ((bits[right_word_index] & 1u) == 0)) {
1171 if (--target_pattern_rank == 0) {
1172 return ((right_word_index - 1) << 6) + 63;
1178 const int length = right_offset + 1;
1179 const std::uint64_t mask = (length == 64)
1181 : ((std::uint64_t(1) << length) - 1);
1182 const std::uint64_t slice = bits[right_word_index] & mask;
1184 select_in_masked_slice(slice, length, target_pattern_rank);
1186 return (right_word_index << 6) + (size_t)offset;
1193 int8_t excess_total;
1197 uint8_t pattern10_count;
1200 uint8_t pos_first_min;
1201 uint8_t pos_first_max;
1205 std::array<ByteAgg, 256> agg;
1210 std::array<std::array<int8_t, 17>, 256> fwd_pos;
1215 std::array<std::array<int8_t, 17>, 256> bwd_pos;
1221 static inline const LUT8Tables& LUT8_ALL() noexcept {
1222 static const LUT8Tables tables = [] {
1223 LUT8Tables lookup_tables{};
1224 for (
int byte_value = 0; byte_value < 256; ++byte_value) {
1225 int current_excess = 0, min_prefix = INT_MAX, max_prefix = INT_MIN,
1226 min_count = 0, pattern10_count = 0;
1227 int first_min_position = 0, first_max_position = 0;
1229 const auto bit_at = [&](
const int& bit_index) {
1230 return (byte_value >> bit_index) & 1;
1232 for (
int bit_index = 0; bit_index < 8; ++bit_index) {
1233 int bit_value = bit_at(bit_index);
1234 if (bit_index + 1 < 8 && bit_value && bit_at(bit_index + 1) == 0) {
1237 current_excess += bit_value ? +1 : -1;
1238 prefixes[bit_index] = current_excess;
1239 if (current_excess < min_prefix) {
1240 min_prefix = current_excess;
1242 first_min_position = bit_index;
1243 }
else if (current_excess == min_prefix) {
1246 if (current_excess > max_prefix) {
1247 max_prefix = current_excess;
1248 first_max_position = bit_index;
1251 ByteAgg aggregates{};
1252 aggregates.excess_total = current_excess;
1253 aggregates.min_prefix = (min_prefix == INT_MAX ? 0 : min_prefix);
1254 aggregates.max_prefix = (max_prefix == INT_MIN ? 0 : max_prefix);
1255 aggregates.min_count = min_count;
1256 aggregates.pattern10_count = pattern10_count;
1257 aggregates.first_bit = bit_at(0);
1258 aggregates.last_bit = bit_at(7);
1259 aggregates.pos_first_min = first_min_position;
1260 aggregates.pos_first_max = first_max_position;
1261 lookup_tables.agg[byte_value] = aggregates;
1262 auto& forward_positions = lookup_tables.fwd_pos[byte_value];
1263 auto& backward_positions = lookup_tables.bwd_pos[byte_value];
1264 forward_positions.fill(-1);
1265 backward_positions.fill(-1);
1266 for (
int delta = -8; delta <= 8; ++delta) {
1267 for (
int bit_index = 0; bit_index < 8; ++bit_index) {
1268 if (prefixes[bit_index] == delta) {
1269 forward_positions[delta + 8] = bit_index;
1273 for (
int bit_index = 7; bit_index >= 0; --bit_index) {
1274 if (prefixes[bit_index] == delta) {
1275 backward_positions[delta + 8] = bit_index;
1281 return lookup_tables;
1289 static inline const std::array<ByteAgg, 256>& LUT8() noexcept {
1290 return LUT8_ALL().agg;
1296 static inline const std::array<std::array<int8_t, 17>, 256>&
1297 LUT8_FWD_POS() noexcept {
1298 return LUT8_ALL().fwd_pos;
1304 static inline const std::array<std::array<int8_t, 17>, 256>&
1305 LUT8_BWD_POS() noexcept {
1306 return LUT8_ALL().bwd_pos;
1312 inline uint16_t get_u16(
const size_t& position)
const noexcept {
1313 const size_t word_index = position >> 6;
1314 const unsigned offset = unsigned(position & 63);
1315 const std::uint64_t w0 =
1316 (word_index < bits.size()) ? bits[word_index] : 0ULL;
1318 return uint16_t(w0 & 0xFFFFu);
1320 const std::uint64_t w1 =
1321 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0ULL;
1322 const std::uint64_t v = (w0 >> offset) | (w1 << (64u - offset));
1323 return uint16_t(v & 0xFFFFu);
1326#if defined(PIXIE_AVX2_SUPPORT)
1327 static inline __m256i bit_masks_16x() noexcept {
1329 return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020,
1330 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800,
1331 0x1000, 0x2000, 0x4000, (int16_t)0x8000);
1334 static inline __m256i prefix_sum_16x_i16(__m256i v)
noexcept {
1338 __m256i t = _mm256_slli_si256(x, 2);
1339 x = _mm256_add_epi16(x, t);
1340 t = _mm256_slli_si256(x, 4);
1341 x = _mm256_add_epi16(x, t);
1342 t = _mm256_slli_si256(x, 8);
1343 x = _mm256_add_epi16(x, t);
1345 __m128i lo = _mm256_extracti128_si256(x, 0);
1346 __m128i hi = _mm256_extracti128_si256(x, 1);
1347 const int16_t carry =
1348 (int16_t)_mm_extract_epi16(lo, 7);
1349 hi = _mm_add_epi16(hi, _mm_set1_epi16(carry));
1351 __m256i out = _mm256_castsi128_si256(lo);
1352 out = _mm256_inserti128_si256(out, hi, 1);
1356 static inline int16_t last_prefix_16x_i16(__m256i pref)
noexcept {
1357 __m128i hi = _mm256_extracti128_si256(pref, 1);
1358 return (int16_t)_mm_extract_epi16(hi, 7);
1367 inline size_t scan_leaf_fwd_simd(
const size_t& start,
1369 const int& required_delta,
1370 int* out_total)
const noexcept {
1377 if (required_delta < -32768 || required_delta > 32767) {
1379 const int len = int(end - start);
1380 const int ones = int(rank1_in_block(start, end));
1381 *out_total = ones * 2 - len;
1386 static const __m256i masks = bit_masks_16x();
1387 static const __m256i vzero = _mm256_setzero_si256();
1388 static const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1389 static const __m256i vminus1 = _mm256_set1_epi16(-1);
1390 static const __m256i vtwo = _mm256_set1_epi16(2);
1391 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1395 while (pos + 16 <= end) {
1396 const uint16_t bits16 = get_u16(pos);
1397 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1398 const __m256i m = _mm256_and_si256(vb, masks);
1399 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1400 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1401 const __m256i steps =
1402 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1404 const __m256i pref_rel = prefix_sum_16x_i16(steps);
1405 const __m256i base = _mm256_set1_epi16((int16_t)cur);
1406 const __m256i pref = _mm256_add_epi16(pref_rel, base);
1407 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1408 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1410 const int lane = int(std::countr_zero(mask)) >> 1;
1411 return pos + (size_t)lane;
1413 cur += (int)last_prefix_16x_i16(pref_rel);
1417 cur +=
bit(pos) ? +1 : -1;
1418 if (cur == required_delta) {
1436 inline size_t scan_leaf_fwd_lut8_fast(
const size_t& start,
1438 const int& required_delta,
1439 int* out_total)
const noexcept {
1449 while (pos < end && (pos & 7)) {
1450 cur +=
bit(pos) ? +1 : -1;
1451 if (cur == required_delta) {
1463 const uint8_t* bytep =
1464 reinterpret_cast<const uint8_t*
>(bits.data()) + (pos >> 3);
1465 const auto& agg = LUT8();
1466 const auto& fwd = LUT8_FWD_POS();
1468 while (pos + 8 <= end) {
1469 const uint8_t bv = *bytep++;
1470 const auto& a = agg[bv];
1471 const int need = required_delta - cur;
1473 if ((
unsigned)(need + 8) <= 16u) {
1475 if (need >= a.min_prefix && need <= a.max_prefix) {
1476 const int8_t off = fwd[bv][need + 8];
1479 *out_total = cur + a.excess_total;
1482 return pos + (size_t)off;
1486 cur += a.excess_total;
1491 cur +=
bit(pos) ? +1 : -1;
1492 if (cur == required_delta) {
1513 inline size_t scan_leaf_fwd(
const size_t& search_start,
1514 const size_t& search_end,
1515 const int& required_delta)
const noexcept {
1516 if (search_start >= search_end) {
1519 const auto& aggregates_table = LUT8();
1520 const auto& forward_lookup = LUT8_FWD_POS();
1521 int current_excess = 0;
1522 size_t position = search_start;
1523 while (position + 8 <= search_end) {
1524 const uint8_t byte_value = get_byte(position);
1525 const auto& byte_aggregate = aggregates_table[byte_value];
1526 const int local_need = required_delta - current_excess;
1527 if (local_need >= byte_aggregate.min_prefix &&
1528 local_need <= byte_aggregate.max_prefix && local_need >= -8 &&
1530 const int8_t offset = forward_lookup[byte_value][local_need + 8];
1532 return position + size_t(offset);
1535 current_excess += byte_aggregate.excess_total;
1539 while (position < search_end) {
1540 current_excess +=
bit(position) ? 1 : -1;
1541 if (current_excess == required_delta) {
1558 inline size_t scan_leaf_bwd(
1559 const size_t& block_begin,
1560 const size_t& block_end,
1561 const int& required_delta,
1562 const bool& allow_right_boundary,
1563 const size_t& global_right_border,
1564 const int& prefix_at_boundary_max )
const noexcept {
1568 if (block_begin > block_end) {
1574 size_t boundary_max = block_end;
1575 if (!allow_right_boundary && boundary_max > block_begin) {
1580 if (block_begin >= boundary_max) {
1581 if ((block_begin < global_right_border || allow_right_boundary) &&
1582 required_delta == 0) {
1588 if (required_delta < -32768 || required_delta > 32767) {
1590 if ((block_begin < global_right_border || allow_right_boundary) &&
1591 required_delta == 0) {
1597#if defined(PIXIE_AVX2_SUPPORT)
1603 int prefix_end = prefix_at_boundary_max;
1604 if (prefix_end == kNoPrefixOverride) {
1605 const int len = int(boundary_max - block_begin);
1606 const int ones = int(rank1_in_block(block_begin, boundary_max));
1607 prefix_end = ones * 2 - len;
1610 const __m256i masks = bit_masks_16x();
1611 const __m256i vzero = _mm256_setzero_si256();
1612 const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1613 const __m256i vminus1 = _mm256_set1_epi16(-1);
1614 const __m256i vtwo = _mm256_set1_epi16(2);
1615 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1617 size_t pos_end = boundary_max;
1618 int cur_end = prefix_end;
1621 while (pos_end >= block_begin + 16) {
1622 const size_t pos = pos_end - 16;
1623 const uint16_t bits16 = get_u16(pos);
1624 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1625 const __m256i m = _mm256_and_si256(vb, masks);
1626 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1627 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1628 const __m256i steps =
1629 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1631 const __m256i pref_rel =
1632 prefix_sum_16x_i16(steps);
1633 const int16_t sum16 =
1634 last_prefix_16x_i16(pref_rel);
1635 const int cur_start =
1636 cur_end - (int)sum16;
1638 const __m256i base = _mm256_set1_epi16((int16_t)cur_start);
1639 const __m256i pref = _mm256_add_epi16(
1641 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1642 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1644 const int bit_i = 31 - int(std::countl_zero(mask));
1645 const int lane = bit_i >> 1;
1646 const size_t boundary = pos + (size_t)lane + 1;
1647 if (boundary < global_right_border || allow_right_boundary) {
1653 cur_end = cur_start;
1657 while (pos_end > block_begin) {
1659 if (cur_end == required_delta) {
1660 if (pos_end < global_right_border || allow_right_boundary) {
1665 const size_t bit_pos = pos_end - 1;
1666 cur_end -=
bit(bit_pos) ? +1 : -1;
1670 size_t last_boundary =
npos;
1672 for (
size_t pos = block_begin; pos < boundary_max; ++pos) {
1673 cur +=
bit(pos) ? +1 : -1;
1674 if (cur == required_delta) {
1675 last_boundary = pos + 1;
1678 if (last_boundary !=
npos) {
1679 if (last_boundary < global_right_border || allow_right_boundary) {
1680 return last_boundary;
1687 if ((block_begin < global_right_border || allow_right_boundary) &&
1688 required_delta == 0) {
1697 inline uint8_t get_byte(
const size_t& position)
const noexcept {
1698 const size_t word_index = position >> 6;
1699 const size_t offset = position & 63;
1700 const std::uint64_t lower_word = bits[word_index] >> offset;
1702 return uint8_t(lower_word & 0xFFu);
1704 const std::uint64_t higher_word =
1705 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0;
1706 const std::uint64_t byte_value =
1707 (lower_word | (higher_word << (64 - offset))) & 0xFFu;
1708 return uint8_t(byte_value);
1719 size_t descend_first_max(
size_t node_index,
1721 size_t segment_base)
const noexcept {
1722 while (node_index < first_leaf_index) {
1723 const size_t left_child = node_index << 1, right_child = left_child | 1;
1724 const int left_max = node_max_prefix_excess[left_child];
1725 const int right_max =
1726 node_total_excess[left_child] + node_max_prefix_excess[right_child];
1727 if (left_max >= right_max && left_max == target_prefix) {
1728 node_index = left_child;
1729 }
else if (right_max == target_prefix) {
1730 segment_base += segment_size_bits[left_child];
1731 target_prefix -= node_total_excess[left_child];
1732 node_index = right_child;
1738 const size_t segment_begin = segment_base;
1739 const size_t segment_end =
1740 std::min(segment_base + segment_size_bits[node_index], num_bits);
1744 first_max_value_pos8(segment_begin,
1745 segment_end ? (segment_end - 1) : segment_begin,
1746 max_value, position);
1747 return (max_value == target_prefix ? position :
npos);
1753 size_t first_leaf_index = 1;
1759 static constexpr int kNoPrefixOverride = std::numeric_limits<int>::min();
1764 size_t block_of(
const size_t& position)
const noexcept {
1765 return position / block_bits;
1771 size_t leaf_index_of(
const size_t& block_start)
const noexcept {
1772 return first_leaf_index + block_of(block_start);
1779 size_t node_base(
size_t node_index)
const noexcept {
1780 if (node_index >= first_leaf_index) {
1781 return (node_index - first_leaf_index) * block_bits;
1785 for (; node_index > 1; node_index >>= 1) {
1786 if (node_index & 1) {
1787 base += segment_size_bits[node_index - 1];
1798 std::vector<size_t> cover_blocks(
const size_t& block_begin_index,
1799 const size_t& block_end_index)
const {
1800 size_t left_index = first_leaf_index + block_begin_index;
1801 size_t right_index = first_leaf_index + block_end_index;
1802 std::vector<size_t> left_nodes, right_nodes;
1803 while (left_index <= right_index) {
1804 if ((left_index & 1) == 1) {
1805 left_nodes.push_back(left_index++);
1807 if ((right_index & 1) == 0) {
1808 right_nodes.push_back(right_index--);
1813 std::reverse(right_nodes.begin(), right_nodes.end());
1814 left_nodes.insert(left_nodes.end(), right_nodes.begin(), right_nodes.end());
1822 size_t descend_fwd(
size_t node_index,
1824 size_t segment_base)
const noexcept {
1825 while (node_index < first_leaf_index) {
1826 const size_t left_child = node_index << 1;
1827 const size_t right_child = left_child | 1;
1828 if (node_min_prefix_excess[left_child] <= required_delta &&
1829 required_delta <= node_max_prefix_excess[left_child]) {
1830 node_index = left_child;
1832 required_delta -= node_total_excess[left_child];
1833 segment_base += segment_size_bits[left_child];
1834 node_index = right_child;
1837 const size_t seg_end =
1838 std::min(segment_base + segment_size_bits[node_index], num_bits);
1839#if defined(PIXIE_AVX2_SUPPORT)
1840 return scan_leaf_fwd_simd(segment_base, seg_end, required_delta,
nullptr);
1842 return scan_leaf_fwd(segment_base, seg_end, required_delta);
1855 size_t descend_bwd(
size_t node_index,
1856 const size_t& segment_base,
1857 const int& required_delta,
1858 const size_t& global_right_border,
1859 const bool& allow_right_boundary)
const noexcept {
1860 while (node_index < first_leaf_index) {
1861 const size_t left_child = node_index << 1;
1862 const size_t right_child = left_child | 1;
1863 const int required_in_right =
1864 required_delta - node_total_excess[left_child];
1867 if (node_min_prefix_excess[right_child] <= required_in_right &&
1868 required_in_right <= node_max_prefix_excess[right_child]) {
1869 const size_t result = descend_bwd(
1870 right_child, segment_base + segment_size_bits[left_child],
1871 required_in_right, global_right_border, allow_right_boundary);
1872 if (result !=
npos) {
1878 const size_t junction = segment_base + segment_size_bits[left_child];
1879 if (required_delta == node_total_excess[left_child] &&
1880 (junction < global_right_border || allow_right_boundary)) {
1885 if (node_min_prefix_excess[left_child] <= required_delta &&
1886 required_delta <= node_max_prefix_excess[left_child]) {
1887 node_index = left_child;
1893 if (required_delta == 0 &&
1894 (segment_base < global_right_border || allow_right_boundary)) {
1895 return segment_base;
1901 const size_t seg_end =
1902 std::min(segment_base + segment_size_bits[node_index], num_bits);
1903 const size_t block_end = std::min(global_right_border, seg_end);
1905 int prefix_override = kNoPrefixOverride;
1909 if (block_end == seg_end) {
1910 const int total = node_total_excess[node_index];
1911 if (!allow_right_boundary && block_end > segment_base) {
1912 prefix_override = total - (
bit(block_end - 1) ? +1 : -1);
1914 prefix_override = total;
1918 return scan_leaf_bwd(segment_base, block_end, required_delta,
1919 allow_right_boundary, global_right_border,
1927 size_t descend_first_min(
size_t node_index,
1929 size_t segment_base)
const noexcept {
1930 while (node_index < first_leaf_index) {
1931 const size_t left_child = node_index << 1, right_child = left_child | 1;
1932 const int left_min = node_min_prefix_excess[left_child];
1933 const int right_min =
1934 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1935 if (left_min <= right_min && left_min == target_prefix) {
1936 node_index = left_child;
1937 }
else if (right_min == target_prefix) {
1938 segment_base += segment_size_bits[left_child];
1939 target_prefix -= node_total_excess[left_child];
1940 node_index = right_child;
1946 const size_t segment_begin = segment_base;
1947 const size_t segment_end =
1948 std::min(segment_base + segment_size_bits[node_index], num_bits);
1952 first_min_value_pos8(segment_begin,
1953 segment_end ? (segment_end - 1) : segment_begin,
1954 min_value, position);
1955 return (min_value == target_prefix ? position :
npos);
1962 size_t descend_qth_min(
size_t node_index,
1964 size_t target_min_rank,
1965 size_t segment_base)
const noexcept {
1966 while (node_index < first_leaf_index) {
1967 const size_t left_child = node_index << 1;
1968 const size_t right_child = left_child | 1;
1969 const int left_min = node_min_prefix_excess[left_child];
1970 const int right_min =
1971 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1972 if (left_min == target_prefix) {
1973 if (node_min_count[left_child] >= target_min_rank) {
1974 node_index = left_child;
1977 target_min_rank -= node_min_count[left_child];
1979 if (right_min == target_prefix) {
1980 segment_base += segment_size_bits[left_child];
1981 target_prefix -= node_total_excess[left_child];
1982 node_index = right_child;
1987 return qth_min_in_block(
1989 std::min(segment_base + segment_size_bits[node_index], num_bits) - 1,
1997 size_t select1_in_block(
const size_t& block_begin,
1998 const size_t& block_end,
1999 size_t target_one_rank)
const noexcept {
2000 size_t left_word_index = block_begin >> 6;
2001 const size_t right_word_index = (block_end >> 6);
2002 const size_t left_offset = block_begin & 63;
2003 const std::uint64_t left_mask =
2004 (left_offset ? (~std::uint64_t(0) << left_offset) : ~std::uint64_t(0));
2005 if (left_word_index == right_word_index) {
2006 const std::uint64_t word =
2007 bits[left_word_index] & left_mask &
2008 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
2009 : ~std::uint64_t(0));
2010 return block_begin + select_in_word(word, target_one_rank);
2014 const std::uint64_t word = bits[left_word_index] & left_mask;
2015 const int count = std::popcount(word);
2016 if (target_one_rank <= (
size_t)count) {
2017 return block_begin + select_in_word(word, target_one_rank);
2019 target_one_rank -= count;
2023 while (left_word_index < right_word_index) {
2024 const std::uint64_t word = bits[left_word_index];
2025 const int count = std::popcount(word);
2026 if (target_one_rank <= (
size_t)count) {
2027 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2029 target_one_rank -= count;
2033 const size_t right_offset = block_end & 63;
2035 const std::uint64_t word =
2036 bits[left_word_index] & ((std::uint64_t(1) << right_offset) - 1);
2037 const int count = std::popcount(word);
2038 if (target_one_rank <= (
size_t)count) {
2039 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2049 size_t select0_in_block(
const size_t& block_begin,
2050 const size_t& block_end,
2051 size_t target_zero_rank)
const noexcept {
2052 if (block_end <= block_begin) {
2056 size_t left_word_index = block_begin >> 6;
2057 const size_t right_word_index = block_end >> 6;
2058 const size_t left_offset = block_begin & 63;
2060 if (left_word_index == right_word_index) {
2061 const std::uint64_t left_mask =
2062 (left_offset ? (~std::uint64_t(0) << left_offset)
2063 : ~std::uint64_t(0));
2064 const std::uint64_t right_mask =
2065 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
2066 : ~std::uint64_t(0));
2067 const std::uint64_t word =
2068 (~bits[left_word_index]) & left_mask & right_mask;
2069 const int offset = select_in_word(word, target_zero_rank);
2070 return (offset >= 0) ? (block_begin + (size_t)offset) :
npos;
2075 const std::uint64_t word =
2076 (~bits[left_word_index]) & (~std::uint64_t(0) << left_offset);
2077 const int count = std::popcount(word);
2078 if (target_zero_rank <= (
size_t)count) {
2079 const int offset = select_in_word(word, target_zero_rank);
2080 return (offset >= 0) ? (block_begin + (size_t)offset) :
npos;
2082 target_zero_rank -= count;
2087 while (left_word_index < right_word_index) {
2088 const std::uint64_t word = ~bits[left_word_index];
2089 const int count = std::popcount(word);
2090 if (target_zero_rank <= (
size_t)count) {
2091 const int offset = select_in_word(word, target_zero_rank);
2092 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) :
npos;
2094 target_zero_rank -= count;
2099 const size_t right_offset = block_end & 63;
2101 const std::uint64_t word =
2102 (~bits[left_word_index]) & ((std::uint64_t(1) << right_offset) - 1);
2103 const int count = std::popcount(word);
2104 if (target_zero_rank <= (
size_t)count) {
2105 const int offset = select_in_word(word, target_zero_rank);
2106 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) :
npos;
2116 static inline int select_in_word(std::uint64_t word,
2117 size_t target_rank)
noexcept {
2119 if (--target_rank == 0) {
2120 return std::countr_zero(word);
2130 static inline size_t ceil_div(
const size_t& numerator,
2131 const size_t& denominator)
noexcept {
2132 return (numerator + denominator - 1) / denominator;
2140 static inline size_t nodeslots_for(
const size_t& bit_count,
2141 const size_t& block_size_pow2)
noexcept {
2142 if (bit_count == 0) {
2145 size_t leaf_node_count = ceil_div(bit_count, block_size_pow2);
2146 return std::bit_ceil(std::max<size_t>(1, leaf_node_count)) +
2153 static inline float overhead_for(
const size_t& bit_count,
2154 const size_t& block_size_pow2)
noexcept {
2155 static constexpr size_t AUX_SLOT_BYTES =
2156 sizeof(uint32_t) +
sizeof(int32_t) +
sizeof(int32_t) +
sizeof(int32_t) +
2157 sizeof(uint32_t) +
sizeof(uint32_t) +
sizeof(uint8_t) +
sizeof(uint8_t);
2159 size_t bitvector_bytes = ceil_div(bit_count, 64) * 8;
2160 if (bitvector_bytes == 0) {
2163 size_t slot_count = nodeslots_for(bit_count, block_size_pow2);
2164 size_t aux_bytes = slot_count * AUX_SLOT_BYTES;
2165 return ((
float)aux_bytes) / ((float)bitvector_bytes);
2174 static inline size_t choose_block_bits_for_overhead(
2175 const size_t& bit_count,
2176 const float& overhead_cap)
noexcept {
2177 if (overhead_cap < 0.f) {
2181 const size_t max_block_bits = std::min<size_t>(bit_count, 16384);
2182 size_t candidate_block_bits = 64;
2183 while (candidate_block_bits < max_block_bits) {
2184 if (overhead_for(bit_count, candidate_block_bits) <= overhead_cap) {
2187 candidate_block_bits <<= 1;
2189 return candidate_block_bits;
2199 void build_from_words(std::span<const std::uint64_t> words,
2200 const size_t& bit_count,
2201 const size_t& leaf_block_bits = 0,
2202 const float& max_overhead = -1.0) {
2204 num_bits = bit_count;
2205 if (bits.size() * 64 < num_bits) {
2206 throw std::invalid_argument(
2207 "RmMTree bit_count exceeds the provided word span");
2209 build(leaf_block_bits, max_overhead);
2216 inline uint32_t ones_in_node(
const size_t& node_index)
const noexcept {
2217 return ((int64_t)segment_size_bits[node_index] +
2218 (int64_t)node_total_excess[node_index]) >>
2229 inline void scan_range_min_count8(
size_t range_begin,
2230 const size_t& range_end,
2231 int& current_excess,
2233 uint32_t& count)
const noexcept {
2235 min_value = INT_MAX;
2237 if (range_end < range_begin) {
2242 while (range_begin <= range_end && (range_begin & 7)) {
2243 current_excess +=
bit(range_begin) ? +1 : -1;
2244 if (current_excess < min_value) {
2245 min_value = current_excess;
2247 }
else if (current_excess == min_value) {
2253 const auto& aggregates_table = LUT8();
2254 while (range_begin + 7 <= range_end) {
2255 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2256 const int candidate = current_excess + byte_aggregate.min_prefix;
2257 if (candidate < min_value) {
2258 min_value = candidate;
2259 count = byte_aggregate.min_count;
2260 }
else if (candidate == min_value) {
2261 count += byte_aggregate.min_count;
2263 current_excess += byte_aggregate.excess_total;
2267 while (range_begin <= range_end) {
2268 current_excess +=
bit(range_begin) ? +1 : -1;
2269 if (current_excess < min_value) {
2270 min_value = current_excess;
2272 }
else if (current_excess == min_value) {
2277 if (min_value == INT_MAX) {
2278 min_value = count = 0;
2288 inline size_t cover_blocks_collect(
const size_t& block_begin_index,
2289 const size_t& block_end_index,
2290 size_t (&out_nodes)[64])
const noexcept {
2291 if (leaf_count == 0 || block_begin_index > block_end_index) {
2294 size_t left_index = first_leaf_index + block_begin_index;
2295 size_t right_index = first_leaf_index + block_end_index;
2296 size_t left_nodes[32];
2297 size_t right_nodes[32];
2298 size_t left_count = 0, right_count = 0;
2299 while (left_index <= right_index) {
2300 if (left_index & 1) {
2301 left_nodes[left_count++] = left_index++;
2303 if ((right_index & 1) == 0) {
2304 right_nodes[right_count++] = right_index--;
2309 size_t out_count = 0;
2310 for (
size_t i = 0; i < left_count; ++i) {
2311 out_nodes[out_count++] = left_nodes[i];
2313 while (right_count > 0) {
2314 out_nodes[out_count++] = right_nodes[--right_count];
2325 inline size_t qth_min_in_block(
const size_t& range_begin,
2326 const size_t& range_end,
2327 size_t target_min_rank)
const noexcept {
2328 if (range_end < range_begin || target_min_rank == 0) {
2332 const auto& aggregates_table = LUT8();
2334 int current_excess = 0, min_value = INT_MAX;
2335 size_t position = range_begin;
2337 while (position <= range_end && (position & 7)) {
2338 current_excess +=
bit(position) ? +1 : -1;
2339 if (current_excess < min_value) {
2340 min_value = current_excess;
2344 while (position + 7 <= range_end) {
2345 const auto& byte_aggregate = aggregates_table[get_byte(position)];
2347 std::min(min_value, current_excess + byte_aggregate.min_prefix);
2348 current_excess += byte_aggregate.excess_total;
2351 while (position <= range_end) {
2352 current_excess +=
bit(position) ? +1 : -1;
2353 if (current_excess < min_value) {
2354 min_value = current_excess;
2360 position = range_begin;
2363 while (position <= range_end && (position & 7)) {
2364 current_excess +=
bit(position) ? +1 : -1;
2365 if (current_excess == min_value) {
2366 if (--target_min_rank == 0) {
2374 while (position + 7 <= range_end) {
2375 const uint8_t byte_value = get_byte(position);
2376 const auto& byte_aggregate = aggregates_table[byte_value];
2377 const int candidate = current_excess + byte_aggregate.min_prefix;
2378 if (candidate == min_value) {
2380 for (
int k = 0; k < 8; ++k) {
2381 prefix_sum += ((byte_value >> k) & 1u) ? +1 : -1;
2382 if (prefix_sum == byte_aggregate.min_prefix) {
2383 if (--target_min_rank == 0) {
2384 return position + k;
2389 current_excess += byte_aggregate.excess_total;
2394 while (position <= range_end) {
2395 current_excess +=
bit(position) ? +1 : -1;
2396 if (current_excess == min_value) {
2397 if (--target_min_rank == 0) {
2417 inline size_t leaf_fwd_bp_simd(
const size_t& leaf_index,
2418 const size_t& leaf_block_begin,
2419 const size_t& start_position,
2421 int& leaf_delta)
const noexcept {
2422 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2423 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2424 if (start_position >= leaf_end) {
2428#if defined(PIXIE_AVX2_SUPPORT)
2433 const size_t tail_len = leaf_end - start_position;
2435 if (delta >= -8 && delta <= 8 && tail_len <= 256) {
2436 res = scan_leaf_fwd_lut8_fast(start_position, leaf_end, delta, &total);
2438 res = scan_leaf_fwd_simd(start_position, leaf_end, delta, &total);
2446 const size_t res = scan_leaf_fwd(start_position, leaf_end, delta);
2450 const int len = int(leaf_end - start_position);
2451 const int ones = int(rank1_in_block(start_position, leaf_end));
2452 leaf_delta = ones * 2 - len;
2465 inline size_t leaf_bwd_bp_simd(
const size_t& leaf_index,
2466 const size_t& leaf_block_begin,
2467 const size_t& start_position,
2469 int& leaf_delta)
const noexcept {
2470 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2471 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2472 if (start_position < leaf_block_begin || start_position > leaf_end) {
2479 const int len = int(start_position - leaf_block_begin);
2480 const int ones = int(rank1_in_block(leaf_block_begin, start_position));
2481 leaf_delta = ones * 2 - len;
2485 if (start_position == leaf_block_begin) {
2488 const int target_prefix = leaf_delta + delta;
2489 return scan_leaf_bwd(
2497 (start_position > leaf_block_begin
2498 ? (leaf_delta - (
bit(start_position - 1) ? +1 : -1))
2508 inline void first_min_value_pos8(
size_t range_begin,
2509 const size_t& range_end,
2511 size_t& first_position)
const noexcept {
2512 const auto& aggregates_table = LUT8();
2513 int current_excess = 0;
2514 int min_value = INT_MAX;
2515 first_position =
npos;
2518 while (range_begin <= range_end && (range_begin & 7)) {
2519 current_excess +=
bit(range_begin) ? +1 : -1;
2520 if (current_excess < min_value) {
2521 min_value = current_excess;
2522 first_position = range_begin;
2528 while (range_begin + 7 <= range_end) {
2529 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2530 const int candidate = current_excess + byte_aggregate.min_prefix;
2531 if (candidate < min_value) {
2532 min_value = candidate;
2533 first_position = range_begin + byte_aggregate.pos_first_min;
2535 current_excess += byte_aggregate.excess_total;
2540 while (range_begin <= range_end) {
2541 current_excess +=
bit(range_begin) ? +1 : -1;
2542 if (current_excess < min_value) {
2543 min_value = current_excess;
2544 first_position = range_begin;
2549 min_value_out = (min_value == INT_MAX ? 0 : min_value);
2558 inline void first_max_value_pos8(
size_t range_begin,
2559 const size_t& range_end,
2561 size_t& first_position)
const noexcept {
2562 const auto& aggregates_table = LUT8();
2563 int current_excess = 0;
2564 int max_value = INT_MIN;
2565 first_position =
npos;
2567 while (range_begin <= range_end && (range_begin & 7)) {
2568 current_excess +=
bit(range_begin) ? +1 : -1;
2569 if (current_excess > max_value) {
2570 max_value = current_excess;
2571 first_position = range_begin;
2576 while (range_begin + 7 <= range_end) {
2577 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2578 const int candidate = current_excess + byte_aggregate.max_prefix;
2579 if (candidate > max_value) {
2580 max_value = candidate;
2581 first_position = range_begin + byte_aggregate.pos_first_max;
2583 current_excess += byte_aggregate.excess_total;
2587 while (range_begin <= range_end) {
2588 current_excess +=
bit(range_begin) ? +1 : -1;
2589 if (current_excess > max_value) {
2590 max_value = current_excess;
2591 first_position = range_begin;
2596 max_value_out = (max_value == INT_MIN ? 0 : max_value);
2605 void build(
const size_t& leaf_block_bits,
const float& max_overhead) {
2608 const size_t clamp_by_overhead =
2609 (max_overhead >= 0.0
2610 ? choose_block_bits_for_overhead(num_bits, max_overhead)
2615 if (leaf_block_bits == 0) {
2617 std::max(clamp_by_overhead,
2618 std::bit_ceil<size_t>(
2619 (num_bits <= 1) ? 1 : std::bit_width(num_bits - 1)));
2622 std::max(clamp_by_overhead,
2623 std::bit_ceil(std::max<size_t>(1, leaf_block_bits)));
2628 built_overhead = overhead_for(num_bits, block_bits);
2631 leaf_count = ceil_div(num_bits, block_bits);
2632 first_leaf_index = std::bit_ceil(std::max<size_t>(1, leaf_count));
2633 const size_t tree_size = first_leaf_index + leaf_count - 1;
2634 segment_size_bits.assign(tree_size + 1, 0);
2635 node_total_excess.assign(tree_size + 1, 0);
2636 node_min_prefix_excess.assign(tree_size + 1, 0);
2637 node_max_prefix_excess.assign(tree_size + 1, 0);
2638 node_min_count.assign(tree_size + 1, 0);
2639 node_pattern10_count.assign(tree_size + 1, 0);
2640 node_first_bit.assign(tree_size + 1, 0);
2641 node_last_bit.assign(tree_size + 1, 0);
2644 for (
size_t leaf_block_index = 0; leaf_block_index < leaf_count;
2645 ++leaf_block_index) {
2646 const size_t leaf_node_index = first_leaf_index + leaf_block_index;
2647 const size_t segment_begin = leaf_block_index * block_bits;
2648 const size_t segment_end = std::min(num_bits, segment_begin + block_bits);
2649 segment_size_bits[leaf_node_index] = segment_end - segment_begin;
2651 if (segment_begin < segment_end) {
2652 node_first_bit[leaf_node_index] =
bit(segment_begin);
2655 const auto& aggregates_table = LUT8();
2657 int current_excess = 0, min_value = INT_MAX, max_value = INT_MIN;
2658 uint32_t min_count = 0;
2659 uint32_t pattern10_count = 0;
2661 uint8_t previous_bit = 0;
2663 size_t position = segment_begin;
2666 while (position + 8 <= segment_end) {
2667 const uint8_t byte_value = get_byte(position);
2668 const auto& byte_aggregate = aggregates_table[byte_value];
2671 pattern10_count += byte_aggregate.pattern10_count;
2674 if (previous_bit == 1 && byte_aggregate.first_bit == 0) {
2679 const int candidate_min = current_excess + byte_aggregate.min_prefix;
2680 if (candidate_min < min_value) {
2681 min_value = candidate_min;
2682 min_count = byte_aggregate.min_count;
2683 }
else if (candidate_min == min_value) {
2684 min_count += byte_aggregate.min_count;
2688 std::max(max_value, current_excess + byte_aggregate.max_prefix);
2689 current_excess += byte_aggregate.excess_total;
2690 previous_bit = byte_aggregate.last_bit;
2695 while (position < segment_end) {
2696 const uint8_t bit_value =
bit(position);
2697 if (previous_bit == 1 && bit_value == 0) {
2700 const int step = bit_value ? +1 : -1;
2701 current_excess += step;
2702 if (current_excess < min_value) {
2703 min_value = current_excess;
2705 }
else if (current_excess == min_value) {
2708 if (current_excess > max_value) {
2709 max_value = current_excess;
2712 previous_bit = bit_value;
2716 if (segment_begin < segment_end) {
2717 node_last_bit[leaf_node_index] = previous_bit;
2720 node_total_excess[leaf_node_index] = current_excess;
2721 node_min_prefix_excess[leaf_node_index] =
2722 (segment_size_bits[leaf_node_index] == 0 ? 0 : min_value);
2723 node_max_prefix_excess[leaf_node_index] =
2724 (segment_size_bits[leaf_node_index] == 0 ? 0 : max_value);
2725 node_min_count[leaf_node_index] = min_count;
2726 node_pattern10_count[leaf_node_index] = (uint32_t)pattern10_count;
2729 for (
size_t node_index = first_leaf_index - 1; node_index >= 1;
2731 const size_t left_child = node_index << 1;
2732 const size_t right_child = left_child | 1;
2733 const bool has_left =
2734 (left_child <= tree_size) && segment_size_bits[left_child];
2735 const bool has_right =
2736 (right_child <= tree_size) && segment_size_bits[right_child];
2737 if (!has_left && !has_right) {
2738 segment_size_bits[node_index] = 0;
2741 if (has_left && !has_right) {
2742 segment_size_bits[node_index] = segment_size_bits[left_child];
2743 node_total_excess[node_index] = node_total_excess[left_child];
2744 node_min_prefix_excess[node_index] = node_min_prefix_excess[left_child];
2745 node_max_prefix_excess[node_index] = node_max_prefix_excess[left_child];
2746 node_min_count[node_index] = node_min_count[left_child];
2747 node_pattern10_count[node_index] = node_pattern10_count[left_child];
2748 node_first_bit[node_index] = node_first_bit[left_child];
2749 node_last_bit[node_index] = node_last_bit[left_child];
2750 }
else if (!has_left && has_right) {
2751 segment_size_bits[node_index] = segment_size_bits[right_child];
2752 node_total_excess[node_index] = node_total_excess[right_child];
2753 node_min_prefix_excess[node_index] =
2754 node_min_prefix_excess[right_child];
2755 node_max_prefix_excess[node_index] =
2756 node_max_prefix_excess[right_child];
2757 node_min_count[node_index] = node_min_count[right_child];
2758 node_pattern10_count[node_index] = node_pattern10_count[right_child];
2759 node_first_bit[node_index] = node_first_bit[right_child];
2760 node_last_bit[node_index] = node_last_bit[right_child];
2762 segment_size_bits[node_index] =
2763 segment_size_bits[left_child] + segment_size_bits[right_child];
2764 node_total_excess[node_index] =
2765 node_total_excess[left_child] + node_total_excess[right_child];
2766 const int right_min_candidate =
2767 node_total_excess[left_child] + node_min_prefix_excess[right_child];
2768 const int right_max_candidate =
2769 node_total_excess[left_child] + node_max_prefix_excess[right_child];
2770 node_min_prefix_excess[node_index] =
2771 std::min(node_min_prefix_excess[left_child], right_min_candidate);
2772 node_max_prefix_excess[node_index] =
2773 std::max(node_max_prefix_excess[left_child], right_max_candidate);
2774 node_min_count[node_index] =
2775 (node_min_prefix_excess[left_child] ==
2776 node_min_prefix_excess[node_index]
2777 ? node_min_count[left_child]
2779 (right_min_candidate == node_min_prefix_excess[node_index]
2780 ? node_min_count[right_child]
2782 node_pattern10_count[node_index] = node_pattern10_count[left_child] +
2783 node_pattern10_count[right_child] +
2784 ((node_last_bit[left_child] == 1 &&
2785 node_first_bit[right_child] == 0)
2788 node_first_bit[node_index] = node_first_bit[left_child];
2789 node_last_bit[node_index] = node_last_bit[right_child];
2791 if (node_index == 1) {