36 std::vector<std::uint64_t> bits;
40 size_t block_bits = 64;
41 size_t leaf_count = 0;
48 std::vector<uint32_t> segment_size_bits;
54 std::vector<int32_t> node_total_excess;
59 std::vector<int32_t> node_min_prefix_excess;
63 std::vector<int32_t> node_max_prefix_excess;
67 std::vector<uint32_t> node_min_count;
71 std::vector<uint32_t> node_pattern10_count;
76 std::vector<uint8_t> node_first_bit, node_last_bit;
82 static constexpr size_t npos = std::numeric_limits<size_t>::max();
85 float built_overhead = 0.0;
104 explicit RmMTree(
const std::string& bit_string,
105 const size_t& leaf_block_bits = 0,
106 const float& max_overhead = -1.0) {
107 build_from_string(bit_string, leaf_block_bits, max_overhead);
120 explicit RmMTree(
const std::vector<std::uint64_t>& words,
122 const size_t& leaf_block_bits = 0,
123 const float& max_overhead = -1.0) {
124 build_from_words(words, bit_count, leaf_block_bits, max_overhead);
133 size_t rank1(
const size_t& end_position)
const {
134 if (end_position == 0) {
137 const size_t block_index = block_of(end_position - 1);
138 size_t ones_count = 0;
139 if (block_index > 0) {
140 size_t nodes_buffer[64];
141 const size_t node_count =
142 cover_blocks_collect(0, block_index - 1, nodes_buffer);
143 for (
size_t j = 0; j < node_count; ++j) {
144 ones_count += ones_in_node(nodes_buffer[j]);
147 const size_t block_begin = block_index * block_bits;
148 const size_t block_end = std::min(num_bits, block_begin + block_bits);
150 rank1_in_block(block_begin, std::min(end_position, block_end));
158 size_t rank0(
const size_t& end_position)
const {
159 return end_position -
rank1(end_position);
166 size_t select1(
size_t target_one_rank)
const {
167 if (target_one_rank == 0 || num_bits == 0) {
170 size_t node_index = 1;
171 if (ones_in_node(node_index) < target_one_rank) {
174 size_t segment_base = 0;
175 while (node_index < first_leaf_index) {
176 const size_t left_child = node_index << 1;
177 const size_t right_child = left_child | 1;
178 const uint32_t ones_in_left_child = ones_in_node(left_child);
179 if (ones_in_left_child >= target_one_rank) {
180 node_index = left_child;
182 target_one_rank -= ones_in_left_child;
183 segment_base += segment_size_bits[left_child];
184 node_index = right_child;
187 return select1_in_block(
189 std::min(segment_base + segment_size_bits[node_index], num_bits),
197 size_t select0(
size_t target_zero_rank)
const {
198 if (target_zero_rank == 0 || num_bits == 0) {
201 size_t node_index = 1;
202 const auto zeros_in_node = [&](
const size_t& node)
noexcept {
203 return segment_size_bits[node] - ones_in_node(node);
205 if (zeros_in_node(node_index) < target_zero_rank) {
208 size_t segment_base = 0;
209 while (node_index < first_leaf_index) {
210 const size_t left_child = node_index << 1;
211 const size_t right_child = left_child | 1;
212 const size_t zeros_in_left_child = zeros_in_node(left_child);
213 if (zeros_in_left_child >= target_zero_rank) {
214 node_index = left_child;
216 target_zero_rank -= zeros_in_left_child;
217 segment_base += segment_size_bits[left_child];
218 node_index = right_child;
221 return select0_in_block(
223 std::min(segment_base + segment_size_bits[node_index], num_bits),
232 size_t rank10(
const size_t& end_position)
const {
233 if (end_position <= 1) {
236 const size_t block_index = block_of(end_position - 1);
237 size_t pattern_count = 0;
238 int previous_last_bit = -1;
240 if (block_index > 0) {
241 const auto covered_nodes = cover_blocks(0, block_index - 1);
242 for (
const size_t& node_index : covered_nodes) {
243 pattern_count += node_pattern10_count[node_index];
244 if (previous_last_bit != -1 && previous_last_bit == 1 &&
245 node_first_bit[node_index] == 0) {
248 previous_last_bit = node_last_bit[node_index];
251 const size_t block_begin = block_index * block_bits;
252 pattern_count += rr_in_block(block_begin, end_position);
254 if (block_index > 0 && end_position > block_begin &&
255 previous_last_bit == 1 && bit(block_begin) == 0) {
258 return pattern_count;
265 size_t select10(
size_t target_pattern_rank)
const {
266 if (target_pattern_rank == 0 || num_bits == 0) {
269 size_t node_index = 1;
270 if (node_pattern10_count[node_index] < target_pattern_rank) {
273 const size_t tree_size = segment_size_bits.size() - 1;
274 size_t segment_base = 0;
275 while (node_index < first_leaf_index) {
276 const size_t left_child = node_index << 1;
277 const size_t left_segment_size =
278 (left_child <= tree_size) ? segment_size_bits[left_child] : 0;
279 if (left_segment_size == 0) {
283 const size_t left_count = node_pattern10_count[left_child];
284 if (left_count >= target_pattern_rank) {
285 node_index = left_child;
289 size_t remaining_rank = target_pattern_rank - left_count;
290 const size_t right_child = left_child | 1;
291 const bool has_right =
292 (right_child <= tree_size) && (segment_size_bits[right_child] != 0);
297 const size_t crossing_pattern =
298 (node_last_bit[left_child] == 1 && node_first_bit[right_child] == 0)
301 if (crossing_pattern) {
302 if (remaining_rank == 1) {
303 return segment_base + left_segment_size - 1;
307 segment_base += left_segment_size;
308 node_index = right_child;
309 target_pattern_rank = remaining_rank;
311 return select10_in_block(
313 std::min(segment_base + segment_size_bits[node_index], num_bits),
314 target_pattern_rank);
320 inline int excess(
const size_t& end_position)
const {
321 return int64_t(
rank1(end_position)) * 2 - int64_t(end_position);
330 size_t fwdsearch(
const size_t& start_position,
const int& delta)
const {
331 if (start_position >= num_bits) {
336 const size_t leaf_block_index = block_of(start_position);
337 const size_t block_begin = leaf_block_index * block_bits;
338 const size_t block_end = std::min(num_bits, block_begin + block_bits);
340 const size_t leaf_result = leaf_fwd_bp_simd(
341 leaf_block_index, block_begin, start_position, delta, leaf_delta);
342 if (leaf_result !=
npos) {
346 int remaining_delta = delta - leaf_delta;
347 size_t segment_base = block_end;
348 if (remaining_delta == 0) {
355 size_t node_index = leaf_index_of(block_begin);
356 const size_t tree_size = segment_size_bits.size() - 1;
360 if (segment_base >= num_bits || leaf_block_index + 1 >= leaf_count) {
364 while (node_index > 1) {
365 const bool is_left_child = ((node_index & 1u) == 0u);
367 const size_t sibling = node_index | 1u;
368 if (sibling <= tree_size && segment_size_bits[sibling]) {
371 if (node_min_prefix_excess[sibling] <= remaining_delta &&
372 remaining_delta <= node_max_prefix_excess[sibling]) {
373 return descend_fwd(sibling, remaining_delta, segment_base);
376 remaining_delta -= node_total_excess[sibling];
377 segment_base += segment_size_bits[sibling];
378 if (remaining_delta == 0) {
394 size_t bwdsearch(
const size_t& start_position,
const int& delta)
const {
395 if (start_position > num_bits || start_position == 0) {
400 const size_t leaf_block_index = block_of(start_position - 1);
401 const size_t block_begin = leaf_block_index * block_bits;
403 const size_t leaf_result = leaf_bwd_bp_simd(
404 leaf_block_index, block_begin, start_position, delta, leaf_delta);
405 if (leaf_result !=
npos) {
411 int remaining_delta = leaf_delta + delta;
412 size_t node_index = leaf_index_of(block_begin);
413 size_t segment_base = block_begin;
414 while (node_index > 1) {
415 if (node_index & 1) {
416 const size_t sibling_index = node_index ^ 1;
417 const size_t sibling_border =
419 const int needed_inside_sibling =
421 node_total_excess[sibling_index];
423 const bool allow_right_border =
424 (sibling_border != start_position);
427 if (needed_inside_sibling == 0 ||
428 (node_min_prefix_excess[sibling_index] <= needed_inside_sibling &&
429 needed_inside_sibling <= node_max_prefix_excess[sibling_index])) {
430 const size_t result = descend_bwd(
431 sibling_index, sibling_border - segment_size_bits[sibling_index],
432 needed_inside_sibling, sibling_border, allow_right_border);
433 if (result !=
npos) {
438 if (needed_inside_sibling == node_total_excess[sibling_index] &&
439 sibling_border < start_position) {
440 return sibling_border;
444 remaining_delta += node_total_excess[sibling_index];
445 segment_base -= segment_size_bits[sibling_index];
459 const size_t& range_end)
const {
460 if (range_begin > range_end || range_end >= num_bits) {
464 const size_t begin_block_index = block_of(range_begin);
465 const size_t begin_block_start = begin_block_index * block_bits;
466 const size_t begin_block_end =
467 std::min(num_bits, begin_block_start + block_bits);
468 const size_t end_block_index = block_of(range_end);
469 const size_t end_block_start = end_block_index * block_bits;
471 int best_value = INT_MAX;
472 size_t best_position =
npos;
473 size_t chosen_node_index = 0;
474 int prefix_excess = 0, prefix_at_choice = 0;
477 int min_prefix_first_chunk = INT_MAX;
478 size_t first_chunk_position =
npos;
479 const size_t end_of_first_chunk = std::min(
480 range_end, (
size_t)(begin_block_end ? begin_block_end - 1 : 0));
481 if (range_begin <= end_of_first_chunk) {
482 first_min_value_pos8(range_begin, end_of_first_chunk,
483 min_prefix_first_chunk, first_chunk_position);
485 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
486 int64_t(end_of_first_chunk + 1 - range_begin);
487 best_value = min_prefix_first_chunk;
488 best_position = first_chunk_position;
489 chosen_node_index = 0;
493 if (begin_block_index + 1 <= end_block_index - 1) {
494 size_t left_index = first_leaf_index + (begin_block_index + 1);
495 size_t right_index = first_leaf_index + (end_block_index - 1);
496 size_t right_nodes[64];
497 int right_nodes_count = 0;
499 while (left_index <= right_index) {
500 if (left_index & 1) {
501 const size_t node_index = left_index++;
502 const int candidate =
503 prefix_excess + node_min_prefix_excess[node_index];
504 if (candidate < best_value) {
505 best_value = candidate;
506 best_position =
npos;
507 chosen_node_index = node_index;
508 prefix_at_choice = prefix_excess;
510 prefix_excess += node_total_excess[node_index];
512 if ((right_index & 1) == 0) {
513 right_nodes[right_nodes_count++] = right_index--;
518 while (right_nodes_count--) {
519 const size_t node_index = right_nodes[right_nodes_count];
520 const int candidate =
521 prefix_excess + node_min_prefix_excess[node_index];
522 if (candidate < best_value) {
523 best_value = candidate;
524 best_position =
npos;
525 chosen_node_index = node_index;
526 prefix_at_choice = prefix_excess;
528 prefix_excess += node_total_excess[node_index];
533 if (end_block_index != begin_block_index) {
534 int min_prefix_last_chunk;
535 size_t last_chunk_position;
536 first_min_value_pos8(end_block_start, range_end, min_prefix_last_chunk,
537 last_chunk_position);
538 const int candidate = prefix_excess + min_prefix_last_chunk;
539 if (candidate < best_value) {
540 best_value = candidate;
541 best_position = last_chunk_position;
542 chosen_node_index = 0;
546 if (best_position !=
npos) {
547 return best_position;
550 return descend_first_min(chosen_node_index, best_value - prefix_at_choice,
551 node_base(chosen_node_index));
561 const size_t& range_end)
const {
562 if (range_begin > range_end || range_end >= num_bits) {
566 if (min_position ==
npos) {
579 const size_t& range_end)
const {
580 if (range_begin > range_end || range_end >= num_bits) {
584 const size_t begin_block_index = block_of(range_begin);
585 const size_t begin_block_start = begin_block_index * block_bits;
586 const size_t begin_block_end =
587 std::min(num_bits, begin_block_start + block_bits);
588 const size_t end_block_index = block_of(range_end);
589 const size_t end_block_start = end_block_index * block_bits;
591 int best_value = INT_MIN;
592 size_t best_position =
npos;
593 size_t chosen_node_index = 0;
594 int prefix_excess = 0, prefix_at_choice = 0;
597 int max_prefix_first_chunk = INT_MIN;
598 size_t first_chunk_position =
npos;
599 const size_t end_of_first_chunk = std::min(
600 range_end, (
size_t)(begin_block_end ? begin_block_end - 1 : 0));
601 if (range_begin <= end_of_first_chunk) {
602 first_max_value_pos8(range_begin, end_of_first_chunk,
603 max_prefix_first_chunk, first_chunk_position);
605 (int64_t)rank1_in_block(range_begin, end_of_first_chunk + 1) * 2 -
606 int64_t(end_of_first_chunk + 1 - range_begin);
607 best_value = max_prefix_first_chunk;
608 best_position = first_chunk_position;
609 chosen_node_index = 0;
613 if (begin_block_index + 1 <= end_block_index - 1) {
614 size_t left_index = first_leaf_index + (begin_block_index + 1);
615 size_t right_index = first_leaf_index + (end_block_index - 1);
616 size_t right_nodes[64];
617 int right_nodes_count = 0;
619 while (left_index <= right_index) {
620 if (left_index & 1) {
621 const size_t node_index = left_index++;
622 const int candidate =
623 prefix_excess + node_max_prefix_excess[node_index];
624 if (candidate > best_value) {
625 best_value = candidate;
626 best_position =
npos;
627 chosen_node_index = node_index;
628 prefix_at_choice = prefix_excess;
630 prefix_excess += node_total_excess[node_index];
632 if ((right_index & 1) == 0) {
633 right_nodes[right_nodes_count++] = right_index--;
638 while (right_nodes_count--) {
639 const size_t node_index = right_nodes[right_nodes_count];
640 const int candidate =
641 prefix_excess + node_max_prefix_excess[node_index];
642 if (candidate > best_value) {
643 best_value = candidate;
644 best_position =
npos;
645 chosen_node_index = node_index;
646 prefix_at_choice = prefix_excess;
648 prefix_excess += node_total_excess[node_index];
653 if (end_block_index != begin_block_index) {
654 int max_prefix_last_chunk;
655 size_t last_chunk_position;
656 first_max_value_pos8(end_block_start, range_end, max_prefix_last_chunk,
657 last_chunk_position);
658 const int candidate = prefix_excess + max_prefix_last_chunk;
659 if (candidate > best_value) {
660 best_value = candidate;
661 best_position = last_chunk_position;
662 chosen_node_index = 0;
666 if (best_position !=
npos) {
667 return best_position;
670 return descend_first_max(chosen_node_index, best_value - prefix_at_choice,
671 node_base(chosen_node_index));
679 const size_t& range_end)
const {
680 if (range_begin > range_end || range_end >= num_bits) {
684 if (max_position ==
npos) {
694 size_t mincount(
const size_t& range_begin,
const size_t& range_end)
const {
695 if (range_begin > range_end || range_end >= num_bits) {
699 const size_t begin_block_index = block_of(range_begin);
700 const size_t begin_block_start = begin_block_index * block_bits;
701 const size_t begin_block_end =
702 std::min(num_bits, begin_block_start + block_bits);
703 const size_t end_block_index = block_of(range_end);
704 const size_t end_block_start = end_block_index * block_bits;
706 int best_value = INT_MAX;
707 size_t min_count = 0;
708 int prefix_excess = 0;
712 int current_excess = 0, min_value = INT_MAX, local_count = 0;
713 const size_t end_of_first_chunk =
714 std::min(range_end, begin_block_end - 1);
715 for (
size_t position = range_begin; position <= end_of_first_chunk;
717 current_excess += bit(position) ? +1 : -1;
718 if (current_excess < min_value) {
719 min_value = current_excess;
721 }
else if (current_excess == min_value) {
725 best_value = min_value;
726 min_count = local_count;
727 prefix_excess = current_excess;
731 if (begin_block_index + 1 <= end_block_index - 1) {
732 const auto middle_nodes =
733 cover_blocks(begin_block_index + 1, end_block_index - 1);
734 for (
const size_t& node_index : middle_nodes) {
735 const int candidate =
736 prefix_excess + node_min_prefix_excess[node_index];
737 if (candidate < best_value) {
738 best_value = candidate;
739 min_count = node_min_count[node_index];
740 }
else if (candidate == best_value) {
741 min_count += node_min_count[node_index];
743 prefix_excess += node_total_excess[node_index];
748 if (end_block_index != begin_block_index) {
749 int current_excess = 0, min_value = INT_MAX, local_count = 0;
750 for (
size_t position = end_block_start; position <= range_end;
752 current_excess += bit(position) ? +1 : -1;
753 if (current_excess < min_value) {
754 min_value = current_excess;
756 }
else if (current_excess == min_value) {
760 const int candidate = prefix_excess + min_value;
761 if (candidate < best_value) {
762 best_value = candidate;
763 min_count = local_count;
764 }
else if (candidate == best_value) {
765 min_count += local_count;
778 const size_t& range_end,
779 size_t target_min_rank)
const {
780 if (range_begin > range_end || range_end >= num_bits ||
781 target_min_rank == 0) {
785 const size_t begin_block_index = block_of(range_begin);
786 const size_t begin_block_start = begin_block_index * block_bits;
787 const size_t begin_block_end =
788 std::min(num_bits, begin_block_start + block_bits);
789 const size_t end_block_index = block_of(range_end);
790 const size_t end_block_start = end_block_index * block_bits;
793 const size_t end_of_first_chunk = std::min(range_end, begin_block_end - 1);
794 int current_first_chunk_excess = 0, min_first_chunk = 0;
795 uint32_t count_first_chunk = 0;
797 if (range_begin <= end_of_first_chunk) {
798 scan_range_min_count8(range_begin, end_of_first_chunk,
799 current_first_chunk_excess, min_first_chunk,
802 current_first_chunk_excess = 0;
803 min_first_chunk = INT_MAX;
804 count_first_chunk = 0;
807 int best_value = (min_first_chunk == INT_MAX ? INT_MAX : min_first_chunk);
809 (min_first_chunk == INT_MAX ? 0u : (size_t)count_first_chunk);
810 int prefix_excess = current_first_chunk_excess;
812 size_t left_index = first_leaf_index + begin_block_index + 1;
813 size_t right_index = first_leaf_index + end_block_index - 1;
814 size_t right_nodes[64];
815 int right_nodes_count = 0;
818 if (begin_block_index + 1 <= end_block_index - 1) {
819 while (left_index <= right_index) {
820 if (left_index & 1) {
821 const int candidate =
822 prefix_excess + node_min_prefix_excess[left_index];
823 if (candidate < best_value) {
824 best_value = candidate;
825 total_count = node_min_count[left_index];
826 }
else if (candidate == best_value) {
827 total_count += node_min_count[left_index];
829 prefix_excess += node_total_excess[left_index++];
831 if ((right_index & 1) == 0) {
832 right_nodes[right_nodes_count++] = right_index--;
837 while (right_nodes_count--) {
838 const size_t node_index = right_nodes[right_nodes_count];
839 const int candidate =
840 prefix_excess + node_min_prefix_excess[node_index];
841 if (candidate < best_value) {
842 best_value = candidate;
843 total_count = node_min_count[node_index];
844 }
else if (candidate == best_value) {
845 total_count += node_min_count[node_index];
847 prefix_excess += node_total_excess[node_index];
852 int current_last_chunk_excess = 0, min_last_chunk = INT_MAX;
853 uint32_t count_last_chunk = 0;
854 if (end_block_index != begin_block_index) {
855 scan_range_min_count8(end_block_start, range_end,
856 current_last_chunk_excess, min_last_chunk,
858 const int candidate = prefix_excess + min_last_chunk;
859 if (candidate < best_value) {
860 best_value = candidate;
861 total_count = count_last_chunk;
862 }
else if (candidate == best_value) {
863 total_count += count_last_chunk;
867 if (target_min_rank > total_count) {
872 if (min_first_chunk == best_value && count_first_chunk) {
873 if (target_min_rank <= count_first_chunk) {
874 return qth_min_in_block(range_begin, end_of_first_chunk,
877 target_min_rank -= count_first_chunk;
881 prefix_excess = current_first_chunk_excess;
882 if (begin_block_index + 1 <= end_block_index - 1) {
883 left_index = first_leaf_index + (begin_block_index + 1);
884 right_index = first_leaf_index + (end_block_index - 1);
885 right_nodes_count = 0;
886 while (left_index <= right_index) {
887 if (left_index & 1) {
888 const size_t node_index = left_index++;
889 const int candidate =
890 prefix_excess + node_min_prefix_excess[node_index];
891 if (candidate == best_value) {
892 if (target_min_rank <= node_min_count[node_index]) {
893 return descend_qth_min(node_index, best_value - prefix_excess,
894 target_min_rank, node_base(node_index));
896 target_min_rank -= node_min_count[node_index];
898 prefix_excess += node_total_excess[node_index];
900 if (!(right_index & 1)) {
901 right_nodes[right_nodes_count++] = right_index--;
906 while (right_nodes_count--) {
907 const size_t node_index = right_nodes[right_nodes_count];
908 const int candidate =
909 prefix_excess + node_min_prefix_excess[node_index];
910 if (candidate == best_value) {
911 if (target_min_rank <= node_min_count[node_index]) {
912 return descend_qth_min(node_index, best_value - prefix_excess,
913 target_min_rank, node_base(node_index));
915 target_min_rank -= node_min_count[node_index];
917 prefix_excess += node_total_excess[node_index];
922 if (end_block_index != begin_block_index &&
923 (prefix_excess + min_last_chunk) == best_value) {
924 return qth_min_in_block(end_block_start, range_end, target_min_rank);
936 inline size_t close(
const size_t& open_position)
const {
937 if (open_position >= num_bits) {
947 inline size_t open(
const size_t& close_position)
const {
949 if (close_position == 0 || close_position > num_bits) {
952 const size_t result =
bwdsearch(close_position, 0);
953 return (result ==
npos ?
npos : result + 1);
961 inline size_t enclose(
const size_t& position)
const {
962 if (position == 0 || position > num_bits) {
965 const size_t result =
bwdsearch(position, -2);
966 return (result ==
npos ?
npos : result + 1);
975 static inline size_t pop10_in_slice64(
const std::uint64_t& slice,
976 const int& length)
noexcept {
980 std::uint64_t pattern_mask = slice & ~(slice >> 1);
982 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
984 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
986 return (
size_t)std::popcount(pattern_mask);
993 size_t rank1_in_block(
const size_t& block_begin,
994 const size_t& block_end)
const noexcept {
995 if (block_end <= block_begin) {
998 size_t left_word_index = block_begin >> 6;
999 const size_t right_word_index = block_end >> 6;
1000 size_t left_offset = block_begin & 63;
1001 const size_t right_offset = block_end & 63;
1003 if (left_word_index == right_word_index) {
1004 const std::uint64_t mask =
1005 ((right_offset == 0) ? 0 : ((std::uint64_t(1) << right_offset) - 1)) &
1006 (~std::uint64_t(0) << left_offset);
1007 return (
size_t)std::popcount(bits[left_word_index] & mask);
1010 count += (size_t)std::popcount(bits[left_word_index] &
1011 (~std::uint64_t(0) << left_offset));
1014 while (left_word_index < right_word_index) {
1015 count += (size_t)std::popcount(bits[left_word_index]);
1019 count += (size_t)std::popcount(bits[right_word_index] &
1020 ((std::uint64_t(1) << right_offset) - 1));
1029 size_t rr_in_block(
const size_t& block_begin,
1030 const size_t& block_end)
const noexcept {
1031 if (block_end <= block_begin + 1) {
1034 size_t left_word_index = block_begin >> 6;
1035 const size_t right_word_index = (block_end - 1) >> 6;
1036 const int left_offset = block_begin & 63;
1037 const int right_offset = (block_end - 1) & 63;
1040 if (left_word_index == right_word_index) {
1041 const int length = right_offset - left_offset + 1;
1042 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1043 return pop10_in_slice64(slice, length);
1048 const int length = 64 - left_offset;
1049 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1050 count += pop10_in_slice64(slice, length);
1053 for (
size_t word_index = left_word_index + 1; word_index < right_word_index;
1055 const std::uint64_t word = bits[word_index];
1056 count += pop10_in_slice64(word, 64);
1060 const int length = right_offset + 1;
1061 const std::uint64_t mask = (length == 64)
1063 : ((std::uint64_t(1) << length) - 1);
1064 const std::uint64_t slice = bits[right_word_index] & mask;
1065 count += pop10_in_slice64(slice, length);
1068 for (
size_t word_index = left_word_index; word_index < right_word_index;
1070 if (((bits[word_index] >> 63) & 1u) &&
1071 ((bits[word_index + 1] & 1u) == 0)) {
1083 size_t select10_in_block(
const size_t& block_begin,
1084 const size_t& block_end,
1085 size_t target_pattern_rank)
const noexcept {
1086 if (block_end <= block_begin + 1) {
1089 size_t left_word_index = block_begin >> 6;
1090 const size_t right_word_index = (block_end - 1) >> 6;
1091 const int left_offset = block_begin & 63;
1092 const int right_offset = (block_end - 1) & 63;
1094 const auto select_in_masked_slice =
1095 [&](
const std::uint64_t& slice,
const int& length,
1096 const size_t& target_index)
noexcept ->
int {
1100 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1102 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1104 pattern_mask &= 0x7FFFFFFFFFFFFFFFull;
1106 return select_in_word(pattern_mask, target_index);
1109 if (left_word_index == right_word_index) {
1110 const int length = right_offset - left_offset + 1;
1111 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1113 select_in_masked_slice(slice, length, target_pattern_rank);
1114 return offset >= 0 ? (block_begin + (size_t)offset) :
npos;
1119 const int length = 64 - left_offset;
1120 const std::uint64_t slice = bits[left_word_index] >> left_offset;
1121 std::uint64_t pattern_mask = slice & ~(slice >> 1);
1122 pattern_mask &= ((std::uint64_t(1) << (length - 1)) - 1);
1123 const int count = std::popcount(pattern_mask);
1124 if (target_pattern_rank <= (
size_t)count) {
1126 select_in_masked_slice(slice, length, target_pattern_rank);
1127 return block_begin + (size_t)offset;
1129 target_pattern_rank -= count;
1133 for (
size_t word_index = left_word_index; word_index + 1 < right_word_index;
1136 if (((bits[word_index] >> 63) & 1u) &&
1137 ((bits[word_index + 1] & 1u) == 0)) {
1138 if (--target_pattern_rank == 0) {
1139 return (word_index << 6) + 63;
1143 const std::uint64_t next_word = bits[word_index + 1];
1144 const std::uint64_t pattern_mask =
1145 (next_word & ~(next_word >> 1)) & 0x7FFFFFFFFFFFFFFFull;
1146 const int count = std::popcount(pattern_mask);
1147 if (target_pattern_rank <= (
size_t)count) {
1148 const int offset = select_in_word(pattern_mask, target_pattern_rank);
1152 return ((word_index + 1) << 6) + (size_t)offset;
1154 target_pattern_rank -= count;
1158 if (((bits[right_word_index - 1] >> 63) & 1u) &&
1159 ((bits[right_word_index] & 1u) == 0)) {
1160 if (--target_pattern_rank == 0) {
1161 return ((right_word_index - 1) << 6) + 63;
1167 const int length = right_offset + 1;
1168 const std::uint64_t mask = (length == 64)
1170 : ((std::uint64_t(1) << length) - 1);
1171 const std::uint64_t slice = bits[right_word_index] & mask;
1173 select_in_masked_slice(slice, length, target_pattern_rank);
1175 return (right_word_index << 6) + (size_t)offset;
1182 int8_t excess_total;
1186 uint8_t pattern10_count;
1189 uint8_t pos_first_min;
1190 uint8_t pos_first_max;
1194 std::array<ByteAgg, 256> agg;
1199 std::array<std::array<int8_t, 17>, 256> fwd_pos;
1204 std::array<std::array<int8_t, 17>, 256> bwd_pos;
1210 static inline const LUT8Tables& LUT8_ALL() noexcept {
1211 static const LUT8Tables tables = [] {
1212 LUT8Tables lookup_tables{};
1213 for (
int byte_value = 0; byte_value < 256; ++byte_value) {
1214 int current_excess = 0, min_prefix = INT_MAX, max_prefix = INT_MIN,
1215 min_count = 0, pattern10_count = 0;
1216 int first_min_position = 0, first_max_position = 0;
1218 const auto bit_at = [&](
const int& bit_index) {
1219 return (byte_value >> bit_index) & 1;
1221 for (
int bit_index = 0; bit_index < 8; ++bit_index) {
1222 int bit_value = bit_at(bit_index);
1223 if (bit_index + 1 < 8 && bit_value && bit_at(bit_index + 1) == 0) {
1226 current_excess += bit_value ? +1 : -1;
1227 prefixes[bit_index] = current_excess;
1228 if (current_excess < min_prefix) {
1229 min_prefix = current_excess;
1231 first_min_position = bit_index;
1232 }
else if (current_excess == min_prefix) {
1235 if (current_excess > max_prefix) {
1236 max_prefix = current_excess;
1237 first_max_position = bit_index;
1240 ByteAgg aggregates{};
1241 aggregates.excess_total = current_excess;
1242 aggregates.min_prefix = (min_prefix == INT_MAX ? 0 : min_prefix);
1243 aggregates.max_prefix = (max_prefix == INT_MIN ? 0 : max_prefix);
1244 aggregates.min_count = min_count;
1245 aggregates.pattern10_count = pattern10_count;
1246 aggregates.first_bit = bit_at(0);
1247 aggregates.last_bit = bit_at(7);
1248 aggregates.pos_first_min = first_min_position;
1249 aggregates.pos_first_max = first_max_position;
1250 lookup_tables.agg[byte_value] = aggregates;
1251 auto& forward_positions = lookup_tables.fwd_pos[byte_value];
1252 auto& backward_positions = lookup_tables.bwd_pos[byte_value];
1253 forward_positions.fill(-1);
1254 backward_positions.fill(-1);
1255 for (
int delta = -8; delta <= 8; ++delta) {
1256 for (
int bit_index = 0; bit_index < 8; ++bit_index) {
1257 if (prefixes[bit_index] == delta) {
1258 forward_positions[delta + 8] = bit_index;
1262 for (
int bit_index = 7; bit_index >= 0; --bit_index) {
1263 if (prefixes[bit_index] == delta) {
1264 backward_positions[delta + 8] = bit_index;
1270 return lookup_tables;
1278 static inline const std::array<ByteAgg, 256>& LUT8() noexcept {
1279 return LUT8_ALL().agg;
1285 static inline const std::array<std::array<int8_t, 17>, 256>&
1286 LUT8_FWD_POS() noexcept {
1287 return LUT8_ALL().fwd_pos;
1293 static inline const std::array<std::array<int8_t, 17>, 256>&
1294 LUT8_BWD_POS() noexcept {
1295 return LUT8_ALL().bwd_pos;
1301 inline uint16_t get_u16(
const size_t& position)
const noexcept {
1302 const size_t word_index = position >> 6;
1303 const unsigned offset = unsigned(position & 63);
1304 const std::uint64_t w0 =
1305 (word_index < bits.size()) ? bits[word_index] : 0ULL;
1307 return uint16_t(w0 & 0xFFFFu);
1309 const std::uint64_t w1 =
1310 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0ULL;
1311 const std::uint64_t v = (w0 >> offset) | (w1 << (64u - offset));
1312 return uint16_t(v & 0xFFFFu);
1315#if defined(PIXIE_AVX2_SUPPORT)
1316 static inline __m256i bit_masks_16x() noexcept {
1318 return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020,
1319 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800,
1320 0x1000, 0x2000, 0x4000, (int16_t)0x8000);
1323 static inline __m256i prefix_sum_16x_i16(__m256i v)
noexcept {
1327 __m256i t = _mm256_slli_si256(x, 2);
1328 x = _mm256_add_epi16(x, t);
1329 t = _mm256_slli_si256(x, 4);
1330 x = _mm256_add_epi16(x, t);
1331 t = _mm256_slli_si256(x, 8);
1332 x = _mm256_add_epi16(x, t);
1334 __m128i lo = _mm256_extracti128_si256(x, 0);
1335 __m128i hi = _mm256_extracti128_si256(x, 1);
1336 const int16_t carry =
1337 (int16_t)_mm_extract_epi16(lo, 7);
1338 hi = _mm_add_epi16(hi, _mm_set1_epi16(carry));
1340 __m256i out = _mm256_castsi128_si256(lo);
1341 out = _mm256_inserti128_si256(out, hi, 1);
1345 static inline int16_t last_prefix_16x_i16(__m256i pref)
noexcept {
1346 __m128i hi = _mm256_extracti128_si256(pref, 1);
1347 return (int16_t)_mm_extract_epi16(hi, 7);
1356 inline size_t scan_leaf_fwd_simd(
const size_t& start,
1358 const int& required_delta,
1359 int* out_total)
const noexcept {
1366 if (required_delta < -32768 || required_delta > 32767) {
1368 const int len = int(end - start);
1369 const int ones = int(rank1_in_block(start, end));
1370 *out_total = ones * 2 - len;
1375 static const __m256i masks = bit_masks_16x();
1376 static const __m256i vzero = _mm256_setzero_si256();
1377 static const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1378 static const __m256i vminus1 = _mm256_set1_epi16(-1);
1379 static const __m256i vtwo = _mm256_set1_epi16(2);
1380 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1384 while (pos + 16 <= end) {
1385 const uint16_t bits16 = get_u16(pos);
1386 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1387 const __m256i m = _mm256_and_si256(vb, masks);
1388 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1389 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1390 const __m256i steps =
1391 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1393 const __m256i pref_rel = prefix_sum_16x_i16(steps);
1394 const __m256i base = _mm256_set1_epi16((int16_t)cur);
1395 const __m256i pref = _mm256_add_epi16(pref_rel, base);
1396 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1397 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1399 const int lane = int(std::countr_zero(mask)) >> 1;
1400 return pos + (size_t)lane;
1402 cur += (int)last_prefix_16x_i16(pref_rel);
1406 cur += bit(pos) ? +1 : -1;
1407 if (cur == required_delta) {
1425 inline size_t scan_leaf_fwd_lut8_fast(
const size_t& start,
1427 const int& required_delta,
1428 int* out_total)
const noexcept {
1438 while (pos < end && (pos & 7)) {
1439 cur += bit(pos) ? +1 : -1;
1440 if (cur == required_delta) {
1452 const uint8_t* bytep =
1453 reinterpret_cast<const uint8_t*
>(bits.data()) + (pos >> 3);
1454 const auto& agg = LUT8();
1455 const auto& fwd = LUT8_FWD_POS();
1457 while (pos + 8 <= end) {
1458 const uint8_t bv = *bytep++;
1459 const auto& a = agg[bv];
1460 const int need = required_delta - cur;
1462 if ((
unsigned)(need + 8) <= 16u) {
1464 if (need >= a.min_prefix && need <= a.max_prefix) {
1465 const int8_t off = fwd[bv][need + 8];
1468 *out_total = cur + a.excess_total;
1471 return pos + (size_t)off;
1475 cur += a.excess_total;
1480 cur += bit(pos) ? +1 : -1;
1481 if (cur == required_delta) {
1502 inline size_t scan_leaf_fwd(
const size_t& search_start,
1503 const size_t& search_end,
1504 const int& required_delta)
const noexcept {
1505 if (search_start >= search_end) {
1508 const auto& aggregates_table = LUT8();
1509 const auto& forward_lookup = LUT8_FWD_POS();
1510 int current_excess = 0;
1511 size_t position = search_start;
1512 while (position + 8 <= search_end) {
1513 const uint8_t byte_value = get_byte(position);
1514 const auto& byte_aggregate = aggregates_table[byte_value];
1515 const int local_need = required_delta - current_excess;
1516 if (local_need >= byte_aggregate.min_prefix &&
1517 local_need <= byte_aggregate.max_prefix && local_need >= -8 &&
1519 const int8_t offset = forward_lookup[byte_value][local_need + 8];
1521 return position + size_t(offset);
1524 current_excess += byte_aggregate.excess_total;
1528 while (position < search_end) {
1529 current_excess += bit(position) ? 1 : -1;
1530 if (current_excess == required_delta) {
1547 inline size_t scan_leaf_bwd(
1548 const size_t& block_begin,
1549 const size_t& block_end,
1550 const int& required_delta,
1551 const bool& allow_right_boundary,
1552 const size_t& global_right_border,
1553 const int& prefix_at_boundary_max )
const noexcept {
1557 if (block_begin > block_end) {
1563 size_t boundary_max = block_end;
1564 if (!allow_right_boundary && boundary_max > block_begin) {
1569 if (block_begin >= boundary_max) {
1570 if ((block_begin < global_right_border || allow_right_boundary) &&
1571 required_delta == 0) {
1577 if (required_delta < -32768 || required_delta > 32767) {
1579 if ((block_begin < global_right_border || allow_right_boundary) &&
1580 required_delta == 0) {
1586#if defined(PIXIE_AVX2_SUPPORT)
1592 int prefix_end = prefix_at_boundary_max;
1593 if (prefix_end == kNoPrefixOverride) {
1594 const int len = int(boundary_max - block_begin);
1595 const int ones = int(rank1_in_block(block_begin, boundary_max));
1596 prefix_end = ones * 2 - len;
1599 const __m256i masks = bit_masks_16x();
1600 const __m256i vzero = _mm256_setzero_si256();
1601 const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
1602 const __m256i vminus1 = _mm256_set1_epi16(-1);
1603 const __m256i vtwo = _mm256_set1_epi16(2);
1604 const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
1606 size_t pos_end = boundary_max;
1607 int cur_end = prefix_end;
1610 while (pos_end >= block_begin + 16) {
1611 const size_t pos = pos_end - 16;
1612 const uint16_t bits16 = get_u16(pos);
1613 const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
1614 const __m256i m = _mm256_and_si256(vb, masks);
1615 const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
1616 const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
1617 const __m256i steps =
1618 _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
1620 const __m256i pref_rel =
1621 prefix_sum_16x_i16(steps);
1622 const int16_t sum16 =
1623 last_prefix_16x_i16(pref_rel);
1624 const int cur_start =
1625 cur_end - (int)sum16;
1627 const __m256i base = _mm256_set1_epi16((int16_t)cur_start);
1628 const __m256i pref = _mm256_add_epi16(
1630 const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
1631 const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
1633 const int bit_i = 31 - int(std::countl_zero(mask));
1634 const int lane = bit_i >> 1;
1635 const size_t boundary = pos + (size_t)lane + 1;
1636 if (boundary < global_right_border || allow_right_boundary) {
1642 cur_end = cur_start;
1646 while (pos_end > block_begin) {
1648 if (cur_end == required_delta) {
1649 if (pos_end < global_right_border || allow_right_boundary) {
1654 const size_t bit_pos = pos_end - 1;
1655 cur_end -= bit(bit_pos) ? +1 : -1;
1659 size_t last_boundary =
npos;
1661 for (
size_t pos = block_begin; pos < boundary_max; ++pos) {
1662 cur += bit(pos) ? +1 : -1;
1663 if (cur == required_delta) {
1664 last_boundary = pos + 1;
1667 if (last_boundary !=
npos) {
1668 if (last_boundary < global_right_border || allow_right_boundary) {
1669 return last_boundary;
1676 if ((block_begin < global_right_border || allow_right_boundary) &&
1677 required_delta == 0) {
1686 inline uint8_t get_byte(
const size_t& position)
const noexcept {
1687 const size_t word_index = position >> 6;
1688 const size_t offset = position & 63;
1689 const std::uint64_t lower_word = bits[word_index] >> offset;
1691 return uint8_t(lower_word & 0xFFu);
1693 const std::uint64_t higher_word =
1694 (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0;
1695 const std::uint64_t byte_value =
1696 (lower_word | (higher_word << (64 - offset))) & 0xFFu;
1697 return uint8_t(byte_value);
1708 size_t descend_first_max(
size_t node_index,
1710 size_t segment_base)
const noexcept {
1711 while (node_index < first_leaf_index) {
1712 const size_t left_child = node_index << 1, right_child = left_child | 1;
1713 const int left_max = node_max_prefix_excess[left_child];
1714 const int right_max =
1715 node_total_excess[left_child] + node_max_prefix_excess[right_child];
1716 if (left_max >= right_max && left_max == target_prefix) {
1717 node_index = left_child;
1718 }
else if (right_max == target_prefix) {
1719 segment_base += segment_size_bits[left_child];
1720 target_prefix -= node_total_excess[left_child];
1721 node_index = right_child;
1727 const size_t segment_begin = segment_base;
1728 const size_t segment_end =
1729 std::min(segment_base + segment_size_bits[node_index], num_bits);
1733 first_max_value_pos8(segment_begin,
1734 segment_end ? (segment_end - 1) : segment_begin,
1735 max_value, position);
1736 return (max_value == target_prefix ? position :
npos);
1742 size_t first_leaf_index = 1;
1748 static constexpr int kNoPrefixOverride = std::numeric_limits<int>::min();
1753 size_t block_of(
const size_t& position)
const noexcept {
1754 return position / block_bits;
1760 size_t leaf_index_of(
const size_t& block_start)
const noexcept {
1761 return first_leaf_index + block_of(block_start);
1768 size_t node_base(
size_t node_index)
const noexcept {
1769 if (node_index >= first_leaf_index) {
1770 return (node_index - first_leaf_index) * block_bits;
1774 for (; node_index > 1; node_index >>= 1) {
1775 if (node_index & 1) {
1776 base += segment_size_bits[node_index - 1];
1787 std::vector<size_t> cover_blocks(
const size_t& block_begin_index,
1788 const size_t& block_end_index)
const {
1789 size_t left_index = first_leaf_index + block_begin_index;
1790 size_t right_index = first_leaf_index + block_end_index;
1791 std::vector<size_t> left_nodes, right_nodes;
1792 while (left_index <= right_index) {
1793 if ((left_index & 1) == 1) {
1794 left_nodes.push_back(left_index++);
1796 if ((right_index & 1) == 0) {
1797 right_nodes.push_back(right_index--);
1802 std::reverse(right_nodes.begin(), right_nodes.end());
1803 left_nodes.insert(left_nodes.end(), right_nodes.begin(), right_nodes.end());
1811 size_t descend_fwd(
size_t node_index,
1813 size_t segment_base)
const noexcept {
1814 while (node_index < first_leaf_index) {
1815 const size_t left_child = node_index << 1;
1816 const size_t right_child = left_child | 1;
1817 if (node_min_prefix_excess[left_child] <= required_delta &&
1818 required_delta <= node_max_prefix_excess[left_child]) {
1819 node_index = left_child;
1821 required_delta -= node_total_excess[left_child];
1822 segment_base += segment_size_bits[left_child];
1823 node_index = right_child;
1826 const size_t seg_end =
1827 std::min(segment_base + segment_size_bits[node_index], num_bits);
1828#if defined(PIXIE_AVX2_SUPPORT)
1829 return scan_leaf_fwd_simd(segment_base, seg_end, required_delta,
nullptr);
1831 return scan_leaf_fwd(segment_base, seg_end, required_delta);
1844 size_t descend_bwd(
size_t node_index,
1845 const size_t& segment_base,
1846 const int& required_delta,
1847 const size_t& global_right_border,
1848 const bool& allow_right_boundary)
const noexcept {
1849 while (node_index < first_leaf_index) {
1850 const size_t left_child = node_index << 1;
1851 const size_t right_child = left_child | 1;
1852 const int required_in_right =
1853 required_delta - node_total_excess[left_child];
1856 if (node_min_prefix_excess[right_child] <= required_in_right &&
1857 required_in_right <= node_max_prefix_excess[right_child]) {
1858 const size_t result = descend_bwd(
1859 right_child, segment_base + segment_size_bits[left_child],
1860 required_in_right, global_right_border, allow_right_boundary);
1861 if (result !=
npos) {
1867 const size_t junction = segment_base + segment_size_bits[left_child];
1868 if (required_delta == node_total_excess[left_child] &&
1869 (junction < global_right_border || allow_right_boundary)) {
1874 if (node_min_prefix_excess[left_child] <= required_delta &&
1875 required_delta <= node_max_prefix_excess[left_child]) {
1876 node_index = left_child;
1882 if (required_delta == 0 &&
1883 (segment_base < global_right_border || allow_right_boundary)) {
1884 return segment_base;
1890 const size_t seg_end =
1891 std::min(segment_base + segment_size_bits[node_index], num_bits);
1892 const size_t block_end = std::min(global_right_border, seg_end);
1894 int prefix_override = kNoPrefixOverride;
1898 if (block_end == seg_end) {
1899 const int total = node_total_excess[node_index];
1900 if (!allow_right_boundary && block_end > segment_base) {
1901 prefix_override = total - (bit(block_end - 1) ? +1 : -1);
1903 prefix_override = total;
1907 return scan_leaf_bwd(segment_base, block_end, required_delta,
1908 allow_right_boundary, global_right_border,
1916 size_t descend_first_min(
size_t node_index,
1918 size_t segment_base)
const noexcept {
1919 while (node_index < first_leaf_index) {
1920 const size_t left_child = node_index << 1, right_child = left_child | 1;
1921 const int left_min = node_min_prefix_excess[left_child];
1922 const int right_min =
1923 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1924 if (left_min <= right_min && left_min == target_prefix) {
1925 node_index = left_child;
1926 }
else if (right_min == target_prefix) {
1927 segment_base += segment_size_bits[left_child];
1928 target_prefix -= node_total_excess[left_child];
1929 node_index = right_child;
1935 const size_t segment_begin = segment_base;
1936 const size_t segment_end =
1937 std::min(segment_base + segment_size_bits[node_index], num_bits);
1941 first_min_value_pos8(segment_begin,
1942 segment_end ? (segment_end - 1) : segment_begin,
1943 min_value, position);
1944 return (min_value == target_prefix ? position :
npos);
1951 size_t descend_qth_min(
size_t node_index,
1953 size_t target_min_rank,
1954 size_t segment_base)
const noexcept {
1955 while (node_index < first_leaf_index) {
1956 const size_t left_child = node_index << 1;
1957 const size_t right_child = left_child | 1;
1958 const int left_min = node_min_prefix_excess[left_child];
1959 const int right_min =
1960 node_total_excess[left_child] + node_min_prefix_excess[right_child];
1961 if (left_min == target_prefix) {
1962 if (node_min_count[left_child] >= target_min_rank) {
1963 node_index = left_child;
1966 target_min_rank -= node_min_count[left_child];
1968 if (right_min == target_prefix) {
1969 segment_base += segment_size_bits[left_child];
1970 target_prefix -= node_total_excess[left_child];
1971 node_index = right_child;
1976 return qth_min_in_block(
1978 std::min(segment_base + segment_size_bits[node_index], num_bits) - 1,
1986 size_t select1_in_block(
const size_t& block_begin,
1987 const size_t& block_end,
1988 size_t target_one_rank)
const noexcept {
1989 size_t left_word_index = block_begin >> 6;
1990 const size_t right_word_index = (block_end >> 6);
1991 const size_t left_offset = block_begin & 63;
1992 const std::uint64_t left_mask =
1993 (left_offset ? (~std::uint64_t(0) << left_offset) : ~std::uint64_t(0));
1994 if (left_word_index == right_word_index) {
1995 const std::uint64_t word =
1996 bits[left_word_index] & left_mask &
1997 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
1998 : ~std::uint64_t(0));
1999 return block_begin + select_in_word(word, target_one_rank);
2003 const std::uint64_t word = bits[left_word_index] & left_mask;
2004 const int count = std::popcount(word);
2005 if (target_one_rank <= (
size_t)count) {
2006 return block_begin + select_in_word(word, target_one_rank);
2008 target_one_rank -= count;
2012 while (left_word_index < right_word_index) {
2013 const std::uint64_t word = bits[left_word_index];
2014 const int count = std::popcount(word);
2015 if (target_one_rank <= (
size_t)count) {
2016 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2018 target_one_rank -= count;
2022 const size_t right_offset = block_end & 63;
2024 const std::uint64_t word =
2025 bits[left_word_index] & ((std::uint64_t(1) << right_offset) - 1);
2026 const int count = std::popcount(word);
2027 if (target_one_rank <= (
size_t)count) {
2028 return (left_word_index << 6) + select_in_word(word, target_one_rank);
2038 size_t select0_in_block(
const size_t& block_begin,
2039 const size_t& block_end,
2040 size_t target_zero_rank)
const noexcept {
2041 if (block_end <= block_begin) {
2045 size_t left_word_index = block_begin >> 6;
2046 const size_t right_word_index = block_end >> 6;
2047 const size_t left_offset = block_begin & 63;
2049 if (left_word_index == right_word_index) {
2050 const std::uint64_t left_mask =
2051 (left_offset ? (~std::uint64_t(0) << left_offset)
2052 : ~std::uint64_t(0));
2053 const std::uint64_t right_mask =
2054 ((block_end & 63) ? ((std::uint64_t(1) << (block_end & 63)) - 1)
2055 : ~std::uint64_t(0));
2056 const std::uint64_t word =
2057 (~bits[left_word_index]) & left_mask & right_mask;
2058 const int offset = select_in_word(word, target_zero_rank);
2059 return (offset >= 0) ? (block_begin + (size_t)offset) :
npos;
2064 const std::uint64_t word =
2065 (~bits[left_word_index]) & (~std::uint64_t(0) << left_offset);
2066 const int count = std::popcount(word);
2067 if (target_zero_rank <= (
size_t)count) {
2068 const int offset = select_in_word(word, target_zero_rank);
2069 return (offset >= 0) ? (block_begin + (size_t)offset) :
npos;
2071 target_zero_rank -= count;
2076 while (left_word_index < right_word_index) {
2077 const std::uint64_t word = ~bits[left_word_index];
2078 const int count = std::popcount(word);
2079 if (target_zero_rank <= (
size_t)count) {
2080 const int offset = select_in_word(word, target_zero_rank);
2081 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) :
npos;
2083 target_zero_rank -= count;
2088 const size_t right_offset = block_end & 63;
2090 const std::uint64_t word =
2091 (~bits[left_word_index]) & ((std::uint64_t(1) << right_offset) - 1);
2092 const int count = std::popcount(word);
2093 if (target_zero_rank <= (
size_t)count) {
2094 const int offset = select_in_word(word, target_zero_rank);
2095 return (offset >= 0) ? ((left_word_index << 6) + (size_t)offset) :
npos;
2105 static inline int select_in_word(std::uint64_t word,
2106 size_t target_rank)
noexcept {
2108 if (--target_rank == 0) {
2109 return std::countr_zero(word);
2119 static inline size_t ceil_div(
const size_t& numerator,
2120 const size_t& denominator)
noexcept {
2121 return (numerator + denominator - 1) / denominator;
2129 static inline size_t nodeslots_for(
const size_t& bit_count,
2130 const size_t& block_size_pow2)
noexcept {
2131 if (bit_count == 0) {
2134 size_t leaf_node_count = ceil_div(bit_count, block_size_pow2);
2135 return std::bit_ceil(std::max<size_t>(1, leaf_node_count)) +
2142 static inline float overhead_for(
const size_t& bit_count,
2143 const size_t& block_size_pow2)
noexcept {
2144 static constexpr size_t AUX_SLOT_BYTES =
2145 sizeof(uint32_t) +
sizeof(int32_t) +
sizeof(int32_t) +
sizeof(int32_t) +
2146 sizeof(uint32_t) +
sizeof(uint32_t) +
sizeof(uint8_t) +
sizeof(uint8_t);
2148 size_t bitvector_bytes = ceil_div(bit_count, 64) * 8;
2149 if (bitvector_bytes == 0) {
2152 size_t slot_count = nodeslots_for(bit_count, block_size_pow2);
2153 size_t aux_bytes = slot_count * AUX_SLOT_BYTES;
2154 return ((
float)aux_bytes) / ((float)bitvector_bytes);
2163 static inline size_t choose_block_bits_for_overhead(
2164 const size_t& bit_count,
2165 const float& overhead_cap)
noexcept {
2166 if (overhead_cap < 0.f) {
2170 const size_t max_block_bits = std::min<size_t>(bit_count, 16384);
2171 size_t candidate_block_bits = 64;
2172 while (candidate_block_bits < max_block_bits) {
2173 if (overhead_for(bit_count, candidate_block_bits) <= overhead_cap) {
2176 candidate_block_bits <<= 1;
2178 return candidate_block_bits;
2186 void build_from_string(
const std::string& bit_string_input,
2187 const size_t& leaf_block_bits = 0,
2188 const float& max_overhead = -1.0) {
2189 num_bits = bit_string_input.size();
2190 bits.assign((num_bits + 63) / 64, 0);
2191 for (
size_t i = 0; i < num_bits; ++i) {
2192 if (bit_string_input[i] ==
'1') {
2196 build(leaf_block_bits, max_overhead);
2206 void build_from_words(
const std::vector<std::uint64_t>& words,
2207 const size_t& bit_count,
2208 const size_t& leaf_block_bits = 0,
2209 const float& max_overhead = -1.0) {
2211 num_bits = bit_count;
2212 if (bits.size() * 64 < num_bits) {
2213 bits.resize((num_bits + 63) / 64);
2215 build(leaf_block_bits, max_overhead);
2221 inline int bit(
const size_t& position)
const noexcept {
2222 return (bits[position >> 6] >> (position & 63)) & 1u;
2228 inline void set1(
const size_t& position)
noexcept {
2229 bits[position >> 6] |= (std::uint64_t(1) << (position & 63));
2236 inline uint32_t ones_in_node(
const size_t& node_index)
const noexcept {
2237 return ((int64_t)segment_size_bits[node_index] +
2238 (int64_t)node_total_excess[node_index]) >>
2249 inline void scan_range_min_count8(
size_t range_begin,
2250 const size_t& range_end,
2251 int& current_excess,
2253 uint32_t& count)
const noexcept {
2255 min_value = INT_MAX;
2257 if (range_end < range_begin) {
2262 while (range_begin <= range_end && (range_begin & 7)) {
2263 current_excess += bit(range_begin) ? +1 : -1;
2264 if (current_excess < min_value) {
2265 min_value = current_excess;
2267 }
else if (current_excess == min_value) {
2273 const auto& aggregates_table = LUT8();
2274 while (range_begin + 7 <= range_end) {
2275 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2276 const int candidate = current_excess + byte_aggregate.min_prefix;
2277 if (candidate < min_value) {
2278 min_value = candidate;
2279 count = byte_aggregate.min_count;
2280 }
else if (candidate == min_value) {
2281 count += byte_aggregate.min_count;
2283 current_excess += byte_aggregate.excess_total;
2287 while (range_begin <= range_end) {
2288 current_excess += bit(range_begin) ? +1 : -1;
2289 if (current_excess < min_value) {
2290 min_value = current_excess;
2292 }
else if (current_excess == min_value) {
2297 if (min_value == INT_MAX) {
2298 min_value = count = 0;
2308 inline size_t cover_blocks_collect(
const size_t& block_begin_index,
2309 const size_t& block_end_index,
2310 size_t (&out_nodes)[64])
const noexcept {
2311 if (leaf_count == 0 || block_begin_index > block_end_index) {
2314 size_t left_index = first_leaf_index + block_begin_index;
2315 size_t right_index = first_leaf_index + block_end_index;
2316 size_t left_nodes[32];
2317 size_t right_nodes[32];
2318 size_t left_count = 0, right_count = 0;
2319 while (left_index <= right_index) {
2320 if (left_index & 1) {
2321 left_nodes[left_count++] = left_index++;
2323 if ((right_index & 1) == 0) {
2324 right_nodes[right_count++] = right_index--;
2329 size_t out_count = 0;
2330 for (
size_t i = 0; i < left_count; ++i) {
2331 out_nodes[out_count++] = left_nodes[i];
2333 while (right_count > 0) {
2334 out_nodes[out_count++] = right_nodes[--right_count];
2345 inline size_t qth_min_in_block(
const size_t& range_begin,
2346 const size_t& range_end,
2347 size_t target_min_rank)
const noexcept {
2348 if (range_end < range_begin || target_min_rank == 0) {
2352 const auto& aggregates_table = LUT8();
2354 int current_excess = 0, min_value = INT_MAX;
2355 size_t position = range_begin;
2357 while (position <= range_end && (position & 7)) {
2358 current_excess += bit(position) ? +1 : -1;
2359 if (current_excess < min_value) {
2360 min_value = current_excess;
2364 while (position + 7 <= range_end) {
2365 const auto& byte_aggregate = aggregates_table[get_byte(position)];
2367 std::min(min_value, current_excess + byte_aggregate.min_prefix);
2368 current_excess += byte_aggregate.excess_total;
2371 while (position <= range_end) {
2372 current_excess += bit(position) ? +1 : -1;
2373 if (current_excess < min_value) {
2374 min_value = current_excess;
2380 position = range_begin;
2383 while (position <= range_end && (position & 7)) {
2384 current_excess += bit(position) ? +1 : -1;
2385 if (current_excess == min_value) {
2386 if (--target_min_rank == 0) {
2394 while (position + 7 <= range_end) {
2395 const uint8_t byte_value = get_byte(position);
2396 const auto& byte_aggregate = aggregates_table[byte_value];
2397 const int candidate = current_excess + byte_aggregate.min_prefix;
2398 if (candidate == min_value) {
2400 for (
int k = 0; k < 8; ++k) {
2401 prefix_sum += ((byte_value >> k) & 1u) ? +1 : -1;
2402 if (prefix_sum == byte_aggregate.min_prefix) {
2403 if (--target_min_rank == 0) {
2404 return position + k;
2409 current_excess += byte_aggregate.excess_total;
2414 while (position <= range_end) {
2415 current_excess += bit(position) ? +1 : -1;
2416 if (current_excess == min_value) {
2417 if (--target_min_rank == 0) {
2437 inline size_t leaf_fwd_bp_simd(
const size_t& leaf_index,
2438 const size_t& leaf_block_begin,
2439 const size_t& start_position,
2441 int& leaf_delta)
const noexcept {
2442 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2443 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2444 if (start_position >= leaf_end) {
2448#if defined(PIXIE_AVX2_SUPPORT)
2453 const size_t tail_len = leaf_end - start_position;
2455 if (delta >= -8 && delta <= 8 && tail_len <= 256) {
2456 res = scan_leaf_fwd_lut8_fast(start_position, leaf_end, delta, &total);
2458 res = scan_leaf_fwd_simd(start_position, leaf_end, delta, &total);
2466 const size_t res = scan_leaf_fwd(start_position, leaf_end, delta);
2470 const int len = int(leaf_end - start_position);
2471 const int ones = int(rank1_in_block(start_position, leaf_end));
2472 leaf_delta = ones * 2 - len;
2485 inline size_t leaf_bwd_bp_simd(
const size_t& leaf_index,
2486 const size_t& leaf_block_begin,
2487 const size_t& start_position,
2489 int& leaf_delta)
const noexcept {
2490 const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
2491 const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
2492 if (start_position < leaf_block_begin || start_position > leaf_end) {
2499 const int len = int(start_position - leaf_block_begin);
2500 const int ones = int(rank1_in_block(leaf_block_begin, start_position));
2501 leaf_delta = ones * 2 - len;
2505 if (start_position == leaf_block_begin) {
2508 const int target_prefix = leaf_delta + delta;
2509 return scan_leaf_bwd(
2517 (start_position > leaf_block_begin
2518 ? (leaf_delta - (bit(start_position - 1) ? +1 : -1))
2528 inline void first_min_value_pos8(
size_t range_begin,
2529 const size_t& range_end,
2531 size_t& first_position)
const noexcept {
2532 const auto& aggregates_table = LUT8();
2533 int current_excess = 0;
2534 int min_value = INT_MAX;
2535 first_position =
npos;
2538 while (range_begin <= range_end && (range_begin & 7)) {
2539 current_excess += bit(range_begin) ? +1 : -1;
2540 if (current_excess < min_value) {
2541 min_value = current_excess;
2542 first_position = range_begin;
2548 while (range_begin + 7 <= range_end) {
2549 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2550 const int candidate = current_excess + byte_aggregate.min_prefix;
2551 if (candidate < min_value) {
2552 min_value = candidate;
2553 first_position = range_begin + byte_aggregate.pos_first_min;
2555 current_excess += byte_aggregate.excess_total;
2560 while (range_begin <= range_end) {
2561 current_excess += bit(range_begin) ? +1 : -1;
2562 if (current_excess < min_value) {
2563 min_value = current_excess;
2564 first_position = range_begin;
2569 min_value_out = (min_value == INT_MAX ? 0 : min_value);
2578 inline void first_max_value_pos8(
size_t range_begin,
2579 const size_t& range_end,
2581 size_t& first_position)
const noexcept {
2582 const auto& aggregates_table = LUT8();
2583 int current_excess = 0;
2584 int max_value = INT_MIN;
2585 first_position =
npos;
2587 while (range_begin <= range_end && (range_begin & 7)) {
2588 current_excess += bit(range_begin) ? +1 : -1;
2589 if (current_excess > max_value) {
2590 max_value = current_excess;
2591 first_position = range_begin;
2596 while (range_begin + 7 <= range_end) {
2597 const auto& byte_aggregate = aggregates_table[get_byte(range_begin)];
2598 const int candidate = current_excess + byte_aggregate.max_prefix;
2599 if (candidate > max_value) {
2600 max_value = candidate;
2601 first_position = range_begin + byte_aggregate.pos_first_max;
2603 current_excess += byte_aggregate.excess_total;
2607 while (range_begin <= range_end) {
2608 current_excess += bit(range_begin) ? +1 : -1;
2609 if (current_excess > max_value) {
2610 max_value = current_excess;
2611 first_position = range_begin;
2616 max_value_out = (max_value == INT_MIN ? 0 : max_value);
2625 void build(
const size_t& leaf_block_bits,
const float& max_overhead) {
2628 const size_t clamp_by_overhead =
2629 (max_overhead >= 0.0
2630 ? choose_block_bits_for_overhead(num_bits, max_overhead)
2635 if (leaf_block_bits == 0) {
2637 std::max(clamp_by_overhead,
2638 std::bit_ceil<size_t>(
2639 (num_bits <= 1) ? 1 : std::bit_width(num_bits - 1)));
2642 std::max(clamp_by_overhead,
2643 std::bit_ceil(std::max<size_t>(1, leaf_block_bits)));
2648 built_overhead = overhead_for(num_bits, block_bits);
2651 leaf_count = ceil_div(num_bits, block_bits);
2652 first_leaf_index = std::bit_ceil(std::max<size_t>(1, leaf_count));
2653 const size_t tree_size = first_leaf_index + leaf_count - 1;
2654 segment_size_bits.assign(tree_size + 1, 0);
2655 node_total_excess.assign(tree_size + 1, 0);
2656 node_min_prefix_excess.assign(tree_size + 1, 0);
2657 node_max_prefix_excess.assign(tree_size + 1, 0);
2658 node_min_count.assign(tree_size + 1, 0);
2659 node_pattern10_count.assign(tree_size + 1, 0);
2660 node_first_bit.assign(tree_size + 1, 0);
2661 node_last_bit.assign(tree_size + 1, 0);
2664 for (
size_t leaf_block_index = 0; leaf_block_index < leaf_count;
2665 ++leaf_block_index) {
2666 const size_t leaf_node_index = first_leaf_index + leaf_block_index;
2667 const size_t segment_begin = leaf_block_index * block_bits;
2668 const size_t segment_end = std::min(num_bits, segment_begin + block_bits);
2669 segment_size_bits[leaf_node_index] = segment_end - segment_begin;
2671 if (segment_begin < segment_end) {
2672 node_first_bit[leaf_node_index] = bit(segment_begin);
2675 const auto& aggregates_table = LUT8();
2677 int current_excess = 0, min_value = INT_MAX, max_value = INT_MIN;
2678 uint32_t min_count = 0;
2679 uint32_t pattern10_count = 0;
2681 uint8_t previous_bit = 0;
2683 size_t position = segment_begin;
2686 while (position + 8 <= segment_end) {
2687 const uint8_t byte_value = get_byte(position);
2688 const auto& byte_aggregate = aggregates_table[byte_value];
2691 pattern10_count += byte_aggregate.pattern10_count;
2694 if (previous_bit == 1 && byte_aggregate.first_bit == 0) {
2699 const int candidate_min = current_excess + byte_aggregate.min_prefix;
2700 if (candidate_min < min_value) {
2701 min_value = candidate_min;
2702 min_count = byte_aggregate.min_count;
2703 }
else if (candidate_min == min_value) {
2704 min_count += byte_aggregate.min_count;
2708 std::max(max_value, current_excess + byte_aggregate.max_prefix);
2709 current_excess += byte_aggregate.excess_total;
2710 previous_bit = byte_aggregate.last_bit;
2715 while (position < segment_end) {
2716 const uint8_t bit_value = bit(position);
2717 if (previous_bit == 1 && bit_value == 0) {
2720 const int step = bit_value ? +1 : -1;
2721 current_excess += step;
2722 if (current_excess < min_value) {
2723 min_value = current_excess;
2725 }
else if (current_excess == min_value) {
2728 if (current_excess > max_value) {
2729 max_value = current_excess;
2732 previous_bit = bit_value;
2736 if (segment_begin < segment_end) {
2737 node_last_bit[leaf_node_index] = previous_bit;
2740 node_total_excess[leaf_node_index] = current_excess;
2741 node_min_prefix_excess[leaf_node_index] =
2742 (segment_size_bits[leaf_node_index] == 0 ? 0 : min_value);
2743 node_max_prefix_excess[leaf_node_index] =
2744 (segment_size_bits[leaf_node_index] == 0 ? 0 : max_value);
2745 node_min_count[leaf_node_index] = min_count;
2746 node_pattern10_count[leaf_node_index] = (uint32_t)pattern10_count;
2749 for (
size_t node_index = first_leaf_index - 1; node_index >= 1;
2751 const size_t left_child = node_index << 1;
2752 const size_t right_child = left_child | 1;
2753 const bool has_left =
2754 (left_child <= tree_size) && segment_size_bits[left_child];
2755 const bool has_right =
2756 (right_child <= tree_size) && segment_size_bits[right_child];
2757 if (!has_left && !has_right) {
2758 segment_size_bits[node_index] = 0;
2761 if (has_left && !has_right) {
2762 segment_size_bits[node_index] = segment_size_bits[left_child];
2763 node_total_excess[node_index] = node_total_excess[left_child];
2764 node_min_prefix_excess[node_index] = node_min_prefix_excess[left_child];
2765 node_max_prefix_excess[node_index] = node_max_prefix_excess[left_child];
2766 node_min_count[node_index] = node_min_count[left_child];
2767 node_pattern10_count[node_index] = node_pattern10_count[left_child];
2768 node_first_bit[node_index] = node_first_bit[left_child];
2769 node_last_bit[node_index] = node_last_bit[left_child];
2770 }
else if (!has_left && has_right) {
2771 segment_size_bits[node_index] = segment_size_bits[right_child];
2772 node_total_excess[node_index] = node_total_excess[right_child];
2773 node_min_prefix_excess[node_index] =
2774 node_min_prefix_excess[right_child];
2775 node_max_prefix_excess[node_index] =
2776 node_max_prefix_excess[right_child];
2777 node_min_count[node_index] = node_min_count[right_child];
2778 node_pattern10_count[node_index] = node_pattern10_count[right_child];
2779 node_first_bit[node_index] = node_first_bit[right_child];
2780 node_last_bit[node_index] = node_last_bit[right_child];
2782 segment_size_bits[node_index] =
2783 segment_size_bits[left_child] + segment_size_bits[right_child];
2784 node_total_excess[node_index] =
2785 node_total_excess[left_child] + node_total_excess[right_child];
2786 const int right_min_candidate =
2787 node_total_excess[left_child] + node_min_prefix_excess[right_child];
2788 const int right_max_candidate =
2789 node_total_excess[left_child] + node_max_prefix_excess[right_child];
2790 node_min_prefix_excess[node_index] =
2791 std::min(node_min_prefix_excess[left_child], right_min_candidate);
2792 node_max_prefix_excess[node_index] =
2793 std::max(node_max_prefix_excess[left_child], right_max_candidate);
2794 node_min_count[node_index] =
2795 (node_min_prefix_excess[left_child] ==
2796 node_min_prefix_excess[node_index]
2797 ? node_min_count[left_child]
2799 (right_min_candidate == node_min_prefix_excess[node_index]
2800 ? node_min_count[right_child]
2802 node_pattern10_count[node_index] = node_pattern10_count[left_child] +
2803 node_pattern10_count[right_child] +
2804 ((node_last_bit[left_child] == 1 &&
2805 node_first_bit[right_child] == 0)
2808 node_first_bit[node_index] = node_first_bit[left_child];
2809 node_last_bit[node_index] = node_last_bit[right_child];
2811 if (node_index == 1) {