@@ -115,7 +115,7 @@ namespace cp_algo::math {
115115 uint32_t product;
116116 };
117117
118- auto make_wheel (big_vector<uint32_t > primes, uint32_t product) {
118+ constexpr auto make_wheel (big_vector<uint32_t > primes, uint32_t product) {
119119 assert (product % (period * dynamic_bit_array::width) == 0 );
120120 wheel_t wheel;
121121 wheel.product = product;
@@ -128,64 +128,45 @@ namespace cp_algo::math {
128128 }
129129 return wheel;
130130 }
131-
132- constexpr uint32_t wheel_threshold = 500 ;
133- size_t medium_primes_begin;
134- const auto wheels = []() {
135- uint32_t product = period * dynamic_bit_array::width;
136- big_vector<uint32_t > current;
137- big_vector<wheel_t > wheels;
138- for (size_t i = 0 ; i < size (sqrt_primes); i++) {
139- uint32_t p = sqrt_primes[i];
140- if (product * p > max_wheel_size) {
141- wheels.push_back (make_wheel (current, product));
142- current = {p};
143- product = period * dynamic_bit_array::width * p;
144- if (product > max_wheel_size || p > wheel_threshold) {
145- medium_primes_begin = i;
146- checkpoint (" make wheels" );
147- return wheels;
148- }
149- } else {
150- current.push_back (p);
151- product *= p;
152- }
153- }
154- assert (false );
155- }();
156-
157- const auto [ord_step, step_sum] = []() {
158- big_vector<std::array<uint32_t , 2 * coprime>> ord_steps (num_primes);
159- big_vector<uint32_t > sums (num_primes);
160- for (uint32_t i = 0 ; i < size (sqrt_primes); i++) {
161- auto p = sqrt_primes[i];
162- for (uint32_t j = 0 ; j < coprime; j++) {
163- ord_steps[i][j] = to_ord (p * (res210[j] + gap210[j])) - to_ord (p * res210[j]);
164- }
165- sums[i] = std::ranges::fold_left (ord_steps[i], 0u , std::plus{});
166- for (uint32_t j = 0 ; j < coprime; j++) {
167- ord_steps[i][j + coprime] = ord_steps[i][j];
168- }
169- }
170- return std::pair{ord_steps, sums};
171- }();
172-
173- void sieve_dense (auto &prime, uint32_t l, uint32_t r, wheel_t const & wheel) {
131+
132+ constexpr void sieve_dense (auto &prime, uint32_t l, uint32_t r, wheel_t const & wheel) {
174133 if (l >= r) return ;
175- const auto width = (uint32_t )dynamic_bit_array::width;
134+ const uint32_t width = (uint32_t )dynamic_bit_array::width; // 64
176135 uint32_t wl = l / width;
177136 uint32_t wr = (r + width - 1 ) / width;
178- uint32_t N = (uint32_t )wheel.mask .words ;
179- for (uint32_t i = wl; i < wr; i += N) {
180- auto block = std::min (N, wr - i);
181- for (uint32_t j = 0 ; j < block; j++) {
137+ uint32_t N = (uint32_t )wheel.mask .words ;
138+
139+ for (uint32_t i = wl; i < wr; i += N) {
140+ uint32_t block = std::min (N, wr - i);
141+ uint32_t j = 0 ;
142+ for (; j + 4 <= block; j += 4 ) {
143+ auto &p_vec = vector_cast<u64x4>(prime.word (i + j));
144+ auto m_vec = vector_cast<const u64x4>(wheel.mask .word (j));
145+ p_vec &= m_vec;
146+ }
147+ for (; j < block; j++) {
182148 prime.word (i + j) &= wheel.mask .word (j);
183149 }
184150 }
185151 }
186152
187153 template <class BitArray >
188- void sieve210 (BitArray& prime, uint32_t l, uint32_t r, size_t i, int state) {
154+ constexpr void sieve210 (BitArray& prime, uint32_t l, uint32_t r, size_t i, int state) {
155+ static const auto [ord_step, step_sum] = []() {
156+ big_vector<std::array<uint32_t , 2 * coprime>> ord_steps (num_primes);
157+ big_vector<uint32_t > sums (num_primes);
158+ for (uint32_t i = 0 ; i < size (sqrt_primes); i++) {
159+ auto p = sqrt_primes[i];
160+ for (uint32_t j = 0 ; j < coprime; j++) {
161+ ord_steps[i][j] = to_ord (p * (res210[j] + gap210[j])) - to_ord (p * res210[j]);
162+ }
163+ sums[i] = std::ranges::fold_left (ord_steps[i], 0u , std::plus{});
164+ for (uint32_t j = 0 ; j < coprime; j++) {
165+ ord_steps[i][j + coprime] = ord_steps[i][j];
166+ }
167+ }
168+ return std::pair{ord_steps, sums};
169+ }();
189170 while (l + step_sum[i] <= r) {
190171 #pragma GCC unroll coprime
191172 for (size_t j = 0 ; j < coprime; j++) {
@@ -202,10 +183,32 @@ namespace cp_algo::math {
202183 }
203184
204185 // Primes smaller or equal than N
205- dynamic_bit_array sieve210 (uint32_t N) {
186+ constexpr dynamic_bit_array sieve210 (uint32_t N) {
206187 N++;
207188 dynamic_bit_array prime (to_ord (N));
208189 prime.set_all ();
190+ constexpr uint32_t wheel_threshold = 500 ;
191+ static const auto [wheels, medium_primes_begin] = []() {
192+ uint32_t product = period * dynamic_bit_array::width;
193+ big_vector<uint32_t > current;
194+ big_vector<wheel_t > wheels;
195+ for (size_t i = 0 ; i < size (sqrt_primes); i++) {
196+ uint32_t p = sqrt_primes[i];
197+ if (product * p > max_wheel_size) {
198+ wheels.push_back (make_wheel (current, product));
199+ current = {p};
200+ product = period * dynamic_bit_array::width * p;
201+ if (product > max_wheel_size || p > wheel_threshold) {
202+ checkpoint (" make wheels" );
203+ return std::pair{wheels, i};
204+ }
205+ } else {
206+ current.push_back (p);
207+ product *= p;
208+ }
209+ }
210+ assert (false );
211+ }();
209212 static constexpr uint32_t dense_block = 1 << 25 ;
210213 for (uint32_t start = 0 ; start < N; start += dense_block) {
211214 uint32_t r = std::min (start + dense_block, N);
0 commit comments