@@ -134,50 +134,50 @@ namespace cp_algo::math {
134134 uint32_t wl = l / width;
135135 uint32_t wr = (r + width - 1 ) / width;
136136 uint32_t N = (uint32_t )wheel.mask .words ;
137-
138- for (uint32_t i = wl; i < wr; i += N) {
139- uint32_t block = std::min (N, wr - i);
140- uint32_t j = 0 ;
141- for (; j + 4 <= block; j += 4 ) {
142- auto &p_vec = vector_cast<u64x4>(prime.word (i + j));
143- auto m_vec = vector_cast<const u64x4>(wheel.mask .word (j));
144- p_vec &= m_vec;
145- }
146- for (; j < block; j++) {
147- prime.word (i + j) &= wheel.mask .word (j);
137+ auto loop = [&](uint32_t i, uint32_t block) {
138+ auto p_ptr = std::assume_aligned<32 >(&prime.word (i));
139+ auto m_ptr = std::assume_aligned<32 >(&wheel.mask .word (0 ));
140+ #pragma GCC unroll coprime
141+ for (uint32_t j = 0 ; j < block; j++) {
142+ p_ptr[j] &= m_ptr[j];
148143 }
144+ };
145+ while (wl + N <= wr) {
146+ loop (wl, N);
147+ wl += N;
149148 }
149+ loop (wl, wr - wl);
150150 }
151151
152152 template <class BitArray >
153153 constexpr void sieve210 (BitArray& prime, uint32_t l, uint32_t r, size_t i, int state) {
154- static const auto [ord_step, step_sum] = []() {
155- big_vector<std::array<uint32_t , 2 * coprime>> ord_steps (num_primes);
156- big_vector<uint32_t > sums (num_primes);
154+ static const auto ord_step = []() {
155+ std::array<std::array<uint32_t , 2 * coprime>, num_primes> ord_steps;
157156 for (uint32_t i = 0 ; i < size (sqrt_primes); i++) {
158157 auto p = sqrt_primes[i];
158+ auto &ords = ord_steps[i];
159+ auto last = to_ord (p);
159160 for (uint32_t j = 0 ; j < coprime; j++) {
160- ord_steps[i][j] = to_ord (p * (res210[j] + gap210[j])) - to_ord (p * res210[j]);
161- }
162- sums[i] = std::ranges::fold_left (ord_steps[i], 0u , std::plus{});
163- for (uint32_t j = 0 ; j < coprime; j++) {
164- ord_steps[i][j + coprime] = ord_steps[i][j];
161+ auto next = to_ord (p * (res210[j] + gap210[j]));
162+ ords[j] = ords[j + coprime] = next - last;
163+ last = next;
165164 }
166165 }
167- return std::pair{ ord_steps, sums} ;
166+ return ord_steps;
168167 }();
169- while (l + step_sum[i] <= r) {
168+ auto advance = [&]() {
169+ prime.reset (std::exchange (l, l + ord_step[i][state++]));
170+ };
171+ uint32_t p = sqrt_primes[i];
172+ while (l + p * coprime <= r) {
170173 #pragma GCC unroll coprime
171174 for (size_t j = 0 ; j < coprime; j++) {
172- prime.reset (l);
173- l += ord_step[i][state++];
175+ advance ();
174176 }
175177 state -= coprime;
176178 }
177179 while (l < r) {
178- prime.reset (l);
179- l += ord_step[i][state++];
180- state = state == coprime ? 0 : state;
180+ advance ();
181181 }
182182 }
183183
0 commit comments