Skip to content

Commit 548e409

Browse files
committed
improve vectorization in sieve
1 parent d607be1 commit 548e409

File tree

2 files changed

+56
-57
lines changed

2 files changed

+56
-57
lines changed

cp-algo/math/sieve.hpp

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ namespace cp_algo::math {
115115
uint32_t product;
116116
};
117117

118-
auto make_wheel(big_vector<uint32_t> primes, uint32_t product) {
118+
constexpr auto make_wheel(big_vector<uint32_t> primes, uint32_t product) {
119119
assert(product % (period * dynamic_bit_array::width) == 0);
120120
wheel_t wheel;
121121
wheel.product = product;
@@ -128,64 +128,45 @@ namespace cp_algo::math {
128128
}
129129
return wheel;
130130
}
131-
132-
constexpr uint32_t wheel_threshold = 500;
133-
size_t medium_primes_begin;
134-
const auto wheels = []() {
135-
uint32_t product = period * dynamic_bit_array::width;
136-
big_vector<uint32_t> current;
137-
big_vector<wheel_t> wheels;
138-
for(size_t i = 0; i < size(sqrt_primes); i++) {
139-
uint32_t p = sqrt_primes[i];
140-
if (product * p > max_wheel_size) {
141-
wheels.push_back(make_wheel(current, product));
142-
current = {p};
143-
product = period * dynamic_bit_array::width * p;
144-
if (product > max_wheel_size || p > wheel_threshold) {
145-
medium_primes_begin = i;
146-
checkpoint("make wheels");
147-
return wheels;
148-
}
149-
} else {
150-
current.push_back(p);
151-
product *= p;
152-
}
153-
}
154-
assert(false);
155-
}();
156-
157-
const auto [ord_step, step_sum] = []() {
158-
big_vector<std::array<uint32_t, 2 * coprime>> ord_steps(num_primes);
159-
big_vector<uint32_t> sums(num_primes);
160-
for (uint32_t i = 0; i < size(sqrt_primes); i++) {
161-
auto p = sqrt_primes[i];
162-
for(uint32_t j = 0; j < coprime; j++) {
163-
ord_steps[i][j] = to_ord(p * (res210[j] + gap210[j])) - to_ord(p * res210[j]);
164-
}
165-
sums[i] = std::ranges::fold_left(ord_steps[i], 0u, std::plus{});
166-
for(uint32_t j = 0; j < coprime; j++) {
167-
ord_steps[i][j + coprime] = ord_steps[i][j];
168-
}
169-
}
170-
return std::pair{ord_steps, sums};
171-
}();
172-
173-
void sieve_dense(auto &prime, uint32_t l, uint32_t r, wheel_t const& wheel) {
131+
132+
constexpr void sieve_dense(auto &prime, uint32_t l, uint32_t r, wheel_t const& wheel) {
174133
if (l >= r) return;
175-
const auto width = (uint32_t)dynamic_bit_array::width;
134+
const uint32_t width = (uint32_t)dynamic_bit_array::width; // 64
176135
uint32_t wl = l / width;
177136
uint32_t wr = (r + width - 1) / width;
178-
uint32_t N = (uint32_t)wheel.mask.words;
179-
for(uint32_t i = wl; i < wr; i += N) {
180-
auto block = std::min(N, wr - i);
181-
for(uint32_t j = 0; j < block; j++) {
137+
uint32_t N = (uint32_t)wheel.mask.words;
138+
139+
for (uint32_t i = wl; i < wr; i += N) {
140+
uint32_t block = std::min(N, wr - i);
141+
uint32_t j = 0;
142+
for (; j + 4 <= block; j += 4) {
143+
auto &p_vec = vector_cast<u64x4>(prime.word(i + j));
144+
auto m_vec = vector_cast<const u64x4>(wheel.mask.word(j));
145+
p_vec &= m_vec;
146+
}
147+
for (; j < block; j++) {
182148
prime.word(i + j) &= wheel.mask.word(j);
183149
}
184150
}
185151
}
186152

187153
template <class BitArray>
188-
void sieve210(BitArray& prime, uint32_t l, uint32_t r, size_t i, int state) {
154+
constexpr void sieve210(BitArray& prime, uint32_t l, uint32_t r, size_t i, int state) {
155+
static const auto [ord_step, step_sum] = []() {
156+
big_vector<std::array<uint32_t, 2 * coprime>> ord_steps(num_primes);
157+
big_vector<uint32_t> sums(num_primes);
158+
for (uint32_t i = 0; i < size(sqrt_primes); i++) {
159+
auto p = sqrt_primes[i];
160+
for(uint32_t j = 0; j < coprime; j++) {
161+
ord_steps[i][j] = to_ord(p * (res210[j] + gap210[j])) - to_ord(p * res210[j]);
162+
}
163+
sums[i] = std::ranges::fold_left(ord_steps[i], 0u, std::plus{});
164+
for(uint32_t j = 0; j < coprime; j++) {
165+
ord_steps[i][j + coprime] = ord_steps[i][j];
166+
}
167+
}
168+
return std::pair{ord_steps, sums};
169+
}();
189170
while (l + step_sum[i] <= r) {
190171
#pragma GCC unroll coprime
191172
for (size_t j = 0; j < coprime; j++) {
@@ -202,10 +183,32 @@ namespace cp_algo::math {
202183
}
203184

204185
// Primes smaller or equal than N
205-
dynamic_bit_array sieve210(uint32_t N) {
186+
constexpr dynamic_bit_array sieve210(uint32_t N) {
206187
N++;
207188
dynamic_bit_array prime(to_ord(N));
208189
prime.set_all();
190+
constexpr uint32_t wheel_threshold = 500;
191+
static const auto [wheels, medium_primes_begin] = []() {
192+
uint32_t product = period * dynamic_bit_array::width;
193+
big_vector<uint32_t> current;
194+
big_vector<wheel_t> wheels;
195+
for(size_t i = 0; i < size(sqrt_primes); i++) {
196+
uint32_t p = sqrt_primes[i];
197+
if (product * p > max_wheel_size) {
198+
wheels.push_back(make_wheel(current, product));
199+
current = {p};
200+
product = period * dynamic_bit_array::width * p;
201+
if (product > max_wheel_size || p > wheel_threshold) {
202+
checkpoint("make wheels");
203+
return std::pair{wheels, i};
204+
}
205+
} else {
206+
current.push_back(p);
207+
product *= p;
208+
}
209+
}
210+
assert(false);
211+
}();
209212
static constexpr uint32_t dense_block = 1 << 25;
210213
for(uint32_t start = 0; start < N; start += dense_block) {
211214
uint32_t r = std::min(start + dense_block, N);

cp-algo/structures/bit_array.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,8 @@ namespace cp_algo::structures {
3535
constexpr _bit_array(size_t N): data() {
3636
resize(N);
3737
}
38-
39-
constexpr word_t& word(size_t x) {
40-
return data[x];
41-
}
42-
constexpr word_t word(size_t x) const {
43-
return data[x];
38+
constexpr auto&& word(this auto&& self, size_t x) {
39+
return self.data[x];
4440
}
4541
constexpr void set_all(word_t val = -1) {
4642
for(size_t i = 0; i < words; i++) {

0 commit comments

Comments
 (0)