提问人:user140242 提问时间:11/22/2022 最后编辑:user140242 更新时间:11/28/2022 访问量:171
在埃拉托色尼的分段筛中选择分段的大小
Choosing the size of the segment in segmented sieve of Eratosthenes
问:
我制作了这个使用轮因式分解的分段筛。在这里你可以找到解释。
通过将轮子尺寸设置为 210 并使用大小为 277140 = 6 * (11 * 13 * 17 * 19 + 1) = nB*(segment_size+1) 的线段向量,筛子的速度足以达到 n=10^9。uint8_t
如果要减少段内存,例如设置和大小为 58350 = 6 * (4 * 11 * 13 * 17 + 1) 的向量,但执行时间较长,约为双倍。segment_size_min=2048
int64_t segment_size = 4;
uint8_t
/// This is a implementation of the bit wheel segmented sieve
/// with max base wheel size choice 30 , 210 , 2310
#include <iostream>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <stdint.h>
const int64_t n_PB_max = 5;
const int64_t PrimesBase[n_PB_max] = {2,3,5,7,11};
const int64_t del_bit[8] =
{
~(1 << 0),~(1 << 1),~(1 << 2),~(1 << 3),
~(1 << 4),~(1 << 5),~(1 << 6),~(1 << 7)
};
const int64_t bit_count[256] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
int64_t Euclidean_Diophantine( int64_t coeff_a, int64_t coeff_b)
{
// return y in Diophantine equation coeff_a x + coeff_b y = 1
int64_t k=1;
std::vector<int64_t> div_t;
std::vector<int64_t> rem_t;
std::vector<int64_t> coeff_t;
div_t.push_back(coeff_a);
rem_t.push_back(coeff_b);
coeff_t.push_back((int64_t) 0);
div_t.push_back((int64_t) div_t[0] / rem_t[0]);
rem_t.push_back((int64_t) div_t[0] % rem_t[0]);
coeff_t.push_back((int64_t) 0);
while (rem_t[k] > 1)
{
k++;
div_t.push_back((int64_t) rem_t[k - 2] / rem_t[k - 1]);
rem_t.push_back((int64_t) rem_t[k - 2] % rem_t[k - 1]);
coeff_t.push_back((int64_t) 0);
}
k--;
coeff_t[k] = -div_t[k + 1];
if (k > 0)
coeff_t[k - 1] = (int64_t) 1;
while (k > 1)
{
k--;
coeff_t[k - 1] = coeff_t[k + 1];
coeff_t[k] += (int64_t) (coeff_t[k + 1] * (-div_t[k + 1]));
}
if (k == 1)
return (int64_t) (coeff_t[k - 1] + coeff_t[k] * ( -div_t[k]));
else
return (int64_t) (coeff_t[0]);
}
int64_t segmented_bit_sieve_wheel(uint64_t n,int64_t max_bW)
{
int64_t segment_size_min = 4096; //can be scaled down to have smaller segments
int64_t sqrt_n = (int64_t) std::sqrt(n);
int64_t count_p = (int64_t)0;
int64_t n_PB = (int64_t) 3;
int64_t bW = (int64_t) 30;
//get bW base wheel equal to p1*p2*...*pn <=min(max_bW,sqrt_n) with n=n_PB
while(n_PB < n_PB_max && (bW * PrimesBase[n_PB] <= std::min(max_bW , sqrt_n)))
{
bW *= PrimesBase[n_PB];
n_PB++;
}
for (int64_t i = 0; i < n_PB; i++)
if (n > (uint64_t) PrimesBase[i])
count_p++;
if (n > (uint64_t) (1 + PrimesBase[n_PB - 1])){
int64_t k_end = (n < (uint64_t)bW) ? (int64_t) 2 :(int64_t) (n / (uint64_t) bW + 1);
int64_t n_mod_bW = (int64_t) (n % (uint64_t) bW);
int64_t k_sqrt = (int64_t) std::sqrt(k_end / bW) + 1;
//find reduct residue set modulo bW
std::vector<char> Remainder_i_t(bW + 1,true);
for (int64_t i = 0; i < n_PB; i++)
for (int64_t j = PrimesBase[i]; j < bW + 1; j += PrimesBase[i])
Remainder_i_t[j] = false;
std::vector<int64_t> RW;
for (int64_t j = 2; j < bW + 1; j++)
if (Remainder_i_t[j] == true)
RW.push_back(-bW + j);
RW.push_back(1);
int64_t nR = RW.size(); //nR=phi(bW)
std::vector<int64_t> C1(nR * nR);
std::vector<int64_t> C2(nR * nR);
for (int64_t j = 0; j < nR - 2; j++)
{
int64_t rW_t , rW_t1;
rW_t1 = Euclidean_Diophantine(bW , -RW[j]);
for (int64_t i = 0; i < nR; i++)
{
if (i == j)
{
C2[nR * i + j] = 0;
C1[nR * i + j] = RW[j] + 1;
}
else if(i == nR - 3 - j)
{
C2[nR * i + j] = 1;
C1[nR * i + j] = RW[j] - 1;
}
else
{
rW_t = (int64_t) (rW_t1 * (-RW[i])) % bW;
if (rW_t > 1)
rW_t -= bW;
C1[nR * i + j] = rW_t + RW[j];
C2[nR * i + j] = (int64_t) (rW_t * RW[j]) / bW + 1;
if (i == nR - 1)
C2[nR * i + j] -= 1;
}
}
C2[nR * j + nR - 2] = (int64_t) 1;
C1[nR * j + nR - 2] = -(bW + RW[j]) - 1;
C1[nR * j + nR - 1] = RW[j] + 1;
C2[nR * j + nR - 1] = (int64_t )0;
}
for (int64_t i = nR - 2; i < nR; i++)
{
C2[nR * i + nR - 2] = (int64_t) 0;
C1[nR * i + nR - 2] = -RW[i] - 1;
C1[nR * i + nR - 1] = RW[i] + 1;
C2[nR * i + nR - 1] = (int64_t) 0;
}
int64_t segment_size = 1; //can be scaled up to have larger segments
int64_t p_mask_i = 3; //number primes for pre-sieve vector mask
for (int64_t i = 0; i < p_mask_i; i++)
segment_size *= (bW + RW[i]);
while (segment_size < std::max(k_sqrt , segment_size_min) && p_mask_i < 7 )
{
segment_size *= (bW + RW[p_mask_i]);
p_mask_i++;
}
int64_t nB = nR / 8; //nB number of byte for residue of congruence class equal to nR=8*nB
int64_t segment_size_b = nB * segment_size;
//vector used for the first segment containing prime numbers less than sqrt(n)
std::vector<uint8_t> Primes(nB + segment_size_b, 0xff);
int64_t p , pb , mb , mb_i , ib , i , jb , j , jp , k , kb;
int64_t kmax = (int64_t) std::sqrt(segment_size / bW) + 1;
//sieve for the first segment
for (k = (int64_t)1; k <= kmax; k++)
{
kb = nB * k;
mb_i = kb * k * bW; //nB * k * k * bW
for (jb = 0; jb < nB; jb++)
{
for (j = 0; j < 8; j++)
{
if(Primes[kb + jb] & (1 << j))
{
jp = j + jb * 8;
pb = nB * (bW * k + RW[jp]);
for (ib = 0; ib < nB; ib++)
{
for (i = 0; i < 8; i++)
{
mb = mb_i + nB * k * C1[(i + ib * 8) * nR + jp] + nB * C2[(i + ib * 8) * nR + jp];
for (; mb <= segment_size_b; mb += pb)
Primes[mb + ib] &= del_bit[i];
}
}
}
}
}
}
//count the prime numbers in the first segment
for (kb = nB; kb < std::min (nB + segment_size_b , nB * k_end); kb++)
count_p += bit_count[Primes[kb]];
if (kb == nB * k_end && kb <= segment_size_b && kb > 0)
for (ib = 0; ib < nB; ib++)
for (i = 0; i < 8; i++)
if(Primes[kb + ib]& (1 << i) && RW[i + ib * 8] < (n_mod_bW - bW))
count_p++;
if (k_end > segment_size)
{
// vector mask pre-sieve multiples of primes bW+RW[j] with 0<j<p_mask_i
std::vector<uint8_t> Segment_i(nB+segment_size_b , 0xff);
for (j = 0; j < p_mask_i; j++)
{
p = bW+RW[j];
pb = p * nB;
for (ib = 0; ib < nB; ib++)
{
for (i = 0; i < 8; i++)
{
mb = -segment_size + bW + C1[(i + ib * 8) * nR + j] + C2[(i + ib * 8) * nR + j];
if (mb < 0)
mb=(mb % p + p) % p;
mb *= nB;
for (; mb <= segment_size_b; mb += pb)
Segment_i[mb + ib] &= del_bit[i];
}
}
}
//vector used for subsequent segments of size (nB+segment_size_b)=nB*(1+segment_size)
std::vector<uint8_t> Segment_t(nB + segment_size_b);
int64_t k_low , kb_low;
for (k_low = segment_size; k_low < k_end; k_low += segment_size)
{
kb_low = k_low * nB;
for (kb = (int64_t)0; kb < (nB + segment_size_b); kb++)
Segment_t[kb] = Segment_i[kb];
kmax = (std::min(segment_size , (int64_t) std::sqrt((k_low + segment_size) / bW) + 2));
j = p_mask_i;
for(k = (int64_t) 1; k <= kmax; k++)
{
kb = k * nB;
mb_i = -k_low + bW * k * k;
for (jb = 0; jb < nB; jb++)
{
for (; j < 8; j++)
{
if (Primes[kb + jb] & (1 << j))
{
jp = j + jb * 8;
p = bW * k + RW[jp];
pb = p * nB;
for (ib = 0; ib < nB; ib++)
{
for (i = 0; i < 8; i++)
{
mb = mb_i + k * C1[(i + ib * 8) * nR+jp] + C2[(i + ib * 8) * nR+jp];
if (mb < 0)
mb = (mb % p + p) % p;
mb *= nB;
for (; mb <= segment_size_b; mb += pb)
Segment_t[mb + ib] &= del_bit[i];
}
}
}
}
j = (int64_t) 0;
}
}
for ( kb = nB + kb_low; kb < std::min (kb_low + segment_size_b + nB , nB * k_end); kb++)
count_p += bit_count[Segment_t[kb - kb_low]];
}
if (kb == nB * k_end && kb - kb_low <= segment_size_b && kb - kb_low > (int64_t) 0)
for (ib = 0; ib < nB; ib++)
for (i = 0; i < 8; i++)
if(Segment_t[kb - kb_low + ib]& (1 << i) && RW[i + ib * 8] < (n_mod_bW - bW))
count_p++;
}
}
return count_p;
}
int main()
{
int64_t n=1000000000;
//segmented_bit_sieve_wheel(n,max_bW) with max base wheel size choice max_bW= 30 , 210 , 2310
std::cout << " primes < " << n << ": "<< segmented_bit_sieve_wheel(n , 210) << std::endl;
return 0;
}
我期待相反的行为,有人可以帮助我了解细分市场的大小应该在速度方面进行优化吗?
编辑: 我想澄清一下,Euclidean_Diophantine函数仅在初始阶段用于查找数组 C1 和 C2,可以通过替换
rW_t1 = Euclidean_Diophantine(bW , -RW[j]);
跟
int64_t j1=0;
while((RW[j] * RW[j1]) % bW != bW - 1 && j1 < nR - 1)
j1++;
rW_t1 = RW[j1];
此外,通过使用带有 -O3 选项的 rextester.com g++ 进行编译来了解计时 绝对运行时间:0.51秒
因此,我通过在第二部分中删除来测试切换块所花费的时间
for(k = (int64_t) 1; k <= kmax; k++)
{
...
}
因此,我们得到的绝对运行时间等于 0.18s,这在两种配置中变化不大。
答:
规则是基准,基准,基准。
但不仅仅是整体时间安排。时间在图表中的哪个位置?如果您使用更多功能,火焰图会有所帮助。但你不是。
因此,我建议您添加一些可能隐藏在 .您要做的是根据不同类型的活动将代码划分为多个部分。就像设置一个新块来筛分而不是筛分块一样。有一个计时器。每次切换时,请记录新的时间戳,添加上一个活动的经过时间,并保存新的时间戳以对当前活动进行计时。#ifdef
至于为什么较小的方块会更慢,那是因为你花时间筛选,而且每次切换方块时也要花时间计算东西的位置。从 277140 到 58350 的块,您最终切换块的频率约为 4.75 倍。如果您之前 (36.36%) 花费 1/3.75 的时间进行切换,那么现在您将花费相同的 2.75/3.75 在筛分上,加上 4.75/3.75 在切换块上,从而花费 2 倍的时间。
加快速度的一个想法是,您经常使用相同的第一个参数进行调用。(我认为参数是 210。如果您首先使用查找表来解决问题 mod 210,然后使用它来跳过大部分计算,可能会更快。Euclidean_Diophantine
评论