CUDA 39-2 总和扫描跳过元素

CUDA 39-2 Summed Scan Skipping Elements

提问人:Brisk4t 提问时间:11/13/2023 最后编辑:Brisk4t 更新时间:11/13/2023 访问量:58

问:

我需要实现 GPU Gems 3 第 39.2 章中的并行求和扫描,并对函数进行轻微修改,因为它使用变量。我似乎已经让它“工作”了,因为我可以看到每个第二个元素的正确总和。但是,替代元素正在变为“0”。start

我怀疑这与递归扫描功能期间清除块的最后一个元素的过程有关,但我无法弄清楚为什么会这样。

__global__ void scan(float *input, float *output, float *aux, int len) {

    __shared__ float scan_array[BLOCK_SIZE << 1];
    unsigned int thid = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;

    int d;
    int ai = thid;
    int bi = thid + BLOCK_SIZE;
    float sum;

    scan_array[ai] = (start + ai < len) ? input[start + ai] : 0;
    scan_array[bi] = (start + bi < len) ? input[start + bi] : 0;

    int offset = 1;

    for(d = BLOCK_SIZE>>1; d>0; d >>= 1){
        __syncthreads();

        if(thid < d){
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            scan_array[bi] += scan_array[ai];
        }
        offset *= 2;
    }

    if(threadIdx.x == 0){
        if(aux)
            aux[blockIdx.x] = scan_array[BLOCK_SIZE - 1];
        scan_array[BLOCK_SIZE - 1] = 0;
    }


    for(int d = 1; d<BLOCK_SIZE; d <<= 1){
        offset >>= 1;
        __syncthreads();

        if(thid < d){
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            float t = scan_array[ai];
            scan_array[ai] = scan_array[bi];
            scan_array[bi] += t;
        }
    }

    __syncthreads();



   if (start + ai < len)
            output[ai] = scan_array[ai];

   if (start + bi < len)
            output[bi] = scan_array[bi];
     
}
void rscan(float* deviceInput, float *deviceOutput, int numElements)
{
    float *deviceAuxArray, *deviceAuxScannedArray;

    int numBlocks = ceil((float)numElements / (BLOCK_SIZE << 1));
    cudaMalloc(&deviceAuxArray, numBlocks * sizeof(float));

    scan << <numBlocks, BLOCK_SIZE >> >(deviceInput, deviceOutput, deviceAuxArray,numElements);
    cudaPeekAtLastError();
    cudaDeviceSynchronize();

    if (numBlocks > 1) {
        cudaMalloc(&deviceAuxScannedArray, numBlocks * sizeof(float));

        rscan(deviceAuxArray, deviceAuxScannedArray, numBlocks);

        fixup << <numBlocks, BLOCK_SIZE >> >(deviceOutput, deviceAuxScannedArray,numElements);
        cudaPeekAtLastError();
        cudaDeviceSynchronize();

        cudaFree(deviceAuxScannedArray);
    }

    cudaFree(deviceAuxArray);
}

代码输出:

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 3.0 0.0 11.0 0.0 18.0 0.0 20.0 24.0 25.0 0.0 31.0 0.0 35.0 0.0 37.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 7.0 0.0 22.0 0.0 38.0 0.0 48.0 55.0 61.0 0.0 79.0 0.0 94.0 0.0 100.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 9.0 0.0 33.0 0.0 61.0 0.0 81.0 95.0 104.0 0.0 131.0 0.0 152.0 0.0 167.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 9.0 0.0 43.0 0.0 81.0 0.0 112.0 131.0 144.0 0.0 179.0 0.0 207.0 0.0 227.0 
0.0 11.0 0.0 47.0 0.0 88.0 0.0 123.0 145.0 162.0 0.0 201.0 0.0 233.0 0.0 257.0 
0.0 14.0 0.0 53.0 0.0 99.0 0.0 141.0 166.0 183.0 0.0 227.0 0.0 261.0 0.0 289.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 21.0 0.0 71.0 0.0 126.0 0.0 173.0 204.0 222.0 0.0 274.0 0.0 314.0 0.0 353.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 28.0 0.0 90.0 0.0 155.0 0.0 208.0 241.0 262.0 0.0 325.0 0.0 373.0 0.0 418.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 32.0 0.0 102.0 0.0 177.0 0.0 238.0 274.0 300.0 0.0 370.0 0.0 423.0 0.0 477.0 

预期输出:

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
0.0 3.0 7.0 11.0 14.0 18.0 20.0 20.0 24.0 25.0 28.0 31.0 31.0 35.0 36.0 37.0 
0.0 6.0 14.0 19.0 24.0 30.0 36.0 38.0 43.0 47.0 51.0 58.0 60.0 67.0 68.0 72.0 
0.0 7.0 15.0 22.0 30.0 38.0 46.0 48.0 55.0 61.0 69.0 79.0 85.0 94.0 96.0 100.0 
0.0 8.0 17.0 27.0 38.0 47.0 58.0 63.0 74.0 82.0 91.0 104.0 113.0 123.0 127.0 135.0 
0.0 9.0 20.0 33.0 48.0 61.0 72.0 81.0 95.0 104.0 116.0 131.0 142.0 152.0 158.0 167.0 
0.0 9.0 23.0 37.0 53.0 68.0 81.0 94.0 111.0 120.0 133.0 149.0 163.0 173.0 180.0 191.0 
0.0 9.0 27.0 43.0 63.0 81.0 95.0 112.0 131.0 144.0 160.0 179.0 193.0 207.0 216.0 227.0 
0.0 11.0 31.0 47.0 69.0 88.0 104.0 123.0 145.0 162.0 178.0 201.0 217.0 233.0 243.0 257.0 
0.0 14.0 35.0 53.0 77.0 99.0 119.0 141.0 166.0 183.0 202.0 227.0 244.0 261.0 273.0 289.0 
0.0 18.0 43.0 65.0 92.0 115.0 137.0 161.0 190.0 208.0 228.0 255.0 276.0 294.0 310.0 327.0 
0.0 21.0 46.0 71.0 102.0 126.0 148.0 173.0 204.0 222.0 245.0 274.0 296.0 314.0 332.0 353.0 
0.0 25.0 53.0 81.0 116.0 144.0 167.0 194.0 226.0 245.0 271.0 303.0 326.0 348.0 367.0 391.0 
0.0 28.0 59.0 90.0 127.0 155.0 181.0 208.0 241.0 262.0 292.0 325.0 348.0 373.0 394.0 418.0 
0.0 30.0 63.0 95.0 136.0 165.0 191.0 222.0 256.0 280.0 312.0 347.0 372.0 397.0 422.0 447.0 
0.0 32.0 68.0 102.0 145.0 177.0 206.0 238.0 274.0 300.0 333.0 370.0 398.0 423.0 451.0 477.0 
C 算法 CUDA

评论

0赞 Abator Abetor 11/13/2023
我建议使用库进行前缀总和计算。
0赞 paleonix 11/13/2023
我同意@AbatorAbetor尽管我怀疑这是为了教育?对于生产代码,我将使用 cub::D eviceScan
0赞 paleonix 11/13/2023
int numBlocks = ceil((float)numElements / (BLOCK_SIZE << 1));请不要使用浮点数学来做整数上限除法。取而代之的是 简单的公式 。ceil_div = (dividend + divisor - 1) / divisor;
0赞 paleonix 11/13/2023
由于变量名称相同,在嵌套代码块中具有多个变量并相互隐藏是一种代码异味。尝试以不同的方式命名它们。aibi
2赞 paleonix 11/13/2023
如果您想要一个正确的答案,请发布一个最小的、可重复的例子。问题至少缺少函数和输入数据。main()

答: 暂无答案