CUDA:GPU 上的矩阵反转速度比 CPU 上的慢

CUDA: Matrix inversion slower on GPU than on CPU

提问人:Gagy Krayper 提问时间:5/3/2023 更新时间:5/3/2023 访问量:159

问:

我有一个稳定、简单的 Gauss-Jordan 算法,用于计算 CPU 上的矩阵反演。我尝试将此算法传输到 GPU,它工作正常,但速度显着下降,大约 10 倍。我知道我不太精通 C++ 和 CUDA,我很高兴听到一些建议。提前致谢。

安慰:

matrix size = 88; CPU time = 6531 mcs; GPU time = 52806 mcs;

以下是 CPU 的代码:

void GaussJordanInverse(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    //create temp
    //create singular matrix
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++) {
            temp[i * size + j] = original[i * size + j];
            singular[i * size + j] = 0; 
        }
        singular[i * size + i] = 1;
    }
    //create big matrix
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            bigmatrix[i * (size * 2) + j] = temp[i * size + j];
            bigmatrix[i * (size * 2) + j + size] = singular[i * size + j];
        }
    }
    //direct
    for (int k = 0; k < size; k++)
    {
        for (int i = 0; i < 2 * size; i++) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k + 1; i < size; i++) {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 0; j < 2 * size; j++)bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++)
            {
                temp[i * size + j] = bigmatrix[i * (size * 2) + j];
            }
        }
    }
    //indirect
    for (int k = size - 1; k > -1; k--)
    {
        for (int i = 2 * size - 1; i > -1; i--) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k - 1; i > -1; i--)
        {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 2 * size - 1; j > -1; j--)
                bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
    }
    //cut
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            singular[i * size + j] = bigmatrix[i * (size * 2) + j + size];
            original[i * size + j] = singular[i * size + j];
        }
    }
}

以下是 GPU 的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#ifndef __CUDACC__  
#define __CUDACC__
#endif
#include <device_functions.h>
#include <stdio.h>
#include <chrono>
#include "matrixconversion.h"
using namespace std;
using namespace std::chrono;

__global__ void create(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    //create singular matrix
    if (x < size && y < size)
    {
        temp[x * size + y] = original[x * size + y];
        if (x == y) singular[x * size + y] = 1;
        //singular[x * size + y] = 0;
        //create big matrix
        bigmatrix[x * (2 * size) + y] = temp[x * size + y];
        bigmatrix[x * (2 * size) + y + size] = singular[x * size + y];
    }
}
__global__ void direct_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    if (x >= k + 1 && x < size * 2) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 0; j < 2 * size; j++) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
    if (x < size) {
        for (int j = 0; j < size; j++) {
            temp[x * size + j] = bigmatrix[x * (size * 2) + j];
        }
    }
}
__global__ void indirect_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    if (x < k&& x >= 0) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 2 * size - 1; j > -1; j--) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
}

__global__ void cut(float* original, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < size && y < size) {
        singular[x * size + y] = bigmatrix[x * (size * 2) + y + size];
        original[x * size + y] = singular[x * size + y];
    }
}

void GPUInverse(float* copy, float* original, int size, long int* time) {
    Copy1DFloat(copy, original, size);
    //set kernels
    int tr = 16;
    int bl = (int)ceil(size / tr);
    dim3 grid_size(16, 16);
    dim3 block_size(bl, bl);
    //create temp
    float* d_copy, * d_temp, * d_singular, * d_bigmatrix;
    //memory
    cudaMalloc((void**)&d_copy, size * size * sizeof(float));
    cudaMalloc((void**)&d_temp, size * size * sizeof(float));
    cudaMalloc((void**)&d_singular, size * size * sizeof(float));
    cudaMalloc((void**)&d_bigmatrix, size * (size * 2) * sizeof(float));
    cudaMemset(d_copy, 0, size * size * sizeof(float));
    cudaMemset(d_temp, 0, size * size * sizeof(float));
    cudaMemset(d_singular, 0, size * size * sizeof(float));
    cudaMemset(d_bigmatrix, 0, size * (2 * size) * sizeof(float));
    //to device
    cudaMemcpy(d_copy, copy, size * size * sizeof(float), cudaMemcpyHostToDevice);

    auto start = high_resolution_clock::now();
    //fill temp
    create <<<grid_size, block_size >>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();
    //direct
    for (int k = 0; k < size; k++) {
        direct_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //indirect
    for (int k = size - 1; k > -1; k--) {
        indirect_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //cut
    cut <<<grid_size, block_size >>> (d_copy, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    *time = duration.count();
    //from device
    cudaMemcpy(copy, d_copy, size * size * sizeof(float), cudaMemcpyDeviceToHost);
    //clean

    cudaFree(d_copy);
    cudaFree(d_temp);
    cudaFree(d_singular);
    cudaFree(d_bigmatrix);
}
C++ CUDA GPU 矩阵逆向

评论

0赞 paleonix 5/3/2023
对于合并,应尽可能使用矩阵的内部维度。也就是说,在内核中交换和交换可能会提供一些简单的加速。threadIdx.xxy
0赞 paleonix 5/3/2023
题外话,但仅供参考 避免high_resolution_clock,使用 steady_clock 进行基准测试使用 CUDA 运行时 API 检查错误的规范方法是什么?
0赞 paleonix 5/3/2023
__syncthreads()仅在每个线程块之间同步。您的使用情况看起来就像您假设它同步了整个网格。
1赞 paleonix 5/3/2023
避免在内核调用之间。只要所有内核都在同一(默认)流上启动,就不需要在它们之间同步。cudaDeviceSynchronize()
7赞 paleonix 5/3/2023
GPU 解决方案可能永远不会比 CPU 解决方案更快,只是没有足够的并行性。如果有的话,您将需要比较更大的问题规模。但是,由于主机上的顺序循环也随着增长而增长(即内核启动开销的总和也会随着增长而增长),我通常不太乐观。size == 88sizesize

答:

1赞 Gagy Krayper 5/3/2023 #1

感谢paleonix的建议。事实上,您拥有的数据越多,您就越能看到 CPU 和 GPU 之间的加速差异。此外,删除有助于进一步加快工作速度。现在控制台如下所示:cudaDeviceSynchronize()

matrix size = 88; CPU time = 7096 mcs; GPU time = 418 mcs;
matrix size = 1000; CPU time = 9834535 mcs; GPU time = 4347 mcs;