提问人:Gagy Krayper 提问时间:5/3/2023 更新时间:5/3/2023 访问量:159
CUDA:GPU 上的矩阵反转速度比 CPU 上的慢
CUDA: Matrix inversion slower on GPU than on CPU
问:
我有一个稳定、简单的 Gauss-Jordan 算法,用于计算 CPU 上的矩阵反演。我尝试将此算法传输到 GPU,它工作正常,但速度显着下降,大约 10 倍。我知道我不太精通 C++ 和 CUDA,我很高兴听到一些建议。提前致谢。
安慰:
matrix size = 88; CPU time = 6531 mcs; GPU time = 52806 mcs;
以下是 CPU 的代码:
void GaussJordanInverse(float* original, float* temp, float* singular, float* bigmatrix, int size) {
//create temp
//create singular matrix
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++) {
temp[i * size + j] = original[i * size + j];
singular[i * size + j] = 0;
}
singular[i * size + i] = 1;
}
//create big matrix
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++)
{
bigmatrix[i * (size * 2) + j] = temp[i * size + j];
bigmatrix[i * (size * 2) + j + size] = singular[i * size + j];
}
}
//direct
for (int k = 0; k < size; k++)
{
for (int i = 0; i < 2 * size; i++) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
for (int i = k + 1; i < size; i++) {
float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
for (int j = 0; j < 2 * size; j++)bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
}
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++)
{
temp[i * size + j] = bigmatrix[i * (size * 2) + j];
}
}
}
//indirect
for (int k = size - 1; k > -1; k--)
{
for (int i = 2 * size - 1; i > -1; i--) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
for (int i = k - 1; i > -1; i--)
{
float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
for (int j = 2 * size - 1; j > -1; j--)
bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
}
}
//cut
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++)
{
singular[i * size + j] = bigmatrix[i * (size * 2) + j + size];
original[i * size + j] = singular[i * size + j];
}
}
}
以下是 GPU 的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <device_functions.h>
#include <stdio.h>
#include <chrono>
#include "matrixconversion.h"
using namespace std;
using namespace std::chrono;
__global__ void create(float* original, float* temp, float* singular, float* bigmatrix, int size) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
//create singular matrix
if (x < size && y < size)
{
temp[x * size + y] = original[x * size + y];
if (x == y) singular[x * size + y] = 1;
//singular[x * size + y] = 0;
//create big matrix
bigmatrix[x * (2 * size) + y] = temp[x * size + y];
bigmatrix[x * (2 * size) + y + size] = singular[x * size + y];
}
}
__global__ void direct_kernel(float* temp, float* bigmatrix, int size, int k) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < 2 * size) {
bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
}
__syncthreads();
if (x >= k + 1 && x < size * 2) {
float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
for (int j = 0; j < 2 * size; j++) {
bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
}
}
__syncthreads();
if (x < size) {
for (int j = 0; j < size; j++) {
temp[x * size + j] = bigmatrix[x * (size * 2) + j];
}
}
}
__global__ void indirect_kernel(float* temp, float* bigmatrix, int size, int k) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < 2 * size) {
bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
}
__syncthreads();
if (x < k&& x >= 0) {
float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
for (int j = 2 * size - 1; j > -1; j--) {
bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
}
}
__syncthreads();
}
__global__ void cut(float* original, float* singular, float* bigmatrix, int size) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < size && y < size) {
singular[x * size + y] = bigmatrix[x * (size * 2) + y + size];
original[x * size + y] = singular[x * size + y];
}
}
void GPUInverse(float* copy, float* original, int size, long int* time) {
Copy1DFloat(copy, original, size);
//set kernels
int tr = 16;
int bl = (int)ceil(size / tr);
dim3 grid_size(16, 16);
dim3 block_size(bl, bl);
//create temp
float* d_copy, * d_temp, * d_singular, * d_bigmatrix;
//memory
cudaMalloc((void**)&d_copy, size * size * sizeof(float));
cudaMalloc((void**)&d_temp, size * size * sizeof(float));
cudaMalloc((void**)&d_singular, size * size * sizeof(float));
cudaMalloc((void**)&d_bigmatrix, size * (size * 2) * sizeof(float));
cudaMemset(d_copy, 0, size * size * sizeof(float));
cudaMemset(d_temp, 0, size * size * sizeof(float));
cudaMemset(d_singular, 0, size * size * sizeof(float));
cudaMemset(d_bigmatrix, 0, size * (2 * size) * sizeof(float));
//to device
cudaMemcpy(d_copy, copy, size * size * sizeof(float), cudaMemcpyHostToDevice);
auto start = high_resolution_clock::now();
//fill temp
create <<<grid_size, block_size >>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
cudaDeviceSynchronize();
//direct
for (int k = 0; k < size; k++) {
direct_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
cudaDeviceSynchronize();
}
//indirect
for (int k = size - 1; k > -1; k--) {
indirect_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
cudaDeviceSynchronize();
}
//cut
cut <<<grid_size, block_size >>> (d_copy, d_singular, d_bigmatrix, size);
cudaDeviceSynchronize();
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
*time = duration.count();
//from device
cudaMemcpy(copy, d_copy, size * size * sizeof(float), cudaMemcpyDeviceToHost);
//clean
cudaFree(d_copy);
cudaFree(d_temp);
cudaFree(d_singular);
cudaFree(d_bigmatrix);
}
答:
1赞
Gagy Krayper
5/3/2023
#1
感谢paleonix的建议。事实上,您拥有的数据越多,您就越能看到 CPU 和 GPU 之间的加速差异。此外,删除有助于进一步加快工作速度。现在控制台如下所示:cudaDeviceSynchronize()
matrix size = 88; CPU time = 7096 mcs; GPU time = 418 mcs;
matrix size = 1000; CPU time = 9834535 mcs; GPU time = 4347 mcs;
评论
threadIdx.x
x
y
high_resolution_clock
,使用steady_clock
进行基准测试,使用 CUDA 运行时 API 检查错误的规范方法是什么?__syncthreads()
仅在每个线程块之间同步。您的使用情况看起来就像您假设它同步了整个网格。cudaDeviceSynchronize()
size == 88
size
size