Matrix Multiplication with MPI and CUDA - GPU Time Increases with Multiple GPUs on Supercomputer

Asked by: MrZack · Asked: 11/4/2023 · Updated: 11/4/2023 · Views: 47

Q:

I am working on a matrix multiplication task that uses MPI (Message Passing Interface) and CUDA for parallel processing. However, I have run into an unexpected problem while benchmarking its performance on a supercomputer: as I increase the number of GPUs used, the GPU processing time appears to increase rather than decrease, which is the opposite of what I expected.

I am looking for insights and solutions to optimize the performance of my matrix multiplication code when scaling it to multiple GPUs. Below is a snippet of the main part of my code for reference:


/* Headers assumed by this snippet; the globals (N, npes, mype, n_loc, offset,
   rest) and the helpers (seconds, fillArray, PreBbloc, prodCUBLAS, matPrint0)
   are defined elsewhere in the full program. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char *argv[])
{
    srand(time(NULL));
    /* VARIABLES FOR TIMING (several of these are accumulated with += in the
       loop below, so they must start at 0.0; reading them uninitialized is
       undefined behaviour) */
    double tf0 = 0.0, tf1 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t7 = 0.0, t8 = 0.0;
    double tmpf0, tmpf1, tmp1, tmp2, tmp3, tmp4, tmp7, tmp8;
    double tcy1 = 0.0, tcy2 = 0.0, tcy3 = 0.0, tcy4 = 0.0;
    double tmpcy1, tmpcy2, tmpcy3, tmpcy4;

    /* MPI INIT */
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_size(MPI_COMM_WORLD, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &mype);

    if (N % npes != 0)
    {
        printf("Number of allocated procs (npes = %d) does not divide the matrix size (N = %d)\n", npes, N);
        MPI_Abort(MPI_COMM_WORLD, -1);   /* abort all ranks, not just this one */
    }

    rest = N % npes;
    n_loc = N / npes;
    offset = mype * n_loc;


    /* CPU ALLOCATION */
    double *A = (double *)calloc(n_loc * N, sizeof(double));
    double *B = (double *)calloc(n_loc * N, sizeof(double));
    double *C = (double *)calloc(n_loc * N, sizeof(double));
    double *Bcol = (double *)calloc(N * n_loc, sizeof(double));
    double *Bbloc = (double *)calloc(n_loc * n_loc, sizeof(double));
    double *C_nai = (double *)calloc(n_loc * N, sizeof(double));

    /* SPECIFYING GPU IDs (assumes npes >= deviceCount and that consecutive
       global ranks are placed on the same node) */
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    int process_per_gpu = npes / deviceCount;
    int gpu_id = mype / process_per_gpu;
    cudaSetDevice(gpu_id);

    /* GPU ALLOCATION */
    double *Bcol_dev;
    cudaMalloc((void **)&Bcol_dev, (N * n_loc) * sizeof(double));
    double *A_dev;
    cudaMalloc((void **)&A_dev, (n_loc * N) * sizeof(double));
    double *C_dev;
    cudaMalloc((void **)&C_dev, (N * n_loc) * sizeof(double));

    /* FILLING A AND B USING CPUs */
    tf0 = seconds();
    fillArray(A, n_loc, N, mype);
    fillArray(B, n_loc, N, mype);
    tf1 = seconds();
    MPI_Reduce(&tf0, &tmpf0, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&tf1, &tmpf1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);


    /* COPYING A TO GPU */
    tcy1 = seconds();
    cudaMemcpy(A_dev, A, (n_loc * N) * sizeof(double), cudaMemcpyHostToDevice);
    tcy2 = seconds();
    MPI_Reduce(&tcy1, &tmpcy1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&tcy2, &tmpcy2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    /* CALLING CUBLAS */
    cublasHandle_t handle;
    cublasCreate(&handle);
    cudaDeviceSynchronize();
    for (int process = 0; process < npes; process++)
    {   
        t1 += seconds();
        PreBbloc(Bbloc, n_loc, n_loc, B, N, process);
        t2 += seconds();

        t3 += seconds();
        MPI_Allgather(Bbloc, n_loc * n_loc, MPI_DOUBLE, Bcol, n_loc * n_loc, MPI_DOUBLE, MPI_COMM_WORLD);
        t4 += seconds();

        /* COPYING Bcol TO GPU */
        tcy3 += seconds();
        cudaMemcpy(Bcol_dev, Bcol, (N * n_loc) * sizeof(double), cudaMemcpyHostToDevice);
        tcy4 += seconds();
        MPI_Reduce(&tcy3, &tmpcy3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&tcy4, &tmpcy4, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

        /* PRODUCT USING NATIVE METHOD ON CPU + MPI */
        // t5 = seconds();
        // prodNATIVE(A, Bcol, C_nai, process);
        // t6 = seconds();

        /* PRODUCT USING OPENBLAS ON CPU + MPI */
        //prodOBLAS(A, Bcol, Cblas, process);
 
        /* PRODUCT USING cuBLAS ON GPU */
        t7 += seconds();
        prodCUBLAS(A_dev, Bcol_dev, C_dev, process, n_loc, handle);
        t8 += seconds();
    }

    /* DESTROYING CUBLAS */
    cublasDestroy(handle);

    /* TIMING COMPUTATION AND COMMUNICATION*/
    MPI_Reduce(&t1, &tmp1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t2, &tmp2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t3, &tmp3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t4, &tmp4, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    // MPI_Reduce(&t5, &tmp5, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    // MPI_Reduce(&t6, &tmp6, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t7, &tmp7, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t8, &tmp8, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    /*COPY C RESULT FROM DEVICE TO HOST*/
    cudaMemcpy(C, C_dev, (n_loc * N) * sizeof(double), cudaMemcpyDeviceToHost);


    /* PRINT THE TIMING RESULTS */
    if (mype == 0)
    {
        double GPU_CPY_TIME = tmpcy2 - tmpcy1 + tmpcy4 - tmpcy3;
        double FILL_TIME = tmpf1 - tmpf0;
        double PRE_BLOCK_TIME = tmp2 - tmp1;
        double MPI_COMMUN_TIME = tmp4 - tmp3;
        //double COMPU_TIME_NATIVE = tmp6 - tmp5;
        double COMPU_TIME_CUBLAS = tmp8 - tmp7;

        FILE *fp = fopen("BENCHMARKS.txt", "a");
        fprintf(fp, "\n");
        fprintf(fp, "\n");
        fprintf(fp, "************************************************************\n");
        fprintf(fp, "                    RESULTS: N = (%d*%d)               \n", N, N);
        fprintf(fp, "************************************************************\n");
        fprintf(fp, "\n");
        fprintf(fp, "NUMBER OF NODES: %d \n", npes/4);
        fprintf(fp, "NUMBER OF PROCS: %d \n", npes);
        fprintf(fp, "FILL_TIME: %lf \n", FILL_TIME);
        fprintf(fp, "PREBBLOCK_TIME: %lf \n", PRE_BLOCK_TIME);
        fprintf(fp, "MPI_COMMUN TIME: %lf \n", MPI_COMMUN_TIME);
        fprintf(fp, "GPU_CPY_TIME: %lf \n", GPU_CPY_TIME);
        //fprintf(fp, "COMPU_TIME_NATIVE (CPU): %lf \n", COMPU_TIME_NATIVE);
        fprintf(fp, "COMPU_TIME_CUBLAS (GPU): %lf \n", COMPU_TIME_CUBLAS);        
        fprintf(fp, "\n");
        fprintf(fp, "------------------------------------------------------------\n");
        fprintf(fp, " T COMPU_TIME_CPU: %lf \n", FILL_TIME + PRE_BLOCK_TIME);
        fprintf(fp, " T COMPU_TIME_GPU: %lf \n", COMPU_TIME_CUBLAS);
        fprintf(fp, " T COMMUN_TIME (CPU + GPU): %lf \n", GPU_CPY_TIME + MPI_COMMUN_TIME);
        fprintf(fp, "------------------------------------------------------------\n");
        fprintf(fp, "\n");
        // fprintf(fp, "************************************************************\n");
        // fprintf(fp, "CUBLAS (GPU) IS %lf TIMES FASTER THAN NATIVE (CPU)!\n", COMPU_TIME_NATIVE / COMPU_TIME_CUBLAS);
        // fprintf(fp, "************************************************************\n");
        fclose(fp);
    }

    /* PRINTING A, B, C_native, C_cublas */
    // matPrint0(A, mype, "MATRIX A");
    // matPrint0(B, mype, "MATRIX B");
    // matPrint0(C_nai, mype, "A*B NATIVE CPU");
    // matPrint0(C, mype, "A*B cuBLAS GPU");

    /* FREEING THE ALLOCATED MEMORY ON CPU AND GPU*/
    free(A);
    free(B);
    free(C);
    free(C_nai);
    free(Bcol);
    free(Bbloc);
    cudaFree(A_dev);
    cudaFree(Bcol_dev);
    cudaFree(C_dev);
    MPI_Finalize();
    return 0;
}

I am doing the matrix multiplication on the GPU with cuBLAS!
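prodCUBLAS itself is not shown in the snippet above. As a sketch only (it assumes the host matrices are stored row-major, that each loop iteration writes the n_loc x n_loc block of the local C panel starting at column offset process * n_loc, and that N is the global matrix dimension), such a routine could boil down to a single cublasDgemm using the usual row-major/column-major transpose trick:

/* Hypothetical sketch of prodCUBLAS, NOT the code from the question:
   computes the row-major block C[:, process*n_loc .. (process+1)*n_loc)
   = A * Bcol on the device. Row-major C = A*Bcol is obtained by asking
   column-major cuBLAS for C^T = Bcol^T * A^T, hence the operand order. */
void prodCUBLAS(const double *A_dev, const double *Bcol_dev, double *C_dev,
                int process, int n_loc, cublasHandle_t handle)
{
    const double alpha = 1.0, beta = 0.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n_loc, n_loc, N,                       /* m, n, k            */
                &alpha,
                Bcol_dev, n_loc,                       /* Bcol^T, ld = n_loc */
                A_dev, N,                              /* A^T,    ld = N     */
                &beta,
                C_dev + (size_t)process * n_loc, N);   /* C^T block, ld = N  */
}

Whether this matches the real prodCUBLAS cannot be verified from the snippet; comparing C against the CPU reference C_nai is the quickest sanity check for the row-/column-major handling.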

I am not sure whether my timing is correct?
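One remark on the timing (assuming prodCUBLAS is essentially a cublasDgemm call): cuBLAS GEMMs are launched asynchronously, so the seconds() taken right after prodCUBLAS returns mostly measures the kernel launch, not the multiplication itself; the actual GPU work is then absorbed by the next blocking call, e.g. the following iteration's cudaMemcpy or the final device-to-host copy. A minimal sketch of a synchronized measurement, under that assumption:

        /* Sketch: synchronize before the stop stamp so that COMPU_TIME_CUBLAS
           reflects the GPU compute time rather than just the launch overhead. */
        t7 += seconds();
        prodCUBLAS(A_dev, Bcol_dev, C_dev, process, n_loc, handle);
        cudaDeviceSynchronize();   /* wait until the asynchronous GEMM has finished */
        t8 += seconds();

cudaEvent_t-based timing would be the more precise alternative; the pattern above is simply the smallest change to the loop shown in the question.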

C CUDA MPI OpenMP

Comments

2 upvotes · Simon Goater · 11/4/2023
Have you done any profiling of the overhead that using MPI and CUDA introduces? If each chunk of work is not substantially more time-consuming than its overhead cost, you will get worse performance as you scale further.
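To make that concrete with a hedged back-of-the-envelope model (assuming square N x N matrices and the 1-D row decomposition from the question, not measured data): the GEMM work per rank shrinks as 2*N^3/npes flops, the Bcol host-to-device traffic per rank stays at roughly 8*N^2 bytes no matter how many ranks there are, and the number of kernel launches, Allgathers and cudaMemcpy calls per rank grows linearly with npes, so fixed per-call overheads eventually dominate:

#include <stdio.h>

/* Rough scaling model of the loop in the question, not measured data. */
int main(void)
{
    const long long N = 8192;                       /* assumed matrix size */
    for (int npes = 1; npes <= 16; npes *= 2) {
        long long n_loc  = N / npes;
        double flops     = 2.0 * N * N * N / npes;                      /* GEMM flops per rank     */
        double h2d_bytes = (double)npes * N * n_loc * sizeof(double);   /* Bcol H2D bytes per rank */
        printf("npes=%2d  flops/rank=%.2e  H2D bytes/rank=%.2e  launches/rank=%d\n",
               npes, flops, h2d_bytes, npes);
    }
    return 0;
}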
0 upvotes · paleonix · 11/5/2023
Tracing all processes with Nsight Systems and overlaying the traces could be insightful. See "Profiling MPI codes with the CLI" and "Example: MPI".
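For reference, one common way to get one Nsight Systems report per rank, which can then be opened and overlaid in the GUI, is to launch nsys under the MPI launcher; the binary name and exact flags below are placeholders and depend on the site and Nsight Systems version:

mpirun -np 4 nsys profile --trace=cuda,mpi,nvtx \
    -o report.%q{OMPI_COMM_WORLD_RANK} ./matmul_mpi_cuda

With Slurm's srun, the per-rank file name would typically use SLURM_PROCID instead of OMPI_COMM_WORLD_RANK.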

Answers: No answers yet.