Asked by rbaleksandar on 10/31/2023 · Last edited by rbaleksandar · Updated 11/8/2023 · Viewed 84 times
How to convert a cudaArray to a Torch tensor?
Q:
I am trying to use Torch together with CUDA. So far, using torch::from_blob(), I was able to do the following:
#include <cuda_runtime.h>
#include <torch/torch.h>
#include <iostream>
#include <exception>
#include <memory>
#include <math.h>
using std::cout;
using std::endl;
using std::exception;
/*
* Demonstration of interoperability between CUDA and Torch C++ API using
* pinned memory.
*
* Using the ENABLE_ERROR variable a change in the result (CUDA) can be
* introduced through its respective Torch tensor. This will also affect
* the copied data from GPU to CPU, resulting in an error during assert
* checks at the end
*/
// Contains the call to the CUDA kernel
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size);
bool ENABLE_ERROR = false;
int main(int argc, const char* argv[])
{
// Setup array, here 2^16 = 65536 items
const int N = 1 << 16;
size_t bytes = N * sizeof(int);
// Declare pinned memory pointers
int* a_cpu, * b_cpu, * c_cpu;
// Allocate pinned memory for the pointers
// The memory will be accessible from both CPU and GPU
// without the requirement to copy data from one device
// to the other
cout << "Allocating memory for vectors on CPU" << endl;
cudaMallocHost(&a_cpu, bytes);
cudaMallocHost(&b_cpu, bytes);
cudaMallocHost(&c_cpu, bytes);
// Init vectors
cout << "Populating vectors with random integers" << endl;
for (int i = 0; i < N; ++i)
{
a_cpu[i] = rand() % 100;
b_cpu[i] = rand() % 100;
}
// Declare GPU memory pointers
int* a_gpu, * b_gpu, * c_gpu;
// Allocate memory on the device
cout << "Allocating memory for vectors on GPU" << endl;
cudaMalloc(&a_gpu, bytes);
cudaMalloc(&b_gpu, bytes);
cudaMalloc(&c_gpu, bytes);
// Copy data from the host to the device (CPU -> GPU)
cout << "Transfering vectors from CPU to GPU" << endl;
cudaMemcpy(a_gpu, a_cpu, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(b_gpu, b_cpu, bytes, cudaMemcpyHostToDevice);
// Specify threads per CUDA block (CTA), here 2^10 = 1024 threads
int NUM_THREADS = 1 << 10;
// CTAs per grid
int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
// Call CUDA kernel
cout << "Running CUDA kernels" << endl;
vector_add(a_gpu, b_gpu, c_gpu, N, NUM_BLOCKS, NUM_THREADS);
try
{
// Convert pinned memory on GPU to Torch tensor on GPU
auto options = torch::TensorOptions().dtype(torch::kInt).device(torch::kCUDA, 0).pinned_memory(true);
cout << "Converting vectors and result to Torch tensors on GPU" << endl;
torch::Tensor a_gpu_tensor = torch::from_blob(a_gpu, { N }, options);
torch::Tensor b_gpu_tensor = torch::from_blob(b_gpu, { N }, options);
torch::Tensor c_gpu_tensor = torch::from_blob(c_gpu, { N }, options);
cout << "Verifying result using Torch tensors" << endl;
if (ENABLE_ERROR)
{
/*
TEST
Changing the value of the result should result in two things:
- the GPU memory will be modified
- the CPU test later on (after the GPU memory is copied to the CPU side) should fail
*/
cout << "ERROR GENERATION ENABLED! Application will crash during verification of results" << endl;
cout << "Changing result first element from " << c_gpu_tensor[0];
c_gpu_tensor[0] = 99999999;
cout << " to " << c_gpu_tensor[0] << endl;
}
else
{
assert(c_gpu_tensor.equal(a_gpu_tensor.add(b_gpu_tensor)) == true);
}
}
catch (exception& e)
{
cout << e.what() << endl;
cudaFreeHost(a_cpu);
cudaFreeHost(b_cpu);
cudaFreeHost(c_cpu);
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
return 1;
}
// Copy the result from the device back to the host; cudaMemcpy also synchronizes implicitly
cout << "Synchronizing CPU and GPU. Copying result from GPU to CPU" << endl;
cudaMemcpy(c_cpu, c_gpu, bytes, cudaMemcpyDeviceToHost);
// Verify the result on the CPU
cout << "Verifying result on CPU" << endl;
for (int i = 0; i < N; ++i)
{
assert(c_cpu[i] == a_cpu[i] + b_cpu[i]);
}
cudaFreeHost(a_cpu);
cudaFreeHost(b_cpu);
cudaFreeHost(c_cpu);
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
return 0;
}
using the kernel
__global__ void vector_add_kernel(int* a, int* b, int* c, int N)
{
// Calculate global thread ID
int t_id = (blockDim.x * blockIdx.x) + threadIdx.x;
// Check boundary
if (t_id < N)
{
c[t_id] = a[t_id] + b[t_id];
}
}
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size)
{
vector_add_kernel<<<cuda_grid_size, cuda_block_size>>>(a, b, c, N);
// Check for kernel launch errors (the return value should be inspected)
cudaGetLastError();
}
The code above uses pinned memory (for fast transfers between CPU and GPU) and performs an addition of two vectors using the respective kernel. In addition, I convert the GPU memory blocks used for those vectors into libtorch tensors, while staying on the GPU, and perform the same operation with the tensors. I even added a small "error" switch that lets me verify that the data I originally allocated (without tensors) indeed changes when the tensors are manipulated.
I also managed to successfully use torch::from_blob() with a cv::Mat's data member, which is a void pointer to the pixel data of an OpenCV image, e.g.
auto tensor_input = torch::from_blob(img_torch.data, { 1, img_torch.size().height, img_torch.size().width, 1 }, torch::kFloat32);
tensor_input = tensor_input.permute({ 0, 3, 1, 2 });
for a BGRA (PNG) image that I had to convert to CV_32FC3 (so it works with my ML model), with some adjustments to the tensor shape above using permute().
I was not able to do the same with a cudaArray, and I am wondering whether it is possible at all.
The reason I am using a cudaArray is that, as described for this type, I am storing a texture (in my case a D3D11 2D texture) that I need to process. I am actually able to do that with a pure CUDA kernel I wrote myself, together with a cudaSurfaceObject_t, which I doubt I can pass to libtorch in any shape or form.
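For context, the pure CUDA route looks roughly like this. A minimal sketch, assuming an 8-bit RGBA texture whose cudaArray (cu_arr, as in the pseudo-code below) was registered with surface load/store support; the kernel name and launch details are illustrative:
__global__ void process_texture(cudaSurfaceObject_t surf, int width, int height)
{
    int x = (blockDim.x * blockIdx.x) + threadIdx.x;
    int y = (blockDim.y * blockIdx.y) + threadIdx.y;
    if (x < width && y < height)
    {
        // The x coordinate of surf2Dread()/surf2Dwrite() is given in bytes
        uchar4 texel = surf2Dread<uchar4>(surf, x * sizeof(uchar4), y);
        // ... process the texel ...
        surf2Dwrite(texel, surf, x * sizeof(uchar4), y);
    }
}

// Host side: wrap the mapped cudaArray in a surface object
cudaResourceDesc res_desc = {};
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = cu_arr;
cudaSurfaceObject_t surf = 0;
cudaCreateSurfaceObject(&surf, &res_desc);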
I am looking for something along the lines of (pseudo-code):
// Register cudaGraphicsResource* cu_arr_interop using cudaGraphicsMapResources(...)
...
// Map the texture's texels to a CUDA array
cudaArray* cu_arr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);
// Convert the CUDA array to a Torch tensor
auto options = torch::TensorOptions().dtype(...).device(torch::kCUDA, 0).pinned_memory(true);
auto tensor_in = torch::from_blob((void*)cu_arr, { ... }, options);
// Run ML model
auto tensor_out = module.forward({ tensor_in }).toTensor();
// See result on screen
...
// cudaGraphicsUnmapResources(...)
A:
Following the comments, I managed to map the data from a CUDA array through an intermediate libtorch tensor and back to a CUDA array. The intermediate libtorch tensor is fully functional.
Code for CUDA array to libtorch tensor:
cudaError_t cr = cudaSuccess;
// Allocate linear CUDA memory
void* copy = nullptr;
cr = cudaMalloc(&copy, dpitch * height);
if (cr != cudaSuccess)
{
...
}
// Copying the input CUDA array to the flat CUDA memory
cr = cudaMemcpy2DFromArray(copy, dpitch, array_read, 0, 0, dpitch, height, cudaMemcpyDeviceToDevice);
if (cr != cudaSuccess)
{
...
}
// Setup tensor that maps the flat CUDA memory so that it can be used in libtorch
at::Tensor tensor_in;
auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA, 0).pinned_memory(true);
// Map memory as a HEIGHTxWIDTHxCHANNELS tensor that will represent the image with its 4 channels
tensor_in = torch::from_blob(copy, { height, width, 4 }, options);
// Permute so that the channels are the first dimension. This allows extracting the pixel data per channel as a separate tensor
tensor_in = tensor_in.permute({2, 0, 1});
Further transformations depend on the model that will run inference on the given tensor. The permutation above allows each channel to be extracted as a separate tensor. In my case I had to do some additional transformations to make the tensors compatible with my model, e.g.
// Extract channels and convert to tensors that are compatible with the expected input for the ML
at::Tensor tensor_in_R, tensor_in_G, tensor_in_B, tensor_in_A;
tensor_in_R= tensor_in[0].div(255.0).unsqueeze(0).unsqueeze(0).to(torch::kFloat32);
tensor_in_G = ...
tensor_in_B = ...
tensor_in_A = ...
When copying back with
// Copy tensor to the CUDA output array
cr = cudaMemcpy2DToArray(array_write,
0, 0,
tensor_out.data_ptr(),
dpitch, dpitch,
height, cudaMemcpyDeviceToDevice);
it does not work with dpitch equal to width * sizeof(unsigned char) * 4.
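A possible way around guessing the pitch by hand is to let the driver choose it for the intermediate buffer. A sketch using the same variable names as above; note that if the returned pitch differs from width * 4, the torch::from_blob() call would additionally need explicit strides:
void* copy = nullptr;
size_t pitch = 0; // filled in by the driver with an aligned row stride in bytes
cudaMallocPitch(&copy, &pitch, width * 4 * sizeof(unsigned char), height);
// Use the returned pitch consistently in both directions of the 2D copy
cudaMemcpy2DFromArray(copy, pitch, array_read, 0, 0,
                      width * 4 * sizeof(unsigned char), height,
                      cudaMemcpyDeviceToDevice);
...
cudaMemcpy2DToArray(array_write, 0, 0, copy, pitch,
                    width * 4 * sizeof(unsigned char), height,
                    cudaMemcpyDeviceToDevice);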
The output tensor (the result of the inference) needs to be post-processed: (un)squeeze dimensions where necessary, permute, convert back to the original data format (torch::kUInt8 in my case) and so on.
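For a single channel this can look as follows (a sketch, assuming the model returns a [1, 1, H, W] float tensor with values in [0, 1]; the name tensor_out_R is illustrative):
at::Tensor channel_out = tensor_out_R
    .squeeze(0)         // drop the batch dimension -> [1, H, W]
    .squeeze(0)         // drop the channel dimension -> [H, W]
    .mul(255.0)         // scale back to the 0-255 range
    .clamp(0, 255)
    .to(torch::kUInt8); // convert back to the original data format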
Two steps turned out to be really important, namely:
Merging - if you split the image and process each channel separately, you have to merge the results (here R, G, B and A) back together. In my case I did that with cat(), which concatenates tensors along an existing dimension.
tensor_out = torch::cat({ tensor_out_R.unsqueeze(0), tensor_out_G.unsqueeze(0), tensor_out_B.unsqueeze(0), tensor_in_processed[3].unsqueeze(0) }).permute({ 2, 0, 1 });
Flattening - it took me about two days to figure out this rather obvious necessity. The tensor has a different memory layout, so in order to copy it back to the CUDA array it needs to be flattened. flatten() with its default arguments squashes the given tensor into a one-dimensional array. By doing so you can even reuse the same pitch as in cudaMemcpy2DFromArray(), provided the format (incl. dimensions) of the input CUDA array is the same as that of the output, as sketched below.
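A sketch of that last step, with the extra assumption that contiguous() is called first so the permuted view is materialized into actual row-major memory before flattening:
// contiguous() forces a real row-major copy of the permuted view;
// flatten() then collapses it into the one-dimensional layout the
// raw 2D copy expects
tensor_out = tensor_out.contiguous().flatten();
cr = cudaMemcpy2DToArray(array_write, 0, 0, tensor_out.data_ptr(),
                         dpitch, dpitch, height, cudaMemcpyDeviceToDevice);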
Since libtorch's documentation is quite poor when it comes to exceptions and error handling, I suggest dumping intermediate results from C++ into serialized tensor files. These can then be loaded in Python using
t_from_cpp = list(torch.jit.load('tensor_cpp_dump.pt').parameters())[0]
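The corresponding dump on the C++ side can be as simple as the following (a sketch; torch::save() serializes the tensor as a jit module, which is why the Python snippet above reads it back through parameters()):
#include <torch/torch.h>
// Moving to the CPU first keeps the dump loadable on machines without a GPU
torch::save(tensor_in.cpu(), "tensor_cpp_dump.pt");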
You can visualize the tensor or save it as an image using torchvision.transforms (e.g. ToPILImage()). Checking the shapes and trying out various transformations of the tensors offers a quick way to arrive at a solution, which you can then port to C++. For comparison, running inference in C++ gives zero feedback when an error occurs, while in PyTorch you often get a good description of what went wrong, including a full trace.
Whenever I see the channels displayed as separate images, or some other oddity, I always assume that the memory is aligned or read in the wrong order. I am almost certain that the double use of dpitch as an argument to cudaMemcpy2DToArray() is the culprit here. What value actually belongs there is a mystery to me.