为什么在 OpenCL 中 “write_imageh” 会把向量数据写到意料之外的位置?

Why does "write_imageh" write the vector data to an unexpected position in OpenCL?

提问人:walter 提问时间:11/16/2023 最后编辑:walter 更新时间:11/20/2023 访问量:29

问:

我正在移动 GPU(mali-G77 MC9)上进行一些 OpenCL 测试,以计算一般矩阵乘法。我的主机端代码如下:

  1. 定义矩阵和输出缓冲区:
    int h = 8;
    int w = 8;
    int wo = 8;
    __fp16 * input = new __fp16[h*w];
    for(int i=0;i<h*w; i++)
    {
        input[i] = gen_random_float(-0.1, 0.1);
    }
    __fp16 * weight = new __fp16[w*wo];
    for(int i=0;i<w*wo; i++)
    {
        weight[i] = gen_random_float(-0.1, 0.1);
    }
    __fp16 * output = new __fp16[h*wo];
    memset(output, 0, sizeof(__fp16)*h*wo);
  1. 初始化 opencl 环境和内核。
  2. 配置参数并将其馈送到内核:
    // Allocate texture
    cl_image_format img_fmt;
    memset(&img_fmt, 0, sizeof(img_fmt));
    img_fmt.image_channel_order = CL_RGBA;
    img_fmt.image_channel_data_type = CL_HALF_FLOAT;
    cl_image_desc desc;
    memset(&desc, 0, sizeof(desc));
    desc.image_type = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width = int(w/4);
    desc.image_height = h;
    cl_mem b_input = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &img_fmt, &desc, input, &error);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    cl_mem b_weight = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(__fp16)*w*wo, weight, &error);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    memset(&img_fmt, 0, sizeof(img_fmt));
    img_fmt.image_channel_order = CL_RGBA;
    img_fmt.image_channel_data_type = CL_HALF_FLOAT;
    memset(&desc, 0, sizeof(desc));
    desc.image_type = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width = int(wo/4);
    desc.image_height = h;
    cl_mem b_output = clCreateImage(context, CL_MEM_READ_WRITE, &img_fmt, &desc, NULL, &error);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    // Enqueuing parameters
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 0, sizeof(cl_mem), &b_input);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 1, sizeof(int), &h);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 2, sizeof(int), &w);
    if (error != CL_SUCCESS) {        
        return -1;
    }
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 3, sizeof(int), &wo);
    if (error != CL_SUCCESS) {
        return -1;
    }
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 4, sizeof(cl_mem), &b_weight);
    if (error != CL_SUCCESS) {
        return -1;
    }
    error = clSetKernelArg(k_linear_opt_without_transpose_4x8_texture_test, 5, sizeof(cl_mem), &b_output);
    if (error != CL_SUCCESS) {
        return -1;
    }
  1. 启动内核
  2. 回读到输出缓冲区:
    size_t origin[] = {0, 0, 0};
    size_t region[] = {(size_t)(wo/4), (size_t)h, 1};
    size_t  rowPitch;
    size_t  slicePitch;
    __fp16 * p_output = (__fp16 *)clEnqueueMapImage(queue, b_output, CL_TRUE, CL_MAP_READ, origin, region, &rowPitch, &slicePitch, 0, NULL, NULL, &error);
    if (error != CL_SUCCESS) {
        return -1;
    }
    memcpy(output, p_output, sizeof(__fp16)*h*wo);
    error = clEnqueueUnmapMemObject(queue, b_output, p_output, 0, NULL, NULL);
    if (error != CL_SUCCESS) {
        return -1;
    }
  1. 检查输出缓冲区的数据:
for(int i=0;i<h*wo/2; i++)
    {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "output[%d]: %f, output[%d]: %f\n", i, output[i], h*wo-1-i, output[h*wo-1-i]);
    }
  1. 清理资源。

我的内核是:

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
/* File-scope sampler; the test kernel below does not reference it. */
const sampler_t default_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
/*
 * Addressing test kernel: work-item 0 writes four constant half4 texels
 * into the 2x2 top-left corner of `output`; all other work-items return
 * immediately. `input`, `h`, `w`, `wo` and `weight` are accepted but not
 * used by this test body.
 *
 * Texels written, as (x, y) -> value replicated over all four channels:
 *   (0, 0) -> 1,  (0, 1) -> 2,  (1, 0) -> 3,  (1, 1) -> 4
 */
__kernel void linear_opt_without_transpose_4x8_texture_test(__global __read_only image2d_t input,
int h, 
int w, 
int wo, 
__global half * weight,
__global __read_write image2d_t output)
{
  /* Only the work-item with global id 0 performs the writes. */
  const int gid = get_global_id(0);
  if (gid > 0)
  {
      return;
  }
  write_imageh(output, (int2)(0, 0), (half4)(1));
  write_imageh(output, (int2)(0, 1), (half4)(2));
  write_imageh(output, (int2)(1, 0), (half4)(3));
  write_imageh(output, (int2)(1, 1), (half4)(4));
}

输出数据为:

output[0]: 1.000000, output[63]: 0.000000
output[1]: 1.000000, output[62]: 0.000000
output[2]: 1.000000, output[61]: 0.000000
output[3]: 1.000000, output[60]: 0.000000
output[4]: 3.000000, output[59]: 0.000000
output[5]: 3.000000, output[58]: 0.000000
output[6]: 3.000000, output[57]: 0.000000
output[7]: 3.000000, output[56]: 0.000000
output[8]: 0.000000, output[55]: 0.000000
output[9]: 0.000000, output[54]: 0.000000
output[10]: 0.000000, output[53]: 0.000000
output[11]: 0.000000, output[52]: 0.000000
output[12]: 0.000000, output[51]: 0.000000
output[13]: 0.000000, output[50]: 0.000000
output[14]: 0.000000, output[49]: 0.000000
output[15]: 0.000000, output[48]: 0.000000
output[16]: 0.000000, output[47]: 0.000000
output[17]: 0.000000, output[46]: 0.000000
output[18]: 0.000000, output[45]: 0.000000
output[19]: 0.000000, output[44]: 0.000000
output[20]: 0.000000, output[43]: 0.000000
output[21]: 0.000000, output[42]: 0.000000
output[22]: 0.000000, output[41]: 0.000000
output[23]: 0.000000, output[40]: 0.000000
output[24]: 0.000000, output[39]: 4.000000
output[25]: 0.000000, output[38]: 4.000000
output[26]: 0.000000, output[37]: 4.000000
output[27]: 0.000000, output[36]: 4.000000
output[28]: 0.000000, output[35]: 2.000000
output[29]: 0.000000, output[34]: 2.000000
output[30]: 0.000000, output[33]: 2.000000
output[31]: 0.000000, output[32]: 2.000000

似乎数据已被写入:

v_value[0]: (0, 0)
v_value[1]: (0, 4) 
v_value[2]: (1, 0)
v_value[3]: (1, 4)

但我想要的是:

v_value[0]: (0, 0)
v_value[1]: (0, 1) 
v_value[2]: (1, 0)
v_value[3]: (1, 1)

这太奇怪了,谁能帮我一把?

非常感谢!

此致敬意

沃尔特

C 移动 OpenCL

评论

0赞 solidpixel 11/24/2023
驱动程序返回的 rowPitch 值是多少?你似乎忽略了它。
0赞 walter 11/28/2023
非常感谢 solidpixel!我有了一点发现:最初从图像对象中复制数据时,我依次使用了 clEnqueueMapImage、memcpy 和 clEnqueueUnmapMemObject,得到的数据很奇怪,如上所述。随后我改用 clEnqueueReadImage,结果就正确了。看来缓冲区与图像对象的数据布局有所不同。我仍在调查中。此致敬意

答: 暂无答案