第3章 Basic - 基础功能测试-CSDN博客

Basic 测试套件是 OpenCL-CTS 中最核心的测试模块，包含 100+ 个测试用例，涵盖 OpenCL 的基础功能、内存模型、数据类型和控制流等关键特性。

3.1 主机指针操作 (hostptr)

3.1.1 测试目标

验证 OpenCL 使用主机内存指针（host pointer）创建缓冲区对象的能力，确保：

CL_MEM_USE_HOST_PTR 标志正确工作
主机内存和设备内存之间的数据一致性
内核可以直接访问主机分配的内存

3.1.2 测试内核代码

// Element-wise addition: work-item i stores srcA[i] + srcB[i] into dst[i].
__kernel void test_hostptr(__global float *srcA,
                           __global float *srcB,
                           __global float *dst)
{
    // One element per work-item, indexed along dimension 0.
    const int gid = get_global_id(0);
    const float lhs = srcA[gid];
    const float rhs = srcB[gid];

    dst[gid] = lhs + rhs;
}

内核功能：对两个输入数组执行逐元素相加操作。

3.1.3 测试流程

/*
 * Runs the test_hostptr kernel on buffers created with CL_MEM_USE_HOST_PTR
 * and verifies the result through the host allocation.
 *
 * device        Not used by this function.
 * context       Context used to create the buffers and build the kernel.
 * queue         Queue the kernel, map and unmap commands are enqueued on.
 * num_elements  Number of floats in each input array and in the output.
 *
 * Returns the value of verify_hostptr() when every allocation and OpenCL
 * call succeeds, or -1 if any of them fails. All host memory and OpenCL
 * objects created here are released before returning.
 */
int test_hostptr(cl_device_id device, cl_context context,
                 cl_command_queue queue, int num_elements)
{
    int result = -1;
    cl_int err = CL_SUCCESS;
    const char *step = "host allocation"; /* name of the step in flight */
    float *input_ptr[2] = { NULL, NULL };
    float *output_ptr = NULL;
    cl_mem buffers[3] = { NULL, NULL, NULL };
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    void *mapped_ptr = NULL;
    size_t global_size;
    size_t nbytes;
    int i;

    (void)device;

    if (num_elements <= 0)
        return -1;

    global_size = (size_t)num_elements;
    nbytes = sizeof(float) * global_size;

    /* 1. Allocate host memory (casts kept so the code also builds as C++). */
    input_ptr[0] = (float *)malloc(nbytes);
    input_ptr[1] = (float *)malloc(nbytes);
    output_ptr = (float *)malloc(nbytes);
    if (input_ptr[0] == NULL || input_ptr[1] == NULL || output_ptr == NULL)
        goto cleanup;

    /* 2. Fill the inputs with random data.
     * NOTE(review): `d` is not declared in this function; presumably it is
     * the harness random state defined elsewhere - confirm. */
    make_random_data(num_elements, input_ptr[0], d);
    make_random_data(num_elements, input_ptr[1], d);

    /* 3. Create the buffers on top of the host allocations. */
    step = "clCreateBuffer";
    buffers[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                nbytes, input_ptr[0], &err);
    if (err != CL_SUCCESS)
        goto cleanup;
    buffers[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                nbytes, input_ptr[1], &err);
    if (err != CL_SUCCESS)
        goto cleanup;
    buffers[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                nbytes, output_ptr, &err);
    if (err != CL_SUCCESS)
        goto cleanup;

    /* 4. Build the program and create the kernel. */
    step = "create_single_kernel_helper";
    err = create_single_kernel_helper(context, &program, &kernel, 1,
                                      &hostptr_kernel_code, "test_hostptr");
    if (err != 0)
        goto cleanup;

    /* 5. Bind the three buffers to kernel arguments 0..2. */
    step = "clSetKernelArg";
    for (i = 0; i < 3; i++) {
        err = clSetKernelArg(kernel, (cl_uint)i, sizeof(cl_mem), &buffers[i]);
        if (err != CL_SUCCESS)
            goto cleanup;
    }

    /* 6. Launch one work-item per element. */
    step = "clEnqueueNDRangeKernel";
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS)
        goto cleanup;

    /* 7. Blocking map of the output buffer: once it returns, the kernel's
     * writes are visible through the host allocation. */
    step = "clEnqueueMapBuffer";
    mapped_ptr = clEnqueueMapBuffer(queue, buffers[2], CL_TRUE, CL_MAP_READ,
                                    0, nbytes, 0, NULL, NULL, &err);
    if (err != CL_SUCCESS || mapped_ptr == NULL) {
        mapped_ptr = NULL;
        goto cleanup;
    }

    /* 8. Verify through output_ptr: the buffer was created with
     * CL_MEM_USE_HOST_PTR, so the mapped region is the host allocation. */
    step = NULL;
    result = verify_hostptr(input_ptr[0], input_ptr[1], output_ptr, num_elements);

cleanup:
    if (step != NULL)
        log_error("test_hostptr: %s failed (error %d)\n", step, (int)err);

    /* 9. Release everything that was created, in reverse order of use. */
    if (mapped_ptr != NULL
        && clEnqueueUnmapMemObject(queue, buffers[2], mapped_ptr,
                                   0, NULL, NULL) != CL_SUCCESS)
        result = -1;

    /* The unmap is asynchronous; wait before the host memory goes away. */
    if (clFinish(queue) != CL_SUCCESS)
        result = -1;

    if (kernel != NULL)
        clReleaseKernel(kernel);
    if (program != NULL)
        clReleaseProgram(program);
    for (i = 0; i < 3; i++) {
        if (buffers[i] != NULL)
            clReleaseMemObject(buffers[i]);
    }

    free(input_ptr[0]);
    free(input_ptr[1]);
    free(output_ptr);

    return result;
}

3.1.4 关键验证点

内存标志验证：CL_MEM_USE_HOST_PTR 确保使用主机提供的内存
数据一致性：内核修改后的数据在主机端可见
内存映射：通过 clEnqueueMapBuffer 访问设备内存
性能考量：避免不必要的数据拷贝

3.1.5 常见失败原因

内存对齐问题：主机指针未按设备要求对齐
标志冲突：CL_MEM_USE_HOST_PTR 与 CL_MEM_COPY_HOST_PTR 混用
数据竞争：未正确同步主机和设备访问
内存泄漏：忘记 unmap 或释放资源

3.2 浮点运算测试

3.2.1 测试概述

浮点运算测试验证 OpenCL 设备对 IEEE 754 浮点标准的支持程度，包括：

基本算术运算（加减乘除）
特殊值处理（NaN、Inf、-0.0）
舍入模式
向量类型运算

3.2.2 fpmath_float - 单精度标量浮点

测试内核模板

// Kernel template: TYPE and OP are placeholders substituted before the
// kernel is built. Each work-item applies OP to one pair of elements.
__kernel void test_fp(__global TYPE *srcA,
                      __global TYPE *srcB,
                      __global TYPE *dst)
{
    const int gid = get_global_id(0);

    dst[gid] = srcA[gid] OP srcB[gid];
}

TYPE：float, float2, float4 等
OP：+, -, *, /

测试操作定义

// One arithmetic operation under test: the operator character and the
// host-side function that computes the reference result.
struct TestDef {
    const char op;                          // operator character
    std::function<float(float, float)> ref; // host reference implementation
};

// The four basic arithmetic operations, in the order + - * /.
TestDef tests[] = {
    { '+', [](float lhs, float rhs) -> float { return lhs + rhs; } },
    { '-', [](float lhs, float rhs) -> float { return lhs - rhs; } },
    { '*', [](float lhs, float rhs) -> float { return lhs * rhs; } },
    { '/', [](float lhs, float rhs) -> float { return lhs / rhs; } }
};

随机数据生成

void generate_random_inputs(std::vector<cl_float> (&input)[2])
{
    RandomSeed seed(gRandomSeed);
    
    auto random_generator = [&seed]() {
        return get_random_float(
            -MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),  // -2^31
             MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),  //  2^31
            seed
        );
    };
    
    for (auto &v : input) {
        std::generate(v.begin(), v.end(), random_generator);
    }
}

结果验证

// Compares the device output against the host reference for one operation.
//
// input   The two operand vectors; each must hold at least output.size()
//         elements.
// output  Results read back from the device.
// test    Operation under test (operator character and reference function).
//
// Returns 0 when every element matches, -1 on the first mismatch or when
// an input vector is shorter than the output.
static int verify_fp(std::vector<float> (&input)[2],
                     std::vector<float> &output,
                     const TestDef &test)
{
    const std::vector<float> &inA = input[0];
    const std::vector<float> &inB = input[1];

    // Guard against reading past the end of a short input vector.
    if (inA.size() < output.size() || inB.size() < output.size())
    {
        log_error("FP '%c' float test failed: input is shorter than output\n",
                  test.op);
        return -1;
    }

    for (size_t i = 0; i < output.size(); i++)
    {
        const float expected = test.ref(inA[i], inB[i]);
        const float actual = output[i];

        // NaN never compares equal to anything, itself included, so a NaN
        // reference (e.g. 0/0) must be matched explicitly. `x != x` is
        // true only for NaN.
        const bool both_nan = (expected != expected) && (actual != actual);

        if (expected != actual && !both_nan)
        {
            log_error("FP '%c' float test failed at index %d: "
                      "expected %f, got %f\n",
                      test.op, static_cast<int>(i), expected, actual);
            return -1;
        }
    }

    log_info("FP '%c' float test passed\n", test.op);
    return 0;
}

3.2.3 fpmath_float2 / fpmath_float4 - 向量浮点

向量类型测试扩展了标量测试，验证 SIMD 操作：

// float2 instantiation of the template: adds one pair of two-component
// vectors per work-item.
__kernel void test_fp(__global float2 *srcA,
                      __global float2 *srcB,
                      __global float2 *dst)
{
    const int gid = get_global_id(0);
    const float2 lhs = srcA[gid];
    const float2 rhs = srcB[gid];

    dst[gid] = lhs + rhs; // component-wise vector addition
}

向量操作特点：

逐分量运算
支持 float2, float4, float8, float16
SIMD 优化潜力

3.2.4 舍入模式测试

// Skeleton of the fpmath test driver showing only the rounding-mode
// handling; the parameter list and the test body are elided in the article.
int test_fpmath(...)
{
    // isRTZ records whether the rounding mode was changed below, so that it
    // is restored only in that case.
    int isRTZ = 0;
    RoundingMode oldMode = kDefaultRoundingMode;
    
    // If the device reports round-toward-zero as its default rounding mode,
    // remember the current mode and switch to round-toward-zero.
    // NOTE(review): presumably set_round() changes the host rounding mode so
    // that the reference results match the device - confirm.
    if (CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device))
    {
        isRTZ = 1;
        oldMode = get_round();
        set_round(kRoundTowardZero, kfloat);
    }
    
    // ... run the tests ...
    
    // Restore the rounding mode saved above.
    if (isRTZ) set_round(oldMode, kfloat);
}

支持的舍入模式：

RTE (Round to Nearest Even) - 默认
RTZ (Round Toward Zero)
RTP (Round Toward Positive Infinity)
RTN (Round Toward Negative Infinity)

3.2.5 特殊值测试

/* Special single-precision values to exercise: signed zeros, infinities,
 * NaN, and the limits from <float.h>. */
float special_values[] = {
    /* zeros */
    0.0f,        /* positive zero */
    -0.0f,       /* negative zero */
    /* non-finite values */
    INFINITY,    /* positive infinity */
    -INFINITY,   /* negative infinity */
    NAN,         /* not a number */
    /* limits */
    FLT_MIN,     /* smallest positive normalized value */
    FLT_MAX,     /* largest finite value */
    FLT_EPSILON  /* gap between 1.0f and the next representable float */
};

3.3 整数运算测试

3.3.1 intmath_int/int2/int4 - 32位整数

测试内核

// Exercises the 32-bit integer operators on one pair of elements per
// work-item. Only the sum, difference and product feed the stored result;
// the remaining operations are evaluated for illustration only.
__kernel void test_intmath_int(__global int *srcA,
                               __global int *srcB,
                               __global int *dst)
{
    const int gid = get_global_id(0);
    const int a = srcA[gid];
    const int b = srcB[gid];

    // Arithmetic operators.
    int add = a + b;
    int sub = a - b;
    int mul = a * b;
    int div = a / b;  // integer division
    int mod = a % b;  // remainder

    // Bitwise operators.
    int and_op = a & b;
    int or_op  = a | b;
    int xor_op = a ^ b;
    int not_op = ~a;

    // Shift operators.
    int shl = a << b;
    int shr = a >> b;

    dst[gid] = add + sub + mul;  // combined result
}

整数溢出行为

// The article presents plain integer addition as wrapping around on overflow.
// NOTE(review): C99 treats signed overflow as undefined; confirm the
// wrap-around guarantee against the OpenCL C specification.
const int a = INT_MAX;
const int b = 1;
const int c = a + b;  // c == INT_MIN according to the article (wrap-around)

// Saturating arithmetic needs the add_sat built-in.
const int d = add_sat(a, b);  // d == INT_MAX (clamped to the maximum)

3.3.2 intmath_long/long2/long4 - 64位整数

64位整数测试类似，但在嵌入式配置（EMBEDDED_PROFILE）的设备上 64 位整数是可选特性，因此需要先检查设备支持：

int test_intmath_long(...)
{
    // 检查 64 位整数支持
    char extensions[4096];
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 
                    sizeof(extensions), extensions, NULL);
    
    if (!strstr(extensions, "cl_khr_int64")) {
        log_info("Device does not support cl_khr_int64, skipping test\n");
        return TEST_SKIP;
    }
    
    // ... 执行测试 ...
}

64位运算示例：

// 64-bit integer arithmetic: each work-item stores
// a * b + (a / b) - (a % b) for one pair of elements.
__kernel void test_intmath_long(__global long *srcA,
                                __global long *srcB,
                                __global long *dst)
{
    const int gid = get_global_id(0);

    const long a = srcA[gid];
    const long b = srcB[gid];

    // Named intermediates, combined in the same left-to-right order.
    const long product = a * b;
    const long quotient = a / b;
    const long remainder = a % b;

    dst[gid] = product + quotient - remainder;
}

3.3.3 整数向量运算

// int4 arithmetic: every operator below acts component by component.
__kernel void test_intmath_int4(__global int4 *srcA,
                                __global int4 *srcB,
                                __global int4 *dst)
{
    const int gid = get_global_id(0);

    int4 acc = srcA[gid] + srcB[gid];  // component-wise sum
    acc = acc * (int4)(2);             // every component multiplied by 2
    acc = acc >> (int4)(1, 2, 3, 4);   // a different shift count per component

    dst[gid] = acc;
}

3.4 控制流测试

3.4.1 if - 条件分支

测试内核

// Exercises three conditional forms in sequence. Each form writes dst[gid],
// so the value left behind is the one written by the final chain.
__kernel void test_if(__global int *src, __global int *dst)
{
    const int gid = get_global_id(0);
    const int v = src[gid];

    // Plain if/else.
    if (v <= 0) {
        dst[gid] = v / 2;
    } else {
        dst[gid] = v * 2;
    }

    // Nested if: only values above 100 are affected.
    if (v > 100) {
        if (v >= 200) {
            dst[gid] = 200;
        } else {
            dst[gid] = 100;
        }
    }

    // if / else-if / else chain on the sign of the value.
    if (v == 0) {
        dst[gid] = 0;
    } else if (v < 0) {
        dst[gid] = -1;
    } else {
        dst[gid] = 1;
    }
}

分支发散（Branch Divergence）

// Neighbouring work-items of a work-group take different branches,
// selected by the parity of their local id.
__kernel void test_divergent_branch(__global int *src,
                                    __global int *dst)
{
    const int gid = get_global_id(0);
    const int local_index = get_local_id(0);

    if (local_index % 2 != 0) {
        // Odd local ids: path B.
        dst[gid] = src[gid] + 10;
    } else {
        // Even local ids: path A.
        dst[gid] = src[gid] * 2;
    }
}

性能注意事项：

同一 wavefront/warp 内的分支发散会降低性能
尽量避免基于线程 ID 的条件分支
使用 select() 函数替代简单分支

3.4.2 loop - 循环结构

各种循环类型

// Exercises counted, conditional, post-tested and nested loops, folding
// everything into one accumulator that is stored at the end.
__kernel void test_loop(__global int *src, __global int *dst)
{
    const int gid = get_global_id(0);
    int acc = 0;

    // Add the ten consecutive elements src[gid] .. src[gid + 9].
    int k = 0;
    while (k < 10) {
        acc += src[gid + k];
        k++;
    }

    // Add 0 + 1 + 2 + 3 + 4.
    for (int count = 0; count < 5; count++) {
        acc += count;
    }

    // Double the accumulator three times (post-tested loop).
    int remaining = 3;
    do {
        acc *= 2;
        remaining--;
    } while (remaining > 0);

    // Add row * col over a 4 x 4 index grid.
    for (int row = 0; row < 4; row++) {
        for (int col = 0; col < 4; col++) {
            acc += row * col;
        }
    }

    dst[gid] = acc;
}

循环展开

// Sums the sixteen elements src[gid] .. src[gid + 15]; the pragma asks the
// compiler to unroll the loop by a factor of four.
__kernel void test_loop_unroll(__global int *src,
                               __global int *dst)
{
    const int gid = get_global_id(0);
    int total = 0;

    // Compiler hint only: the loop computes the same sum either way.
    #pragma unroll 4
    for (int offset = 0; offset < 16; offset++) {
        total += src[gid + offset];
    }

    dst[gid] = total;
}

循环边界条件

// Each work-item whose id is below n sums src[id], src[id + G],
// src[id + 2G], ... (G = global size) while the index stays below n, and
// stores the sum in dst[id]. Work-items with id >= n do nothing.
__kernel void test_loop_boundary(__global int *src,
                                 __global int *dst,
                                 int n)
{
    const int gid = get_global_id(0);

    // Out-of-range work-items must not touch memory.
    if (gid >= n) {
        return;
    }

    const size_t stride = get_global_size(0);
    int total = 0;
    for (int idx = gid; idx < n; idx += stride) {
        total += src[idx];
    }

    dst[gid] = total;
}

3.5 内存模型测试

3.5.1 local_arg_def - 局部内存参数定义

测试目标

验证局部内存（__local）作为内核参数的声明和使用。

测试内核

// Stages one element per work-item into local memory passed in as a kernel
// argument; after a barrier every work-item sums the whole work-group's
// staged values.
__kernel void test_local_arg(__global int *input,
                             __global int *output,
                             __local int *scratch)  // local-memory argument
{
    const int gid = get_global_id(0);
    const int local_index = get_local_id(0);
    const int group_size = get_local_size(0);

    // Copy this work-item's element from global to local memory.
    scratch[local_index] = input[gid];

    // Every store to scratch must be visible before any work-item reads it.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Sum the group_size staged values.
    int total = 0;
    for (int slot = 0; slot < group_size; slot++) {
        total += scratch[slot];
    }

    output[gid] = total;
}

主机端设置局部内存参数

// Launch geometry: 1024 work-items in work-groups of 256.
size_t global_size = 1024;
size_t local_size = 256;

// The local buffer holds one cl_int per work-item of a work-group.
size_t local_mem_size = local_size * sizeof(cl_int);

// Bind the arguments. A local-memory argument is given a size and a NULL
// pointer.
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer);
clSetKernelArg(kernel, 2, local_mem_size, NULL);

// Enqueue the one-dimensional launch.
clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                       &global_size, &local_size,
                       0, NULL, NULL);

3.5.2 local_kernel_def - 局部内存内核定义

局部内存在内核内部静态声明：

// Stages one element per work-item into a statically declared local array;
// after a barrier each work-item adds its own staged value and its
// neighbour's.
//
// The work-group size must not exceed 256, the capacity of `scratch`.
__kernel void test_local_kernel_def(__global int *input,
                                    __global int *output)
{
    // Local memory declared inside the kernel, sized at compile time.
    __local int scratch[256];

    int tid = get_global_id(0);
    int lid = get_local_id(0);

    // Stage this work-item's element.
    scratch[lid] = input[tid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Wrap at the actual work-group size rather than the array capacity:
    // with fewer than 256 work-items, slots past the group size are never
    // written, so wrapping at 256 would read uninitialised local memory.
    output[tid] = scratch[lid] + scratch[(lid + 1) % get_local_size(0)];
}

局部内存大小限制：

// 查询设备局部内存大小
cl_ulong local_mem_size;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, 
                sizeof(local_mem_size), &local_mem_size, NULL);

log_info("Device local memory size: %llu bytes\n", local_mem_size);

// 查询内核使用的局部内存
size_t kernel_local_mem;
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
                         sizeof(kernel_local_mem), &kernel_local_mem, NULL);

3.5.3 local_kernel_scope - 局部内存作用域

验证局部内存的作用域规则：

// Each work-item stores its local id in a local array, then reads the slot
// written by the next work-item of the same work-group (wrapping at the
// work-group size).
__kernel void test_local_scope(__global int *output)
{
    __local int shared_data[128];

    const int local_index = get_local_id(0);
    const int global_index = get_global_id(0);
    const size_t group_size = get_local_size(0);

    // Every work-group operates on its own instance of shared_data.
    shared_data[local_index] = local_index;
    barrier(CLK_LOCAL_MEM_FENCE);

    // The neighbour's value is visible inside the work-group only.
    output[global_index] = shared_data[(local_index + 1) % group_size];
}

作用域测试要点：

局部内存在工作组（work-group）间隔离
同一工作组内所有工作项共享
生命周期：工作组执行期间

3.5.4 constant - 常量内存

常量内存用于只读数据：

// Program-scope table in the constant address space: the values 1 to 16.
__constant float coefficients[16] = {
     1.0f,  2.0f,  3.0f,  4.0f,
     5.0f,  6.0f,  7.0f,  8.0f,
     9.0f, 10.0f, 11.0f, 12.0f,
    13.0f, 14.0f, 15.0f, 16.0f
};

// Each work-item accumulates input[gid] * coefficients[k] for k = 0..15,
// in index order, and stores the total.
__kernel void test_constant(__global float *input,
                            __global float *output)
{
    const int gid = get_global_id(0);
    const float x = input[gid];

    float total = 0.0f;
    for (int k = 0; k < 16; k++) {
        total += x * coefficients[k];
    }

    output[gid] = total;
}

常量内存作为参数：

// Scales each element by a coefficient read through a __constant pointer
// argument; the coefficient index cycles through 0..15.
__kernel void test_constant_arg(__global float *input,
                                __constant float *coeffs,
                                __global float *output)
{
    const int gid = get_global_id(0);
    const float factor = coeffs[gid % 16];

    output[gid] = input[gid] * factor;
}

主机端创建常量缓冲区：

// Read-only buffer of 16 floats, initialised at creation time with a copy
// of host_coeffs (CL_MEM_COPY_HOST_PTR).
cl_mem const_buffer = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(float) * 16,
    host_coeffs,
    &err);

3.5.5 constant_source - 常量内存源代码

测试编译时常量和运行时常量的区别：

// Constant table defined in the kernel source itself.
__constant int compile_time_const[4] = {1, 2, 3, 4};

// Adds one entry of the in-source table to the matching entry of a table
// supplied by the host at run time; the index cycles through 0..3.
__kernel void test_constant_source(__global int *output,
                                   __constant int *runtime_const)
{
    const int gid = get_global_id(0);
    const int slot = gid % 4;

    const int from_source = compile_time_const[slot];  // defined in source
    const int from_host = runtime_const[slot];         // passed by the host

    output[gid] = from_source + from_host;
}

3.6 类型操作测试

3.6.1 sizeof - 类型大小

验证 OpenCL C 类型的大小符合规范：

// Records the size in bytes of a selection of OpenCL C types in
// output[0..10]. Every work-item writes the same values.
__kernel void test_sizeof(__global int *output)
{
    int tid = get_global_id(0);

    // Scalar types.
    output[0] = (int)sizeof(char);     // 1
    output[1] = (int)sizeof(short);    // 2
    output[2] = (int)sizeof(int);      // 4
    output[3] = (int)sizeof(long);     // 8
    output[4] = (int)sizeof(float);    // 4
    output[5] = (int)sizeof(double);   // 8

    // Vector types.
    output[6] = (int)sizeof(int2);     // 8
    output[7] = (int)sizeof(int4);     // 16
    output[8] = (int)sizeof(int8);     // 32
    output[9] = (int)sizeof(int16);    // 64

    // Pointer type: 8 on 64-bit devices, 4 on 32-bit devices.
    output[10] = (int)sizeof(void*);
}

OpenCL C 类型大小规范：

类型	大小（字节）	对齐
char	1	1
short	2	2
int	4	4
long	8	8
float	4	4
double	8	8
half	2	2

3.6.2 pointer_cast - 指针转换

测试同一地址空间内不同类型指针之间的转换（将同一块内存按另一种类型重新解释）：

// Reads four consecutive bytes of src as one uint through a cast pointer
// and stores it in dst. The same word is also assembled byte by byte
// (lowest byte first) for comparison; that value is not stored.
__kernel void test_pointer_cast(__global uchar *src,
                                __global uint *dst)
{
    const int gid = get_global_id(0);

    // Reinterpret the four bytes starting at src[gid * 4] as a uint.
    __global uchar *bytes = &src[gid * 4];
    __global uint *word = (__global uint *)bytes;

    dst[gid] = *word;

    // Explicit assembly of the same four bytes.
    uint assembled = (uint)bytes[0];
    assembled |= (uint)bytes[1] << 8;
    assembled |= (uint)bytes[2] << 16;
    assembled |= (uint)bytes[3] << 24;
}

不同地址空间之间的数据传递（通过赋值拷贝数据，而不是转换指针）：

// Moves one value per work-item from global memory into a private
// variable, then into local memory, and back to global memory.
__kernel void test_address_space_cast(__global int *g_ptr,
                                      __local int *l_ptr)
{
    const int gid = get_global_id(0);
    const int local_index = get_local_id(0);

    // global -> private: a plain load into a local variable.
    const int staged = g_ptr[gid];

    // private -> local: a plain store.
    l_ptr[local_index] = staged;

    barrier(CLK_LOCAL_MEM_FENCE);

    // local -> global.
    g_ptr[gid] = l_ptr[local_index];
}

3.6.3 hiloeo - 高低位和奇偶位操作

测试向量分量访问和重组：

// Splits one int4 per work-item with the .hi/.lo/.even/.odd selectors and
// stores the component sum of each half in four consecutive output slots.
__kernel void test_hiloeo(__global int4 *input,
                          __global int *output)
{
    const int gid = get_global_id(0);
    const int4 v = input[gid];

    // Upper and lower halves.
    const int2 upper = v.hi;    // {v.z, v.w}
    const int2 lower = v.lo;    // {v.x, v.y}

    // Even- and odd-indexed components.
    const int2 evens = v.even;  // {v.x, v.z}
    const int2 odds  = v.odd;   // {v.y, v.w}

    // This work-item owns output[gid * 4] .. output[gid * 4 + 3].
    __global int *out = output + gid * 4;
    out[0] = upper.x + upper.y;
    out[1] = lower.x + lower.y;
    out[2] = evens.x + evens.y;
    out[3] = odds.x + odds.y;
}

向量访问方式：

int4 v = (int4)(10, 20, 30, 40);

// Numeric component selectors.
v.s0 = 100;  // v is now (100, 20, 30, 40)
v.s1 = 200;  // v is now (100, 200, 30, 40)

// x/y/z/w selectors (vectors of up to four components).
v.x = 1;
v.y = 2;
v.z = 3;
v.w = 4;     // v is now (1, 2, 3, 4)

// Swizzles.
int2 v2 = v.xy;    // (1, 2)
int2 v3 = v.zw;    // (3, 4)
int4 v4 = v.wzyx;  // (4, 3, 2, 1), components reversed

// Upper and lower halves.
int2 hi = v.hi;    // (3, 4)
int2 lo = v.lo;    // (1, 2)

// Even- and odd-indexed components.
int2 even = v.even;  // (1, 3)
int2 odd  = v.odd;   // (2, 4)

3.7 图像基础测试

3.7.1 readimage - 2D图像读取

测试内核

// Reads one texel per work-item from a 2D image using normalized
// coordinates and stores it in a linear output array.
//
// src_img  Image to read.
// sampler  Must be created with normalized coordinates enabled.
// output   One float4 per texel, in row-major order.
__kernel void test_readimage(__read_only image2d_t src_img,
                             sampler_t sampler,
                             __global float4 *output)
{
    const int tid = get_global_id(0);
    const int width = get_image_width(src_img);
    const int height = get_image_height(src_img);

    // Recover the texel position from the linear work-item index.
    const int x = tid % width;
    const int y = tid / width;

    // Normalize each axis by its own extent (y by the height, not the
    // width) and aim at the texel centre, so that rounding in the division
    // cannot select the neighbouring texel.
    const float2 coord = (float2)(((float)x + 0.5f) / (float)width,
                                  ((float)y + 0.5f) / (float)height);

    output[tid] = read_imagef(src_img, sampler, coord);
}

创建测试图像

// Image description: a 512 x 512 2D image; every other field stays zero.
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = 512;
desc.image_height = 512;

// Image format: four channels (RGBA) of normalized unsigned 8-bit values.
cl_image_format format;
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_UNORM_INT8;

// Read-only image initialised with a copy of host_image_data.
cl_mem image = clCreateImage(context,
                             CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             &format, &desc, host_image_data, &err);

// 创建采样器
cl_sampler sampler = clCreateSampler(context, 
                                     CL_FALSE,                // normalized_coords
                                     CL_ADDRESS_CLAMP_TO_EDGE, // addressing_mode
                                     CL_FILTER_NEAREST,       // filter_mode
                                     &err);

3.7.2 readimage_int16 / readimage_fp32 - 不同数据类型

测试不同图像数据类型的读取：

// 16-bit integer image: reads one texel per work-item with read_imagei,
// addressing it by integer pixel coordinates.
__kernel void test_readimage_int16(__read_only image2d_t src_img,
                                   sampler_t sampler,
                                   __global int4 *output)
{
    const int gid = get_global_id(0);
    const int width = get_image_width(src_img);

    // Column and row derived from the linear work-item index.
    const int2 pos = (int2)(gid % width, gid / width);

    // Read the texel as signed integers.
    output[gid] = read_imagei(src_img, sampler, pos);
}

// 32-bit float image: reads one texel per work-item with read_imagef,
// addressing it by integer pixel coordinates.
__kernel void test_readimage_fp32(__read_only image2d_t src_img,
                                  sampler_t sampler,
                                  __global float4 *output)
{
    const int gid = get_global_id(0);
    const int width = get_image_width(src_img);

    // Column and row derived from the linear work-item index.
    const int2 pos = (int2)(gid % width, gid / width);

    // Read the texel as floats.
    output[gid] = read_imagef(src_img, sampler, pos);
}

3.7.3 writeimage - 2D图像写入

// Converts one texel per work-item to greyscale: reads it from src_img,
// replaces the colour channels by a weighted sum of them, keeps the fourth
// channel, and writes the result to the same position of dst_img.
__kernel void test_writeimage(__read_only image2d_t src_img,
                              __write_only image2d_t dst_img,
                              sampler_t sampler)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int2 pos = (int2)(col, row);

    // Source texel.
    const float4 texel = read_imagef(src_img, sampler, pos);

    // Weighted sum of the first three channels.
    const float luma = 0.299f * texel.x + 0.587f * texel.y + 0.114f * texel.z;

    // Destination texel: grey in the colour channels, fourth channel kept.
    write_imagef(dst_img, pos, (float4)(luma, luma, luma, texel.w));
}

3.7.4 mri_one / mri_multiple - 多图像读取

测试同时读取多个图像：

// Reads the same position from three images and writes the average of the
// three texels to the output image.
__kernel void test_mri_multiple(__read_only image2d_t img1,
                                __read_only image2d_t img2,
                                __read_only image2d_t img3,
                                sampler_t sampler,
                                __write_only image2d_t output)
{
    const int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // One read per source image, all with the same sampler.
    const float4 first = read_imagef(img1, sampler, pos);
    const float4 second = read_imagef(img2, sampler, pos);
    const float4 third = read_imagef(img3, sampler, pos);

    // Component-wise mean of the three texels.
    const float4 mean = (first + second + third) / 3.0f;

    write_imagef(output, pos, mean);
}

3.7.5 image_r8 - R8格式图像

测试单通道8位图像：

// Reads one texel per work-item from a single-channel 8-bit image and
// stores its value as a byte in a row-major output array.
//
// src_img  Image to read; only its first channel is used.
// sampler  Sampler used with integer pixel coordinates.
// output   One uchar per texel, width * height entries.
__kernel void test_image_r8(__read_only image2d_t src_img,
                            sampler_t sampler,
                            __global uchar *output)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int2 coord = (int2)(x, y);

    // read_imagef returns the channel as a float in [0, 1].
    float4 pixel = read_imagef(src_img, sampler, coord);

    // Scale back to 0..255 with rounding and saturation. A plain (uchar)
    // cast truncates, so a product that lands just below an integer would
    // come out one lower than the stored texel value.
    output[y * get_image_width(src_img) + x] =
        convert_uchar_sat_rte(pixel.x * 255.0f);
}