Ray Tracing in One Weekend with CUDA_oneweekgpuraytracing-CSDN博客

本文链接：https://blog.csdn.net/LegQQQ/article/details/144088426

CUDA 并行

要做什么

将 Ray Tracing in One Weekend 中实现的程序用 CUDA 做并行化（作为课程大作业）

对每个像素点做采样。

// Renders one pixel per thread and writes it as 8-bit RGB into `buffer`.
//
// Launch layout: 2D grid of 2D blocks; thread (i, j) shades pixel column i, row j.
// The grid may overshoot the image: threads with i >= max_x or j >= max_y write nothing.
// Shared memory: one statically allocated camera_to_renderer_info per block.
//
// Parameters:
//   buffer        - output, 3 bytes (R, G, B) per pixel, indexed by (j * max_x + i) * 3.
//   max_x, max_y  - image width and height in pixels.
//   renderer_info - camera data (pixel00_loc, pixel_delta_u, pixel_delta_v, center).
//   world         - scene handed to ray_color_sample_multiple.
//                   NOTE(review): a reference kernel parameter must refer to device-accessible
//                   memory — confirm the caller passes a dereferenced device pointer.
//   d_state       - one curandState per pixel, indexed by j * max_x + i; read at entry and
//                   written back at exit so the sequence continues across launches.
__global__ void render(unsigned char *buffer, int max_x, int max_y, camera_to_renderer_info renderer_info,Hittable_list& world,curandState* d_state) {
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;

    // Stage the camera parameters in shared memory; the block's first thread does the copy.
    __shared__ camera_to_renderer_info local_renderer_info;
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        local_renderer_info = renderer_info;
    }
    // Every thread of the block has to reach this barrier, so the out-of-range
    // early exit must come AFTER it: returning before __syncthreads() in only part
    // of a block is undefined behaviour and can hang the kernel.
    __syncthreads();

    if (i >= max_x || j >= max_y) return;

    // Flat pixel index, shared by the RNG state array and the output buffer.
    const int pixel_index = j * max_x + i;
    curandState local_state = d_state[pixel_index];

    // Centre of this thread's pixel, derived from the thread coordinates.
    point3 pixel_center = local_renderer_info.pixel00_loc
                        + (i * local_renderer_info.pixel_delta_u)
                        + (j * local_renderer_info.pixel_delta_v);

    // Primary ray from the camera centre through the pixel centre.
    vec3 ray_dir = pixel_center - local_renderer_info.center;
    Ray r(local_renderer_info.center, ray_dir);

    // Accumulate the samples, then average them.
    const int multisample = 100;  // samples per pixel
    const int max_depth = 20;     // depth argument forwarded to ray_color_sample_multiple
    color pixel_color{ 0,0,0 };
    for (int k = 0; k < multisample; ++k) {
        pixel_color += ray_color_sample_multiple(r, world, max_depth, local_state);
    }
    pixel_color /= multisample;

    // Gamma correction, per channel.
    pixel_color.e[0] = linear_to_gamma(pixel_color.x());
    pixel_color.e[1] = linear_to_gamma(pixel_color.y());
    pixel_color.e[2] = linear_to_gamma(pixel_color.z());

    // Clamp each channel and quantise it to a byte.
    // NOTE(review): the upper clamp of 0.95 caps every channel at 243 — confirm this is intended.
    Interval interv{ 0.0f,0.95f };
    // static_cast: `unsigned char(expr)` is not a valid functional cast in standard C++.
    const unsigned char uc_r = static_cast<unsigned char>(255.999 * interv.clamp(pixel_color.x()));
    const unsigned char uc_g = static_cast<unsigned char>(255.999 * interv.clamp(pixel_color.y()));
    const unsigned char uc_b = static_cast<unsigned char>(255.999 * interv.clamp(pixel_color.z()));

    buffer[pixel_index*3 + 0] = uc_r;
    buffer[pixel_index*3 + 1] = uc_g;
    buffer[pixel_index*3 + 2] = uc_b;

    // Persist the advanced RNG state for this pixel.
    d_state[pixel_index] = local_state;
}

CUDA虚函数实现

为什么是虚函数

假设我们已经开始对像素点采样，已经生成了Ray类型的ray对象，下一步要怎么做？

把要渲染的物体收集起来
对这些物体逐个对光线进行相交测试，返回结果并处理
最终得到这个像素的颜色
对于需要做相交测试的物体，可以让他们基于一个基类 Hittable类派生而来，从而建立存储结构。在这个类上再派生出Sphere类或者 Cube类
在Hittable类上声明一个虚函数Hit(···,const Ray& ray,···)传入光线进行相交测试，在Sphere上再实现Sphere的Hit()
同时我建立了一个名为Hittable_list的Hittable*数组，存了一些Sphere*对象指针，指向我的Sphere对象。
在我计算光追的时候，我只需要先准备ray，逐个调用 Hittable* 的Hit函数就可以完成对不同类的相交测试。

问题在哪？

这个问题困扰了我有点久。搜索解决方法的时候还找到了同届同校隔壁班竞赛佬的文章，可惜代码读不太明白，唉，智商。

google一下

"It is not allowed to pass as an argument to a global function an object of a class with virtual functions. "
The reason is that if you instantiate the object on the host, then the virtual function table gets populated with host pointers. When you copy this object to the device, these host-pointers become meaningless.
https://forums.developer.nvidia.com/t/can-cuda-properly-handle-pure-virtual-classes/37588/4

讲的比较清楚，简单来说，正常在host端实例化对象的话，虚函数表的指针指向的是host的内存，在device端处理，首先需要将对象copy到device的内存中，但是此时虚函数表的指针指向的地址仍然是host端的内存，所以此时无法正确调用虚函数。

如何解决？

让对象直接在device的内存上创建。即在__device__/__global__函数内创建
比如要对Hittable_list类型的对象添加一个sphere
先分配Hittable_list类的内存

    // Pointer that will hold the device address of the Hittable_list storage.
    Hittable_list* d_world;
    // Reserve sizeof(Hittable_list) bytes of device memory; the returned cudaError_t is not checked here.
    // NOTE(review): cudaMalloc only reserves bytes and runs no constructor — confirm the
    // list's fields are initialized on the device before they are read.
    cudaMalloc(&d_world, sizeof(Hittable_list));

定义一个__global__函数

// Adds one sphere to the device-resident list by calling d_world->DeviceAddSphere.
// Only the thread with threadIdx.x == 0 in the block with blockIdx.x == 0 does the
// work, so the sphere is added once however many threads are launched.
__global__ void add_sphere_to_list(Hittable_list* d_world, point3 center, float radius) {
    const bool is_first_thread = (blockIdx.x == 0) && (threadIdx.x == 0);
    if (!is_first_thread) {
        return;
    }
    d_world->DeviceAddSphere(center, radius);
}

其中的 DeviceAddSphere 会 new 一个 Sphere，这个对象也会存放在 Global Memory 中（设备端的堆上），听上去和 C++ 堆的概念有点像。

// Appends a Sphere(center, radius), created with device-side `new`, to this list.
//
// The list is left unchanged, and a message is printed, when it already holds
// max_objects entries or when the allocation yields a null pointer.
__device__ void DeviceAddSphere(const point3& center, float radius){
    if (num_objects >= max_objects) {
        printf("Hittable_list is full, cannot add more objects.\n");
        return;
    }

    Sphere* sphere = new Sphere(center, radius);
    // Device-side allocations can fail; never record a null entry, because later
    // hit tests would dereference it.
    if (sphere == nullptr) {
        printf("Hittable_list: failed to allocate a Sphere on the device.\n");
        return;
    }

    objects[num_objects++] = sphere;
}

这样在主函数中

    // Create the world in GPU global memory.
    Hittable_list* d_world;
    // Reserve device memory for the list; the returned cudaError_t is not checked here.
    cudaMalloc(&d_world, sizeof(Hittable_list));

    vec3 center = ...;
    float radius = ...;
    // Launch the kernel with a single thread so the sphere is constructed on the device.
    // NOTE(review): no cudaGetLastError() follows the launch, so a launch failure would go unnoticed.
    add_sphere_to_list<<<1, 1>>>(d_world,center,radius);
    // init_hittable_list<<<1, 1>>>(d_world);
    // Block the host until the kernel has finished; the returned status is not checked.
    cudaDeviceSynchronize();

这样就完成了在device的内存上创建sphere实例，并且将指向这个sphere的指针存入到d_world当中

拓展一下 From ChatGPT

GPU 全局内存和 C++ 堆内存都是动态分配内存的方式，但它们位于不同的内存空间，分别用于 GPU 设备内存和主机内存。
GPU 全局内存适合用于存储大规模的并行计算数据，但由于其访问速度较慢，优化GPU 计算时需要尽量减少对全局内存的访问。

全局内存访问太慢了怎么办？

先写到这歇歇

// Constructs a Sphere(center, radius) with device-side `new` and stores its address
// in *hittable. Only thread 0 of block 0 (x dimension) performs the allocation.
__global__ void create_function(
    Hittable** hittable,
    point3 center,
    float radius
    )
{
    const bool is_first_thread = (blockIdx.x == 0) && (threadIdx.x == 0);
    if (!is_first_thread)
    {
        return;
    }

    Sphere* created = new Sphere(center, radius);
    *hittable = created;
}

// Destroys the object that *hittable points to with device-side `delete`, then
// clears the slot. Only thread 0 of block 0 (x dimension) does the work.
//
// Nulling the slot means a repeated launch on the same slot deletes a null
// pointer (a no-op) instead of freeing the same object twice.
__global__ void delete_function(Hittable** hittable)
{
    if (threadIdx.x != 0 || blockIdx.x != 0)
    {
        return;
    }

    delete *hittable;
    *hittable = nullptr;
}

__device__ color ray_color_sample_test(const Ray& r, Hittable const* const* __restrict__ hittables, int depth)
{

    if ((*hittables)->hit(r, ray_t, rec)) 
    ...