cuda cudaMallocManaged 测试

最新推荐文章于 2024-10-11 07:04:43 发布

后来居上_m

最新推荐文章于 2024-10-11 07:04:43 发布

阅读量395

点赞数

文章标签：经验分享 linux c语言

本文链接：https://blog.youkuaiyun.com/weixin_45206081/article/details/132216175

版权

文章比较了在CUDA编程中使用不同内存管理方法（如cudaMallocManaged、cudaMallocHost和cudaHostAlloc）对cufft计算性能的影响，以及展示了如何创建和同步CUDA事件来测量GPU时间。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

下面展示一些内联代码片段。

// 开辟内存的不同方法

   // Snippet: alternative ways of allocating the host-side DBF_Result buffer.
   // Exactly one allocation call is active; the others are kept commented out
   // so they can be swapped in for comparison.
   // NOTE(review): M and N are not defined in this snippet — presumably the
   // buffer dimensions; confirm against the surrounding program.
   // NOTE(review): the return value of the allocation call is not checked here.
   cufftComplex* DBF_Result = NULL; // pointer to cufftComplex elements, filled in by the allocation below
   // cudaMallocHost((void**)&DBF_Result, M * N * sizeof(cufftComplex)); // alternative: allocate with cudaMallocHost
    //cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocMapped);   // alternative: page-locked memory with the mapped flag
   // cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocDefault); // alternative: default flag
    // Active variant: cudaHostAlloc with the write-combined flag.
    // NOTE(review): the CUDA documentation describes write-combined memory as
    // slow for host reads — confirm this suits how DBF_Result is consumed.
    cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocWriteCombined);
   // cudaHostAlloc((void**)&DBF_Result, M * N * sizeof(cufftComplex), cudaHostAllocPortable); // alternative: portable flag

//1. 开辟内存的不同方法cudaMallocManaged 测试

// 开辟内存的不同方法cudaMallocManaged

#include <stdio.h>
#include <windows.h>
#include"iostream"
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
#include"cufft.h"
#include <stdio.h>
#include <windows.h>

// Error-checking helpers for this example. Both macros print a diagnostic to
// stderr and make the enclosing function (main) return 1 on failure, so they
// must only be used inside a function returning int.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t cudaStatus_ = (call);                                     \
        if (cudaStatus_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,          \
                    cudaGetErrorString(cudaStatus_));                         \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// cuFFT has its own status type and no error-string helper, so the numeric
// cufftResult code is printed instead.
#define CHECK_CUFFT(call)                                                     \
    do {                                                                      \
        cufftResult cufftStatus_ = (call);                                    \
        if (cufftStatus_ != CUFFT_SUCCESS) {                                  \
            fprintf(stderr, "cuFFT error at line %d: code %d\n", __LINE__,    \
                    (int)cufftStatus_);                                       \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// cudaMallocManaged test.
//
// Allocates the FFT input and output buffers with cudaMallocManaged, uses a
// pair of CUDA events to time the allocation step, fills the input on the
// host, and runs a forward 1D complex-to-complex cuFFT on the managed buffers
// `loop` times.
//
// Output: one line "GPU Time = <ms> ms." with the event-measured time of the
// allocation step.
// Returns 0 on success, 1 if any CUDA runtime or cuFFT call fails.
int main()
{
    const int Nt = 1024 * 1024;   // FFT length in complex samples
    const int BATCH = 1;          // number of transforms per plan execution
    const int loop = 10;          // how many times the forward FFT is executed
    const size_t bytes = static_cast<size_t>(Nt) * sizeof(cufftComplex);

    // Despite the "host_" prefix, both buffers are managed allocations that are
    // written on the host and passed directly to cuFFT.
    cufftComplex* host_in = NULL;
    cufftComplex* host_out = NULL;

    // Create two events that bracket the allocation step.
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start));

    CHECK_CUDA(cudaMallocManaged((void**)&host_in, bytes));
    CHECK_CUDA(cudaMallocManaged((void**)&host_out, bytes));
    CHECK_CUDA(cudaDeviceSynchronize());

    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));   // wait until the stop event has completed

    float elapsed_ms = 0.0f;
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
    CHECK_CUDA(cudaEventDestroy(start));      // clean up the two events
    CHECK_CUDA(cudaEventDestroy(stop));
    printf("GPU Time = %g ms.\n", elapsed_ms);

    // Fill the input with the real ramp 1, 2, ..., Nt (imaginary part zero).
    for (int i = 0; i < Nt; i++)
    {
        host_in[i].x = static_cast<float>(i + 1);
        host_in[i].y = 0.0f;
    }

    cufftHandle cufftForwrdHandle;
    CHECK_CUFFT(cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH));

    for (int m = 0; m < loop; m++) {
        // Execute the forward FFT (out-of-place: host_in -> host_out).
        CHECK_CUFFT(cufftExecC2C(cufftForwrdHandle, host_in, host_out, CUFFT_FORWARD));
    }
    // Wait for the queued transforms to finish (and surface any asynchronous
    // execution error) before the buffers are touched or released on the host.
    CHECK_CUDA(cudaDeviceSynchronize());

    // To inspect the first few results, print them here, e.g.:
    //   for (int i = 0; i < 4; i++) printf("%f+%f\n", host_out[i].x, host_out[i].y);

    // Release the plan and the managed buffers.
    CHECK_CUFFT(cufftDestroy(cufftForwrdHandle));
    CHECK_CUDA(cudaFree(host_in));
    CHECK_CUDA(cudaFree(host_out));

    return 0;
}

#undef CHECK_CUDA
#undef CHECK_CUFFT

// 2.开辟内存的不同方法cudaMallocHost

// 开辟内存的不同方法cudaMallocHost

 int main()
{
    const int Nt = 1024*1024;
    const int BATCH = 1;

    cufftComplex* host_in, * host_out, * device_in, * device_out;
    cudaMallocHost((void**)&host_in, Nt * sizeof(cufftComplex));
    cudaMallocHost((void**)&host_out, ( Nt+1) * sizeof(cufftComplex));

    for (int i = 0; i < Nt; i++)
    {
        host_in[i].x = i + 1;
        host_in[i].y = 0;

    }

    // create two events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    //设备内存申请
    cudaMalloc((void**)&device_in, (Nt)  * sizeof(cufftComplex));
    cudaMalloc((void**)&device_out, (Nt) * sizeof(cufftComplex));
    //数据传输--H2D
    cudaMemcpy(device_in, host_in, Nt * sizeof(cufftComplex), cudaMemcpyHostToDevice);

     cudaEventRecord(stop);
     cudaEventSynchronize(stop);         // 等到stop event完成

     float time;
     cudaEventElapsedTime(&time, start, stop);
     cudaEventDestroy(start);// clean up the two events
     cudaEventDestroy(stop);
     printf("GPU Time = %g ms.\n", time);

    cufftHandle cufftForwrdHandle;
    {

        cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH);

        cufftExecC2C(cufftForwrdHandle, device_in, device_out, CUFFT_FORWARD);

        cudaMemcpy(host_out, device_out, Nt * sizeof(cufftComplex), cudaMemcpyDeviceToHost);


        for (int i = 0; i < 10; i++) {
            printf("%f+%f\n", host_out[i].x, host_out[i].y);
        }
    }


    return 0;