I previously wrote an article on optimizing CUDA calls from multiple C++ threads, but applying that approach in a real project still did not work. One of the libraries involved is external, so its implementation is invisible; my suspicion is that it performs a large number of cudaMemcpy calls internally, which would explain why the optimization showed so little effect. This article therefore works through the case where a library contains cudaMemcpy, written up as I experiment.
1. Simulating cudaMemcpy inside a library
1.1. Code
# demo/CMakeLists.txt -- builds the executable that links against the library
cmake_minimum_required(VERSION 3.16)
project(demo)
find_package(CUDA REQUIRED)
set(_SRCS
    main.cpp)
include_directories(include)
link_directories(F:/cudaso/build/Debug)
cuda_add_executable(demo ${_SRCS})
target_link_libraries(demo cudaso)
// main.cpp -- spawns two threads that each call into the library
#include <iostream>
#include "TimeConsume.h"
#include "cudasoSrc.h"
#include <vector>
#include <thread>

int main()
{
    const int arraySize = 1024;
    std::vector<int> a(arraySize, 0);
    std::vector<int> b(arraySize, 0);
    std::vector<int> c(arraySize, 0);
    for (int i = 0; i < arraySize; ++i)
    {
        a[i] = i;
        b[i] = i;
    }
    // Both threads deliberately run the same workload on the same buffers
    // so their GPU activity can be compared side by side in nvvp.
    std::thread th_0 = std::thread(Fun, a.data(), b.data(), c.data(), arraySize, "Thread 0");
    std::thread th_1 = std::thread(Fun, a.data(), b.data(), c.data(), arraySize, "Thread 1");
    th_0.join();
    th_1.join();
    return 0;
}
# cudaso/CMakeLists.txt -- builds the static library that simulates the external one
cmake_minimum_required(VERSION 3.16)
project(cudaso)
find_package(CUDA REQUIRED)
# Equivalent nvcc flag for per-thread default streams:
# set(
#     CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --default-stream per-thread
# )
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
set(_CUSOSRC
    TimeConsume.h
    cudasoSrc.h
    cudasoSrc.cu
)
add_definitions(-DMYDLL)
cuda_add_library(cudaso STATIC ${_CUSOSRC})
#include "cudasoSrc.h"
#include "TimeConsume.h"
#include <vector>
#include <assert.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, int *testCpy, int iTestCpYlEN);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
for (int i = 0; i < 100000; ++i)
{
c[i] = a[i] % (b[i] + 1);
c[i] = atanf(a[i]);
}
}
void Fun(int *c, int *a, int *b, int arraySize, const char* strName)
{
    int i_count = 0;
    while (i_count++ < 5)
    {
        TimeConsume ts_loop(strName);
        // 100M-element host buffer that addWithCuda copies to the GPU,
        // simulating a hidden cudaMemcpy inside the external library.
        std::vector<int> v_test_data(1024 * 1024 * 100, 0);
        // Add vectors in parallel.
        cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, v_test_data.data(), static_cast<int>(v_test_data.size()));
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "addWithCuda failed!");
            return;
        }
    }
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, int *testCpy, int testCpyLen)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Simulated library-internal traffic: allocate, copy the ~400 MB test
    // buffer to the device, and free it again on every single call.
    int *dev_cpy = 0;
    cudaStatus = cudaMalloc((void**)&dev_cpy, testCpyLen * sizeof(int));
    assert(cudaSuccess == cudaStatus);
    cudaStatus = cudaMemcpy(dev_cpy, testCpy, testCpyLen * sizeof(int), cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStatus);
    cudaFree(dev_cpy);

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch. (cudaThreadSynchronize
    // is deprecated in favor of cudaDeviceSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
In the code above, Fun is the function each thread runs; every loop iteration allocates a 100M-element (about 400 MB) host buffer that addWithCuda then copies to the GPU:
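TimeConsume.h is not shown in the original post. A minimal sketch of what it plausibly looks like, judging only from how it is used here (constructed with a label, reports elapsed time when it leaves scope); the member names are my own:

// TimeConsume.h -- hypothetical reconstruction; the real header is not shown.
#pragma once
#include <chrono>
#include <cstdio>
#include <string>

class TimeConsume
{
public:
    explicit TimeConsume(const char *name)
        : m_name(name), m_start(std::chrono::steady_clock::now()) {}

    // Report the elapsed time when the timer goes out of scope.
    ~TimeConsume()
    {
        auto end = std::chrono::steady_clock::now();
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - m_start).count();
        printf("%s: %lld ms\n", m_name.c_str(), static_cast<long long>(ms));
    }

private:
    std::string m_name;
    std::chrono::steady_clock::time_point m_start;
};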
1.2. Timing results
The code above already enables the CUDA_API_PER_THREAD_DEFAULT_STREAM macro, so each thread gets its own default stream.
The measured time is much larger than in the previous article. Inspecting with nvvp:
The marked region is the actual kernel execution, which does run in parallel; the cudaMemcpy calls themselves, however, are serialized between the two threads.
That was with two threads; how long does a single thread take? (Single-thread timing shown below.)
How can this be solved? At a minimum, the multi-threaded time should come down to match the single-thread time!
2. Attempted optimizations
2.1. Allocate device memory up front
Moving the device-memory allocation out of the loop, so it is performed only once, reduces the time somewhat, but still not enough; a sketch of the change follows below.
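The modified code was not included in the original post. A minimal sketch of the idea, with a hypothetical FunPrealloc in place of Fun, assuming the same includes as cudasoSrc.cu above:

// Sketch: the device staging buffer is allocated once per thread and
// reused, instead of cudaMalloc/cudaFree on every addWithCuda call.
void FunPrealloc(int *c, int *a, int *b, int arraySize, const char *strName)
{
    const size_t testLen = 1024ull * 1024 * 100;
    std::vector<int> v_test_data(testLen, 0);

    int *dev_cpy = 0;
    cudaMalloc((void**)&dev_cpy, testLen * sizeof(int)); // once, outside the loop

    int i_count = 0;
    while (i_count++ < 5)
    {
        TimeConsume ts_loop(strName);
        // The ~400 MB copy still happens every iteration; only the
        // per-iteration allocation/free overhead is gone.
        cudaMemcpy(dev_cpy, v_test_data.data(), testLen * sizeof(int), cudaMemcpyHostToDevice);
        // ... run addKernel on a/b/c as in addWithCuda above ...
    }
    cudaFree(dev_cpy);
}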
2.2. Multi-stream asynchronous copies
Next, two streams were created outside the loop and the copies were issued asynchronously.
That seems to make no difference at all...?
nvvp now shows four streams in total, because the kernels still run on the two per-thread default streams enabled by the compile option. Next attempt: launch the kernels on the same streams used for the copies:
Still no improvement...
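The stream variant was also not shown in the original; the sketch below reconstructs it under the same assumptions (hypothetical FunStream, copy and kernel on one explicitly created stream per thread). It also hints at why this cannot help: cudaMemcpyAsync from pageable host memory, such as a std::vector, falls back to a staged, effectively synchronous transfer, so the two threads' copies still serialize.

// Sketch: one explicit stream per thread; the copy and (in the second
// attempt) the kernel are both issued on it.
void FunStream(int *c, int *a, int *b, int arraySize, const char *strName)
{
    const size_t testLen = 1024ull * 1024 * 100;
    std::vector<int> v_test_data(testLen, 0); // pageable host memory!

    int *dev_cpy = 0;
    cudaMalloc((void**)&dev_cpy, testLen * sizeof(int));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    int i_count = 0;
    while (i_count++ < 5)
    {
        TimeConsume ts_loop(strName);
        // Asynchronous in name only: with a pageable source buffer the
        // driver stages the transfer and it behaves much like cudaMemcpy.
        cudaMemcpyAsync(dev_cpy, v_test_data.data(), testLen * sizeof(int),
                        cudaMemcpyHostToDevice, stream);
        // Second attempt: kernel on the same stream as the copy
        // (dev_a/dev_b/dev_c allocated as in addWithCuda):
        // addKernel<<<1, arraySize, 0, stream>>>(dev_c, dev_a, dev_b);
        cudaStreamSynchronize(stream);
    }

    cudaStreamDestroy(stream);
    cudaFree(dev_cpy);
}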
2.3. Use Pinned Host Memory
This one is a pleasant surprise.
The only change is that the buffer allocated up front outside the loop now comes from cudaMallocHost instead of cudaMalloc; the copy itself is still a plain cudaMemcpy.
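The modified allocation is not shown; I read it as replacing the host-side staging buffer (previously the pageable std::vector) with page-locked memory. A fragment-level sketch, reusing the hypothetical testLen/dev_cpy names from the sketches above. cudaMallocHost returns pinned host memory that the GPU's DMA engine can read directly, so the driver's internal staging copy disappears and the transfer runs at full bus bandwidth; pinned memory is also the precondition for cudaMemcpyAsync to be truly asynchronous.

// Sketch: swap the pageable std::vector staging buffer for pinned host
// memory; the copy call itself stays a plain, synchronous cudaMemcpy.
int *h_test_data = 0;
cudaMallocHost((void**)&h_test_data, testLen * sizeof(int)); // page-locked host memory
cudaMemcpy(dev_cpy, h_test_data, testLen * sizeof(int), cudaMemcpyHostToDevice);
// ...
cudaFreeHost(h_test_data); // pinned memory must be freed with cudaFreeHost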
Could this really be the answer?