Ubuntu下的CUDA编程(五)——使用pthread实现多线程CPU+GPU计算

本文介绍了一个使用pthread和CUDA实现多线程程序的例子,其中包括一条GPU线程执行向量加法操作,三条CPU线程执行循环打印任务。通过测试时间来分析不同线程间的执行关系。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

本次测试目的仅仅为了尝试能否使用pthread实现多线程对CPU和GPU分别进行控制:
文件结构如下:
main.cc:控制线程的主函数
vector.cu:实现对核函数进行调用的任务函数
vector.h:任务函数声明
vector_kernel.cu:核函数
vector_kernel.h:核函数声明
生成文件如下:
vector:最终生成的可执行文件
main.o:生成的主函数obj
vector.o:生成的任务函数obj
vector_kernel.o:生成的核函数obj

功能描述:
线程1:执行GPU代码(向量加法)
线程2:执行CPU代码(循环打印数据)
线程3:执行CPU代码(循环打印数据)
线程4:执行CPU代码(循环打印数据)

文件代码:
main.cc:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "vector.h"

pthread_t thread[4];

/*
 * Thread 1: GPU worker.
 *
 * Fills two 100-element host vectors with 1.0f, calls vectorAdd() on them
 * 100 times, prints c[0], and reports the elapsed clock() time.
 *
 * NOTE: clock() reports processor time consumed by the whole process, so
 * with several threads running the printed figure is not this thread's
 * own wall-clock duration.
 *
 * The thread argument is unused; the thread always exits with NULL.
 */
void *thread1(void *)
{
    const clock_t start = clock();
    const int n = 100;
    float *a, *b, *c;
    int i;

    printf("thread1:I'm thread1\n");

    a = (float *)malloc(n * sizeof(float));
    b = (float *)malloc(n * sizeof(float));
    /* Zero-filled so c[0] is well defined even if vectorAdd() never
       manages to write the result back. */
    c = (float *)calloc(n, sizeof(float));
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "thread1: out of memory\n");
        free(a);
        free(b);
        free(c);
        pthread_exit(NULL);
    }

    for (i = 0; i < n; i++)
    {
        a[i] = 1.0f;
        b[i] = 1.0f;
    }

    /* Repeat the GPU task to give it a measurable duration. */
    for (i = 0; i < 100; i++)
    {
        vectorAdd(a, b, c, n);
    }
    printf("thread1:c[%d] = %f\n", 0, c[0]);

    free(a);
    free(b);
    free(c);

    /* Keep the tick count in clock_t and divide in double: squeezing
       clock_t through float loses precision on longer runs. */
    printf("thread1: task was finished!\ncostTime1 : %f\n",
           (double)(clock() - start) / CLOCKS_PER_SEC);
    pthread_exit(NULL);
}

/*
 * Thread 2: CPU worker.
 *
 * Prints "thread2:k = <k>" for k = 1 .. 1000000 and then reports the
 * elapsed clock() time.
 *
 * NOTE: clock() reports processor time consumed by the whole process, so
 * with several threads running the printed figure is not this thread's
 * own wall-clock duration.
 *
 * The thread argument is unused; the thread always exits with NULL.
 */
void *thread2(void *)
{
    const clock_t start = clock();
    const int total = 1000 * 1000;
    int k;

    printf("thread2:I'm thread2\n");

    for (k = 1; k <= total; k++)
    {
        printf("thread2:k = %d\n", k);
    }

    /* Keep the tick count in clock_t and divide in double: squeezing
       clock_t through float loses precision on longer runs. */
    printf("thread2: task was finished!\ncostTime2 : %f\n",
           (double)(clock() - start) / CLOCKS_PER_SEC);
    pthread_exit(NULL);
}

/*
 * Thread 3: CPU worker.
 *
 * Prints "thread3:k = <k>" for k = 1 .. 1000000 and then reports the
 * elapsed clock() time.
 *
 * NOTE: clock() reports processor time consumed by the whole process, so
 * with several threads running the printed figure is not this thread's
 * own wall-clock duration.
 *
 * The thread argument is unused; the thread always exits with NULL.
 */
void *thread3(void *)
{
    const clock_t start = clock();
    const int total = 1000 * 1000;
    int k;

    printf("thread3:I'm thread3\n");

    for (k = 1; k <= total; k++)
    {
        printf("thread3:k = %d\n", k);
    }

    /* Keep the tick count in clock_t and divide in double: squeezing
       clock_t through float loses precision on longer runs. */
    printf("thread3: task was finished!\ncostTime3 : %f\n",
           (double)(clock() - start) / CLOCKS_PER_SEC);
    pthread_exit(NULL);
}

/*
 * Thread 4: CPU worker.
 *
 * Prints "thread4:k = <k>" for k = 1 .. 1000000 and then reports the
 * elapsed clock() time.
 *
 * NOTE: clock() reports processor time consumed by the whole process, so
 * with several threads running the printed figure is not this thread's
 * own wall-clock duration.
 *
 * The thread argument is unused; the thread always exits with NULL.
 */
void *thread4(void *)
{
    const clock_t start = clock();
    const int total = 1000 * 1000;
    int k;

    printf("thread4:I'm thread4\n");

    for (k = 1; k <= total; k++)
    {
        printf("thread4:k = %d\n", k);
    }

    /* Keep the tick count in clock_t and divide in double: squeezing
       clock_t through float loses precision on longer runs. */
    printf("thread4: task was finished!\ncostTime4 : %f\n",
           (double)(clock() - start) / CLOCKS_PER_SEC);
    pthread_exit(NULL);
}

/*
 * Start the four worker threads (thread1 .. thread4), storing their ids
 * in the global thread[] array and printing one status line per thread.
 *
 * A slot whose thread could not be started is left all-zero, which is the
 * marker thread_wait() uses to skip it.
 */
void thread_create()
{
    void *(*const entry[4])(void *) = {thread1, thread2, thread3, thread4};
    int i;

    memset(&thread, 0, sizeof(thread));

    for (i = 0; i < 4; i++)
    {
        int rc = pthread_create(&thread[i], NULL, entry[i], NULL);
        if (rc != 0)
        {
            /* POSIX leaves *thread undefined after a failed create, so
               restore the "not started" marker explicitly. */
            memset(&thread[i], 0, sizeof(thread[i]));
            printf("线程%d创建失败!\n", i + 1);
            /* pthread_create returns the error number directly. */
            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
        }
        else
        {
            printf("线程%d被创建!\n", i + 1);
        }
    }
}
/*
 * Join every worker recorded in the global thread[] array, in order,
 * printing a line after each join. Slots that are zero are skipped.
 */
void thread_wait()
{
    const int count = (int)(sizeof(thread) / sizeof(thread[0]));
    int idx;

    for (idx = 0; idx < count; ++idx)
    {
        if (thread[idx] == 0)
            continue;   /* slot was never filled in */

        pthread_join(thread[idx], NULL);
        printf("线程%d已经结束\n", idx + 1);
    }
}

/*
 * Program entry point: starts the worker threads, waits for all of them,
 * and prints the clock() time consumed in between.
 *
 * NOTE: clock() reports processor time consumed by the whole process, not
 * wall-clock time.
 */
int main()
{
    clock_t start;

    printf("我是主函数,正在创建线程\n");
    start = clock();
    thread_create();
    printf("我是主函数,正在等待线程完成任务\n");
    thread_wait();

    /* Keep the tick count in clock_t and divide in double: squeezing
       clock_t through float loses precision on longer runs. */
    printf("costTime0 : %f\n", (double)(clock() - start) / CLOCKS_PER_SEC);
    return 0;
}


vector.cu:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "vector.h"
#include "vector_kernel.h"

/* Report a failed CUDA runtime call on stderr; returns nonzero on failure. */
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "vectorAdd: %s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Element-wise vector addition on the GPU: c[i] = a[i] + b[i], 0 <= i < n.
 *
 * a, b : host input arrays of n floats
 * c    : host output array of n floats
 * n    : element count; nothing is done when n <= 0
 *
 * Device buffers are allocated and released inside the call. On any CUDA
 * error a message is written to stderr and c is left unmodified.
 */
void vectorAdd(float *a, float *b, float *c, int n)
{
    /* Conservative block size that fits every device's per-block limit. */
    const int threadsPerLaunch = 256;
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    size_t bytes;
    int offset, count, ok;

    if (n <= 0)
        return;
    bytes = (size_t)n * sizeof(float);

    /* d_c is output only, so only a and b are uploaded. */
    ok = !cudaFailed(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)") &&
         !cudaFailed(cudaMalloc((void **)&d_b, bytes), "cudaMalloc(d_b)") &&
         !cudaFailed(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)") &&
         !cudaFailed(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "upload of a") &&
         !cudaFailed(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "upload of b");

    if (ok)
    {
        /* Launch one single-block chunk at a time with shifted pointers:
           this stays within the threads-per-block limit for any n and does
           not depend on the kernel taking the block index into account. */
        for (offset = 0; offset < n; offset += count)
        {
            count = n - offset;
            if (count > threadsPerLaunch)
                count = threadsPerLaunch;
            vectorAddKernel<<<1, count>>>(d_a + offset, d_b + offset,
                                          d_c + offset, count);
        }
        ok = !cudaFailed(cudaGetLastError(), "kernel launch");
    }

    if (ok)
        cudaFailed(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "download of c");

    /* cudaFree accepts a null pointer, so no per-buffer guard is needed. */
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

vector.h:

#include <stdio.h>
#include <stdlib.h>

/*
 * Element-wise vector addition performed on the GPU: c[i] = a[i] + b[i].
 * a and b are host input arrays and c is a host output array, each holding
 * n floats; the data is copied to and from the device inside the call.
 */
void vectorAdd(float *a, float *b, float *c, int n);

vector_kernel.cu:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "vector_kernel.h"

/*
 * CUDA kernel: each thread adds one pair of elements, c[i] = a[i] + b[i].
 *
 * a, b : input arrays of n floats, read on the device
 * c    : output array of n floats, written on the device
 * n    : element count; threads whose index is >= n do nothing
 */
__global__ void vectorAddKernel(float *a, float *b, float *c, int n)
{
    /* Global element index. For a single-block launch blockIdx.x is 0 and
       this reduces to threadIdx.x; with several blocks each block covers
       its own slice instead of all recomputing the first one. */
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < n)
        c[tid] = a[tid] + b[tid];
}

vector_kernel.h:

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>

/*
 * CUDA kernel: each thread handles one index i and, when i < n, writes
 * c[i] = a[i] + b[i]. The pointers are dereferenced on the device.
 */
__global__ void vectorAddKernel(float *a, float *b, float *c, int n);

而最主要的是makefile的内容。由于nvcc默认按C++方式编译.cu文件并生成obj(函数名会经过C++名字修饰),所以主函数也需要写成.cc文件并按C++方式编译,否则链接时会因为找不到对应的符号而出错!

makefile:

# Final executable. The pthread library has to be named at link time;
# passing -lpthread to a compile-only (-c) step has no effect.
vector : main.o vector.o vector_kernel.o
	nvcc -o vector main.o vector.o vector_kernel.o -lpthread

# Device kernel object.
vector_kernel.o : vector_kernel.cu vector_kernel.h
	nvcc -c vector_kernel.cu

# Host-side task function that launches the kernel.
vector.o : vector.cu vector.h vector_kernel.h
	nvcc -c vector.cu

# Thread-control main program; -pthread enables thread support when compiling.
main.o : main.cc vector.h
	cc -pthread -c main.cc

通过测试时间可以知道几个线程之间的执行关系,大家自己测试啦~

以下是一个运行结果:

Ubuntu下的CUDA编程(五)——使用pthread实现多线程CPU+GPU计算

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值