当我们使用显卡调用cuFFT库计算FFT后(FFT计算,请参考https://blog.youkuaiyun.com/endlch/article/details/46724811),需要对cufftComplex*类型的数据进行进一步处理,比如取模,两个复数相乘等操作,恰巧,库里面也配套了cuComplex.h,其中包含复数基本操作函数,主机和设备端均可调用。我目前还没找到针对复数数组取模的现成并行函数,就写了个简单的核函数。以下是代码。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
using namespace std;
__global__ void abs_complex(cufftComplex* pSrc, float* pDst)
{
int tx = threadIdx.x;
pDst[tx] = cuCabsf(pSrc[tx]);
}
int main()
{
cufftComplex* hSrc = (cufftComplex*)malloc(sizeof(cufftComplex) * 10);
float* hDst = (float*)malloc(sizeof(float) * 10);
cufftComplex* pSrc;
float* pDst;
cudaMalloc((void **)(&pSrc), sizeof(cufftComplex) * 10);
cudaMalloc((void **)(&pDst), sizeof(float) * 10);
memset(hSrc, 0, sizeof(cufftComplex) * 10);
for (int i = 0; i < 10; i++)
{
hSrc[i].x = i;
hSrc[i].y = i;
}
cudaMemcpy(pSrc, hSrc, 10 * sizeof(cufftComplex), cudaMemcpyHostToDevice);
dim3 block(10);
dim3 grid(1);
abs_complex << <grid, block >> > (pSrc, pDst);
cudaMemcpy(hDst, pDst, sizeof(float) * 10, cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; i++)
{
//hDst[i] = cuCabsf(hSrc[i]);
cout<<hDst[i]<<endl;
}
cudaFree(pSrc);
cudaFree(pDst);
free(hSrc);
free(hDst);
system("pause");
return 0;
}
其中重点是取模函数 float cuCabsf (cuFloatComplex x) (来自cuComplex.h):
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
求模函数这样写原因之一是为了不让实部或虚部平方后的数不超float的32位,但可能会影响精度。