Matlab中有dst函数用来计算离散正弦变换,但是类似于DFT,它的变换矩阵并不是一个完全的正交矩阵,需要外加一个定标因子才能变换正交矩阵,有关Matlab中的dst正变换和dst逆变换函数对应的公式分别如下:


从以上两个公式可以看出它类似于我们常见的DFT,DFT反变换中有个1/N定标因子,正变换中没有(此处是反变换中有个2/(N+1))。
CPU版本:
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include "fftw3.h" //fft动态库
void dst(double* a,int n,int m,double* b) //dst
{
int yW = 2*(n+1);
fftw_complex *iin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*yW);
fftw_complex *out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*yW);
fftw_plan p;
//对每一行做FFT变换
for (int j=0;j<m;j++)
{
iin[0][0] = 0;
iin[0][1] = 0;
iin[n+1][0] = 0;
iin[n+1][1] = 0;
for (int i=0;i<n;i++)
{
iin[i+1][0] = a[j*n+i];
iin[n+2+i][0] = -a[j*n+(n-1-i)];
iin[i][1] = 0;
}
p = fftw_plan_dft_1d(yW,iin,out,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<n;i++)
{
b[j*n+i] = -out[i+1][1]/2;
}
}
fftw_destroy_plan(p);
fftw_free(iin);
fftw_free(out);
}
//使用实数做傅立叶,更省内存
void dst2(double* a,int n,int m,double* b)
{
int yW = 2*(n+1);
double *iin = (double*)fftw_malloc(sizeof(double)*yW);
fftw_complex *out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*(n+2));
fftw_plan p;
//对每一行做FFT变换
for (int j=0;j<m;j++)
{
iin[0] = 0;
iin[n+1] = 0;
for (int i=0;i<n;i++)
{
iin[i+1] = a[j*n+i];
//iin[n+2+i] = -a[j*n+(n-1-i)];
iin[yW-1-i] = -a[j*n+i];
}
p = fftw_plan_dft_r2c_1d(yW,iin,out,FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<n;i++)
{
b[j*n+i] = -out[i+1][1]/2;
}
}
fftw_destroy_plan(p);
fftw_free(iin);
fftw_free(out);
}
void idst(double* a,int n,int m,double* b)
{
int yW = 2*(n+1);
fftw_complex *iin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*yW);
fftw_complex *out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*yW);
fftw_plan p;
//对每一行做FFT变换
for (int j=0;j<m;j++)
{
iin[0][0] = 0;
iin[0][1] = 0;
iin[n+1][0] = 0;
iin[n+1][1] = 0;
for (int i=0;i<n;i++)
{
iin[i+1][0] = a[j*n+i];
iin[n+2+i][0] = -a[j*n+(n-1-i)];
iin[i][1] = 0;
}
p = fftw_plan_dft_1d(yW,iin,out,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<n;i++)
{
b[j*n+i] = -out[i+1][1]/(n+1);
}
}
fftw_destroy_plan(p);
fftw_free(iin);
fftw_free(out);
}
void dstXY(double* a,int w,int h,double* b)
{
//对行每行做dst
int ww = 2*(w+1);
fftw_complex *hiin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*ww);
fftw_complex *hout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*ww);
fftw_plan p;
for (int j=0;j<h;j++)
{
hiin[0][0] = 0;
hiin[0][1] = 0;
hiin[w+1][0] = 0;
hiin[w+1][1] = 0;
for (int i=0;i<w;i++)
{
hiin[i+1][0] = a[j*w+i];
hiin[w+2+i][0] = -a[j*w+(w-1-i)];
hiin[i+1][1] = 0;
hiin[w+2+i][1] = 0;
}
p = fftw_plan_dft_1d(ww,hiin,hout,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<w;i++)
{
b[j*w+i] = -hout[i+1][1]/2;
}
}
fftw_free(hiin);
fftw_free(hout);
//对结果每列做dst
int hh = 2*(h+1);
fftw_complex *liin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
fftw_complex *lout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
for (int i=0;i<w;i++)
{
liin[0][0] = 0;
liin[0][1] = 0;
liin[h+1][0] = 0;
liin[h+1][1] = 0;
for (int j=0;j<h;j++)
{
liin[j+1][0] = b[j*w+i];
liin[h+2+j][0] = -b[(h-1-j)*w+i];
liin[j+1][1] = 0;
}
p = fftw_plan_dft_1d(hh,liin,lout,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int j=0;j<h;j++)
{
b[j*w+i] = -lout[j+1][1]/2;
}
}
fftw_free(liin);
fftw_free(lout);
fftw_destroy_plan(p);
}
void dstXY2(double* a,int w,int h,double* b)
{
//对行每行做dst
int ww = 2*(w+1);
double *hiin = (double*)fftw_malloc(sizeof(double)*ww);
fftw_complex *hout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*(w+2));
fftw_plan p;
for (int j=0;j<h;j++)
{
hiin[0] = 0;
hiin[w+1] = 0;
for (int i=0;i<w;i++)
{
hiin[i+1] = a[j*w+i];
hiin[w+2+i] = -a[j*w+(w-1-i)];
}
p = fftw_plan_dft_r2c_1d(ww,hiin,hout,FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<w;i++)
{
b[j*w+i] = -hout[i+1][1]/2;
}
}
fftw_free(hiin);
fftw_free(hout);
//对结果每列做dst
int hh = 2*(h+1);
double *liin = (double*)fftw_malloc(sizeof(double)*hh);
fftw_complex *lout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*(h+2));
for (int i=0;i<w;i++)
{
liin[0] = 0;
liin[h+1] = 0;
for (int j=0;j<h;j++)
{
liin[j+1] = b[j*w+i];
liin[h+2+j] = -b[(h-1-j)*w+i];
}
p = fftw_plan_dft_r2c_1d(hh,liin,lout,FFTW_ESTIMATE);
fftw_execute(p);
for (int j=0;j<h;j++)
{
b[j*w+i] = -lout[j+1][1]/2;
}
}
fftw_free(liin);
fftw_free(lout);
fftw_destroy_plan(p);
}
void idstXY(double* a,int w,int h,double* b)
{
//对行每行做idst
int ww = 2*(w+1);
fftw_complex *hiin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*ww);
fftw_complex *hout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*ww);
fftw_plan p;
for (int j=0;j<h;j++)
{
hiin[0][0] = 0;
hiin[0][1] = 0;
hiin[w+1][0] = 0;
hiin[w+1][1] = 0;
for (int i=0;i<w;i++)
{
hiin[i+1][0] = a[j*w+i];
hiin[w+2+i][0] = -a[j*w+(w-1-i)];
hiin[i+1][1] = 0;
hiin[w+2+i][1] = 0;
}
p = fftw_plan_dft_1d(ww,hiin,hout,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int i=0;i<w;i++)
{
b[j*w+i] = -hout[i+1][1]/(w+1);
}
}
fftw_free(hiin);
fftw_free(hout);
//对结果每列做idst
int hh = 2*(h+1);
fftw_complex *liin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
fftw_complex *lout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
for (int i=0;i<w;i++)
{
liin[0][0] = 0;
liin[0][1] = 0;
liin[h+1][0] = 0;
liin[h+1][1] = 0;
for (int j=0;j<h;j++)
{
liin[j+1][0] = b[j*w+i];
liin[h+2+j][0] = -b[(h-1-j)*w+i];
liin[j+1][1] = 0;
}
p = fftw_plan_dft_1d(hh,liin,lout,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int j=0;j<h;j++)
{
b[j*w+i] = -lout[j+1][1]/(h+1);
}
}
fftw_free(liin);
fftw_free(lout);
fftw_destroy_plan(p);
}
void dstY(double* a,int w,int h,double* b)
{
fftw_plan p;
int hh = 2*(h+1);
fftw_complex *liin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
fftw_complex *lout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*hh);
for (int i=0;i<w;i++)
{
liin[0][0] = 0;
liin[0][1] = 0;
liin[h+1][0] = 0;
liin[h+1][1] = 0;
for (int j=0;j<h;j++)
{
liin[j+1][0] = a[j*w+i];
liin[h+2+j][0] = -a[(h-1-j)*w+i];
liin[j+1][1] = 0;
liin[h+2+j][1] = 0;
}
p = fftw_plan_dft_1d(hh,liin,lout,FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (int j=0;j<h;j++)
{
b[j*w+i] = -lout[j+1][1]/2;
}
}
fftw_free(liin);
fftw_free(lout);
fftw_destroy_plan(p);
}
int main()
{
int w = 5;
int h = 4;
double* b = (double*)malloc(h*w*sizeof(double));
//A=[1,2,3,7;4,5,6,1;7,8,9,2;1,2,3,4;3,4,5,9];
double a[20] = {1,4,7,1,3,
2,5,8,2,4,
3,6,9,3,5,
7,1,2,4,9};
//dst(a,w,h,b); //相当于matlab的 :DSTA = dst(A);
//dstXY(a,w,h,b); //相当于matlab的 :DSTA = dst(A); DSTAA = dst(DSTA')';
//dstY(a,w,h,b); //相当于matlab的 :DSTAA = dst(DSTA')';
//idst(a,w,h,b); //相当于matlab的 :DSTB = dst(A')';
//idstXY(a,w,h,b); // === 相当于matlab的 :IDSTA = idst(A); IDSTAA = idst(IDSTA')';
//dstXY2(a,w,h,b);
dst2(a,w,h,b);
for (int j=0;j<h;j++)
{
for (int i=0;i<w;i++)
{
printf("%lf ",b[j*w+i]);
}
printf("\n");
}
//free(a);
free(b);
system("pause");
return 0;
}
GPU版本:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
__global__ void InKernel(double* d_a,int w,int h,cufftReal* iin)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
if (x<w&&y<h)
{
int ww = 2*(w+1);
float val = d_a[y*w+x];
if(x==0) iin[y*ww] = 0;
if(x==w-1) iin[y*ww+w+1] = 0;
iin[y*ww+x+1] = val;
iin[y*ww+(ww-1-x)] = -val;
}
}
__global__ void OutKernel(cufftComplex* out,int w,int h,double* d_b)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
if (x<w&&y<h)
{
d_b[y*w+x] = -out[y*(w+2)+x+1].y/2;
}
}
void cuda_dst(double* d_a,int w,int h,double* d_b)
{
int yW = 2*(w+1);
int batch = h;
cufftHandle plan;
//一维傅立叶批处理
cufftPlan1d(&plan, yW, CUFFT_R2C, batch);
cufftReal* iin;
cudaMalloc((void**)&iin,sizeof(cufftReal)*yW*batch);
cufftComplex* out;
cudaMalloc((void**)&out, sizeof(cufftComplex)*(w+2)*batch);
dim3 block(16,16,1);
dim3 grid((w+15)>>4,(h+15)>>4,1);
InKernel<<<grid,block>>>(d_a,w,h,iin);
cufftExecR2C(plan,iin,out);
OutKernel<<<grid,block>>>(out,w,h,d_b);
cudaFree(iin);
cudaFree(out);
cufftDestroy(plan);
}
__global__ void InYKernel(double* d_a,int w,int h,cufftReal* iin)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
if (x<w&&y<h)
{
int hh = 2*(h+1);
float val = d_a[y*w+x];
if(y==0) iin[x*hh] = 0;
if(y==w-1) iin[x*hh+h+1] = 0;
iin[x*hh+y+1] = val;
iin[x*hh+(hh-1-y)] = -val;
}
}
__global__ void OutYKernel(cufftComplex* out,int w,int h,double* d_b)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
if (x<w&&y<h)
{
d_b[y*w+x] = -out[x*(h+2)+y+1].y/2;
}
}
void cuda_dstY(double* d_a,int w,int h,double* d_b)
{
int hh = 2*(h+1);
int batch = w;
cufftHandle plan;
cufftPlan1d(&plan,hh,CUFFT_R2C,batch);
int n[1] = {hh};//fft的点数
int istride = w; //进行fft的输入数据间隔点数
int ostride = w; //进行fft的输出数据间隔点数
int inembed[1] = {hh}; //行数 进行fft的点数(每隔ISTRIDE个点读取的输入数据个数)
int onembed[1] = {hh}; //fft后的结果每隔OSTRIDE个点输出的个数
int idist = 1; //每次批处理输入数据起始元素间的间隔点数
int odist = 1; //每次批处理输出数据的元素间的间隔点数
//int n[1] = {hh};
//int inembed[1] = {hh}; //行数 进行fft的点数(每隔ISTRIDE个点读取的输入数据个数)
//int onembed[1] = {hh}; //fft后的结果每隔OSTRIDE个点输出的个数
//cufftPlanMany(&plan,1,n,NULL,w,1,NULL,w,1,CUFFT_R2C,batch);
cufftReal* iin;
cudaMalloc((void**)&iin,sizeof(cufftReal)*hh*batch);
cufftComplex* out;
cudaMalloc((void**)&out, sizeof(cufftComplex)*(h+2)*batch);
dim3 block(16,16,1);
dim3 grid((w+15)>>4,(h+15)>>4,1);
InYKernel<<<grid,block>>>(d_a,w,h,iin);
cufftReal* hiin = (cufftReal*)malloc(sizeof(cufftReal)*hh*batch);
cudaMemcpy(hiin,iin,hh*batch*sizeof(cufftReal),cudaMemcpyDeviceToHost);
cufftExecR2C(plan,iin,out);
cufftComplex* hout = (cufftComplex*)malloc(sizeof(cufftComplex)*(h+2)*batch);
cudaMemcpy(hout,out,sizeof(cufftComplex)*(h+2)*batch,cudaMemcpyDeviceToHost);
OutYKernel<<<grid,block>>>(out,w,h,d_b);
cudaFree(iin);
cudaFree(out);
cufftDestroy(plan);
}
int main()
{
int w = 5;
int h = 4;
double* b = (double*)malloc(h*w*sizeof(double));
double a[20] = {1,4,7,1,3,
2,5,8,2,4,
3,6,9,3,5,
7,1,2,4,9};
double* d_a;
cudaMalloc((void**)&d_a,h*w*sizeof(double));
cudaMemcpy(d_a,a,h*w*sizeof(double),cudaMemcpyHostToDevice);
double* d_b;
cudaMalloc((void**)&d_b,h*w*sizeof(double));
//cuda_dst(d_a,w,h,d_b); //DSTA = dst(A);
cuda_dstY(d_a,w,h,d_b);//DSTB = dst(A')';
cudaMemcpy(b,d_b,h*w*sizeof(double),cudaMemcpyDeviceToHost);
for (int j=0;j<h;j++)
{
for (int i=0;i<w;i++)
{
printf("%lf ",b[j*w+i]);
}
printf("\n");
}
cudaFree(d_a);
cudaFree(d_b);
free(b);
system("pause");
return 0;
}