SIMD向量化案例

Begoniaish

已于 2024-03-11 22:01:10 修改

阅读量750

点赞数 5

文章标签：算法 c语言

于 2024-02-22 14:29:32 首次发布

本文链接：https://blog.youkuaiyun.com/Begoniaish/article/details/136232984

版权

本文介绍了如何使用ARMNEONIntrinsic指令优化传统函数（如vsum）的向量化实现，以及如何应用ompsimd进行图像数据转换函数（如bgra2rgb）的向量化改写，同时展示了矩阵向量乘法（dgemv）的SIMD优化。文章还涉及图像旋转的向量化处理，提升计算效率。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.使用NEON intrinsic指令，实现下图所示vsum传统串行函数的向量化

要求：
a) #include <arm_neon.h>

b) 函数接口声明为： int vsum_arm_neon(int *v, int n)

c) 主函数分别调用vsum和vsum_arm_neon接口，加入clock计时函数，对比向量化前后的性能。

d) 初始化长度n为1024000，数组v元素初始为0~10之间随机数

e) 编译选项使用–O1优化等级

f) 存储int元素的128位向量数据结构名称：int32x4_t

g) 向量vs记录累加结果，其初始化使用函数：vdupq_n_s32

h) 累加结果向量vs各元素的最终求和，使用：vaddvq_s32

i) 可在ARM官网大全，查看每条Intrinsic函数用法：

Intrinsics – Arm Developer

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <arm_neon.h>

int vsum(int *v, int n) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        sum += v[i];
    }
    return sum;
}

int vsum_arm_neon(int *v, int n) {
    int32x4_t vs = vdupq_n_s32(0);
    int32x4_t *vv = (int32x4_t*) v;
    int32x4_t vtemp;
    int i;

    for (i = 0; i < n / 4; i++) {
        vtemp = vld1q_s32(&vv[i]);
        vs = vaddq_s32(vs, vtemp);
    }

    int32_t sum = vaddvq_s32(vs);
    for (i = i * 4; i < n; i++) {
        sum += v[i];
    }

    return sum;
}

int main() {
    int n = 1024000;
    int v[n];

    // 初始化数组v为0~10之间的随机数
    srand(time(NULL));
    for (int i = 0; i < n; i++) {
        v[i] = rand() % 11;
    }

    clock_t start_time = clock();
    int sum = vsum(v, n);
    clock_t end_time = clock();
    printf("vsum elapsed time: %.4f seconds\n", (double) (end_time - start_time) / CLOCKS_PER_SEC);

    start_time = clock();
    sum = vsum_arm_neon(v, n);
    end_time = clock();
    printf("vsum_arm_neon elapsed time: %.4f seconds\n", (double) (end_time - start_time) / CLOCKS_PER_SEC);

    return 0;
}

2.使用omp simd指导语句，对图像数据格式转换函数`-bgra2rgb`（如下图所示），进行向量化改写，并对比改写前后时间。

要求：
a) 转换结果需正确

b) 打印对比bgra2rgb函数运行时间

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <arm_neon.h>
using namespace std;

void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
	#pragma omp parallel for simd
	for (int i = 0; i < h; ++i)
	{
		for (int j = 0; j < w; j++)
		{
			dst[(i * w + j) * 3] = src[(i * w + j) * 4 + 2];
			dst[(i * w + j) * 3 + 1] = src[(i * w + j) * 4 + 1];
			dst[(i * w + j) * 3 + 2] = src[(i * w + j) * 4];
		}
	}
}


int main()
{
	int nloop = 100;
	const int w = 480;
	const int h = 640;
	uint8_t bgra_mat[w * h * 4];
	uint8_t rgb_mat[w * h * 3];
	srand(100);
	for (int i = 0; i < w * h * 4; i++)
	{
		bgra_mat[i] = rand() % 256;
	}

	clock_t t = clock();
	for (int iloop = 0; iloop < nloop; iloop++)
		bgra2rgb(bgra_mat, rgb_mat, w, h);
	t = clock() - t;

	cout << "bgra[4-6] data:" << (int)bgra_mat[4] << "," << (int)bgra_mat[5] << "," << (int)bgra_mat[6] << endl;
	cout << "rgb[3-5] data:" << (int)rgb_mat[3] << "," << (int)rgb_mat[4] << "," << (int)rgb_mat[5] << endl;
	cout << "seq cost time(clock):" << t / nloop << endl;
}

3.使用了如下图所示的omp simd语句来实现bgra2rgb函数自动向量化。通过手动向量化来实现

a) 借助interleaving交叉存取技术的ld4、st3指令

b) 可参考如下图所示的RGB to BGR的汇编层面实现过程

c）向量化函数声明如下：

void bgra2rgb_neon(const int8_t *src, int8_t *dst, int w, int h);

d）使用intrinsic函数，手动向量化图像数据结构转化函数bgra2rgb。

f）对比bgra2rgb原始函数和bgra2rgb_neon向量化函数的运行结果和时间

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <arm_neon.h>
using namespace std;
/*
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
	#pragma omp parallel for simd
	for (int i = 0; i < h; ++i)
	{
		for (int j = 0; j < w; j++)
		{
			dst[(i * w + j) * 3] = src[(i * w + j) * 4 + 2];
			dst[(i * w + j) * 3 + 1] = src[(i * w + j) * 4 + 1];
			dst[(i * w + j) * 3 + 2] = src[(i * w + j) * 4];
		}
	}
}
*/

void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
    for (int i = 0; i < h; ++i)
    {
        for (int j = 0; j < w; j += 4)
        {
            uint8x8x4_t vsrc = vld4_u8(&src[(i * w + j) * 4]);
            uint8x8x3_t vdst;
            vdst.val[0] = vsrc.val[2]; // B
            vdst.val[1] = vsrc.val[1]; // G
            vdst.val[2] = vsrc.val[0]; // R
            vst3_u8(&dst[(i * w + j) * 3], vdst);
        }
    }
}
int main()
{
	int nloop = 100;
	const int w = 480;
	const int h = 640;
	uint8_t bgra_mat[w * h * 4];
	uint8_t rgb_mat[w * h * 3];
	srand(100);
	for (int i = 0; i < w * h * 4; i++)
	{
		bgra_mat[i] = rand() % 256;
	}

	clock_t t = clock();
	for (int iloop = 0; iloop < nloop; iloop++)
		bgra2rgb(bgra_mat, rgb_mat, w, h);
	t = clock() - t;

	cout << "bgra[4-6] data:" << (int)bgra_mat[4] << "," << (int)bgra_mat[5] << "," << (int)bgra_mat[6] << endl;
	cout << "rgb[3-5] data:" << (int)rgb_mat[3] << "," << (int)rgb_mat[4] << "," << (int)rgb_mat[5] << endl;
	cout << "seq cost time(clock):" << t / nloop << endl;
}

4.实现针对32位浮点型向量的neon版本除法接口

目标：

实现neon版本除法接口，并在主函数中调用该接口，保证计算结果正确。

要求：

a) 接口声明为：float32x4_t neon_vdivq_f32(float32x4_t op1, float32x4_t op2)

以实现形如r[i]=op1[i]/op2[i]的向量高精度除法。

其中op1、op2为操作数向量，r为返回值向量。三者皆为float32x4_t类型。

b) 在主函数中，使用长度为32的一维数组作为输入来调用neon_vdivq_f32接口，并验证其正确性。

提示：

vrecpeq_f32

vrecpsq_f32

牛顿 - 拉夫逊迭代


#include <stdio.h>
#include <arm_neon.h>

float32x4_t vdivq_f32_neon(float32x4_t op1, float32x4_t op2)
{
    float32x4_t reciprocal = vrecpeq_f32(op2);  // 计算 op2 的倒数
    reciprocal = vrecpsq_f32(op2, reciprocal);  // 进行牛顿-拉夫逊迭代优化
    return vmulq_f32(op1, reciprocal);  // 返回 op1 与 op2 的逐元素除法结果
}

int main()
{
    float arr_op1[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float arr_op2[4] = {0.5f, 1.0f, 2.0f, 0.0f};  // 注意处理除数为0的情况

    float32x4_t op1 = vld1q_f32(arr_op1);
    float32x4_t op2 = vld1q_f32(arr_op2);

    float32x4_t result = vdivq_f32_neon(op1, op2);

    float res[4];
    vst1q_f32(res, result);

    for (int i = 0; i < 4; i++) {
        printf("Result[%d]: %f\n", i, res[i]);
    }

    return 0;
}

5.使用simd和循环展开技术，对矩阵向量乘法dgemv进行改写，以实现y=Ax计算的性能优化。其中A是大小为n x m的矩阵，x、y是长度为m的向量。

要求
a) 外层i循环进行循环展开
b) 内层j循环进行simd向量化
c) 对比串行和优化版本的运行时间，确保结果正确性

提示
vdupq_n_f64：标量填充
vfmaq_f64：融合乘加操作
vaddvq_f64：向量内部求和

#include <iostream>
#include <ctime>
#include <omp.h>
#include <arm_neon.h>
using namespace std;
/*
void dgemv(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    for (int i = 0; i < n; i++)
    {
        double y_tmp = 0.0;
        for (int j = 0; j < m; j++)
        {
            y_tmp += A[i * m + j] * x[j];
        }
        y[i] = y_tmp;
    }
    return;
}
*/
 

void dgemv(const int n, const int m, const double *const A,
                         const double *const x, double *y)
{
    for (int i = 0; i < n; i +=2) // 外层i循环进行循环展开
    {
        float64x2_t y_tmp = vdupq_n_f64(0.0);
		float64x2_t y_tmp0 = vdupq_n_f64(0.0);
        for (int j = 0; j < m; j +=2) // 内层j循环进行simd向量化
        {
            float64x2_t x_vec = vld1q_f64(&x[j]);

            y_tmp = vfmaq_f64(y_tmp, vld1q_f64(&A[i * m + j]), x_vec);
			y_tmp0 = vfmaq_f64(y_tmp0, vld1q_f64(&A[(i + 1) * m + j]), x_vec);
        }
        y[i] = vaddvq_f64(y_tmp);
        y[i + 1] = vaddvq_f64(y_tmp0);
    }
}
int main()
{
    int nloop = 10;
    int n = 1024;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];
 
    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }
 
    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        dgemv(n, n, mat_A, vec_x, vec_y);
    }
    t = clock() - t;
 
    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;
 
    cout << "gemv ave cost time(clock):" << t / nloop << endl;
}

6.图像旋转

目标：

a) 将512x512大小的Lena图像（上图）逆时针旋转90度

b) 使用neon intrinsic函数进行优化

提示：

先转置（借助vtrn指令）

再水平翻转（借助vrev指令）

#include <arm_neon.h>

void rotateImageNeon(uint8_t* src, uint8_t* dst, int width, int height) {
    // 逐行处理
    for (int y = 0; y < height; y++) {
        // 每次处理16个像素，因为NEON寄存器可以同时处理16个8位整数
        for (int x = 0; x < width; x += 16) {
            // 加载16个像素到NEON寄存器
            uint8x16_t pixels = vld1q_u8(src + y * width + x);
            // 转置
            pixels = vtrnq_u8(pixels, pixels);
            // 水平翻转
            pixels = vrev64q_u8(pixels);
            // 存储结果
            vst1q_u8(dst + (width - x - 16) * height + y, pixels);
        }
    }
}