C# SIMD向量加速运算简单例子

文章介绍了如何在C#中利用SIMD(Single Instruction, Multiple Data,单指令多数据)技术,如AVX,对数组元素进行高效并行的加法和乘法运算,以及计算标准偏差。通过Vector256等向量类型,实现对大量数据的快速处理,显著提高图像运算速度,尤其是在无GPU环境下的性能提升。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

 /// <summary>
 /// SIMD-accelerated reductions (product and sum) over the leading elements of
 /// an array, built on <see cref="Vector{T}"/>.
 /// </summary>
 /// <typeparam name="T">Numeric element type of the arrays being reduced.</typeparam>
 public unsafe struct VectorOperation<T> where T : struct, INumber<T>
    {
        // Optional array exposed through Data; Multiply and Sum never read it,
        // they only operate on the array passed as an argument.
        private T[]? _data = null;

        /// <summary>
        /// Creates an instance with no backing array.
        /// </summary>
        public VectorOperation()
        {

        }

        /// <summary>
        /// Creates an instance that stores <paramref name="values"/> as its backing array.
        /// </summary>
        /// <param name="values">Array to store; it is referenced, not copied.</param>
        public VectorOperation(T[] values)
        {
            _data = values;
        }

        /// <summary>
        /// Gets or sets the stored array. The getter returns null when no array
        /// has been assigned.
        /// </summary>
        public T[] Data
        {
            get => _data;
            set => _data = value;
        }

        /// <summary>
        /// Computes the product of the first <paramref name="count"/> elements of
        /// <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source elements.</param>
        /// <param name="count">Number of leading elements to multiply.</param>
        /// <returns>The product; <c>T.One</c> when no element is processed.</returns>
        public T Multiply(T[] array, int count)
        {
            int vectorSize = Vector<T>.Count;
            // Each lane keeps the running product of the elements that land in it.
            Vector<T> accVector = Vector<T>.One;
            int i;
            T result = T.One;

            // Whole-vector chunks: stop as soon as fewer than vectorSize elements remain.
            for (i = 0; i <= count - vectorSize; i += vectorSize)
            {
                Vector<T> v = new Vector<T>(array, i);
                accVector = Vector.Multiply(accVector, v);
            }

            // Horizontal reduction: fold the per-lane products into a single scalar.
            for (int j = 0; j < vectorSize; j++)
            {
                result *= accVector[j];
            }

            // Scalar tail: elements that did not fill a whole vector.
            for (; i < count; i++)
            {
                result *= array[i];
            }

            return result;
        }

        /// <summary>
        /// Computes the sum of the first <paramref name="count"/> elements of
        /// <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source elements.</param>
        /// <param name="count">Number of leading elements to add.</param>
        /// <returns>The sum; <c>T.Zero</c> when no element is processed.</returns>
        public T Sum(T[] array, int count)
        {
            int vectorSize = Vector<T>.Count;
            T result = T.Zero;
            int i;

            // Whole-vector chunks: each chunk is reduced with Vector.Sum and added
            // to the scalar total.
            for (i = 0; i <= count - vectorSize; i += vectorSize)
            {
                Vector<T> v = new Vector<T>(array, i);
                result += Vector.Sum(v);
            }

            // Scalar tail. The bound is count, not array.Length, so elements
            // beyond the requested range are never added (same bound as Multiply).
            for (; i < count; i++)
            {
                result += array[i];
            }

            return result;
        }
    }

上面就简单做了元素累加与累乘,速度也高。

// Adds up the first n ints of buffer into the 64-bit total sum4 using 256-bit loads.
// buffer and n come from outside this snippet; n is presumably the number of
// elements to sum (n <= buffer.Length) - confirm against the caller.
int vectorSize = Vector256<int>.Count;
long sum4 = 0;
// NOTE(review): the lanes are 32-bit ints, so a per-lane partial sum can wrap
// around before it is widened into the 64-bit total below.
Vector256<int> sumV = Vector256<int>.Zero;
int j;
// Largest start index at which a whole vector still fits inside the first n elements.
int m = n - vectorSize;
unsafe
{
    fixed (int* p = buffer)
    {
        // Whole-vector chunks.
        for (j = 0; j <= m; j += vectorSize)
        {
            sumV += Avx2.LoadVector256(p + j);
        }

        // Horizontal reduction: read the lanes of the accumulator as plain ints.
        int* ptr = (int*)&sumV;
        for (int i = 0; i < vectorSize; i++)
        {
            sum4 += *(ptr + i);
        }

        // Scalar tail. Bounded by n like the vector loop, so elements past the
        // requested range are not added when n is smaller than buffer.Length.
        for (; j < n; j++)
        {
            sum4 += *(p + j);
        }
    }
}

SIMD 是一种在现代 CPU 中广泛使用的并行计算技术。在 SIMD 中,单一的指令可以同时对多个数据进行操作。例如,如果你有两个包含四个元素的数组,你可以使用一条 SIMD 指令来同时对这两个数组的所有元素进行加法运算,而不是分别对每对元素进行加法运算。

int vectorSize = Vector<T>.Count; 表示一个向量中可以容纳的 T 类型元素的个数,也就是一次向量运算能同时处理的元素数量。

然后在 for 循环中每次取出 vectorSize 个元素组成一个向量,一次性完成运算,比如加法或者乘法;

向量化循环之后的 for (; i < …; i++) 循环负责处理剩余元素。比如 int 是 32 位,256 位的向量一次可以处理 8 个元素;如果一共有 17 个元素,向量运算执行 2 次后还剩 1 个元素,这个剩余元素需要另外用普通方式进行运算。

在此基础上可以加速图像运算,即使在没有GPU的电脑上运行,速度也非常快。

例子:

求和

 /// <summary>
 /// Computes the sum of all elements of <paramref name="data"/>, using 256-bit
 /// AVX additions for whole vectors and a scalar loop for the remainder.
 /// </summary>
 /// <param name="data">Values to add up.</param>
 /// <returns>The sum of the elements; 0 for an empty array.</returns>
 public static double Sum(double[] data)
        {
            int length = data.Length;
            double total = 0d;
            int index = 0;

            // Without AVX the vector part is skipped and the scalar loop below
            // handles every element.
            if (Avx.IsSupported)
            {
                int vectorSize = Vector256<double>.Count;
                // Last start index at which a full vector still lies inside the
                // array. Loading beyond it would read memory past the last element.
                int lastVectorStart = length - vectorSize;
                Vector256<double> sumVector = Vector256<double>.Zero;

                fixed (double* num = data)
                {
                    for (; index <= lastVectorStart; index += vectorSize)
                    {
                        sumVector = Avx.Add(sumVector, Avx.LoadVector256(num + index));
                    }
                }

                // Fold the per-lane partial sums into one scalar.
                total = Sum(sumVector);
            }

            // Scalar tail: the elements that do not fill a whole vector.
            for (; index < length; index++)
            {
                total += data[index];
            }

            return total;
        }
计算标准偏差
/// <summary>
/// Computes the sample standard deviation (divisor <c>length - 1</c>) of
/// <paramref name="data"/>, using 256-bit AVX arithmetic for whole vectors and
/// scalar arithmetic for the remaining elements.
/// </summary>
/// <param name="data">Values to analyse.</param>
/// <returns>
/// The sample standard deviation. With fewer than two elements the division
/// by <c>length - 1</c> is not meaningful and the result is NaN.
/// </returns>
public static double StDevForVector(double[] data)
        {
            int length = data.Length;
            int vectorSize = Vector256<double>.Count;
            // Number of leading elements covered by whole vectors. Only these are
            // loaded with AVX, so no load ever reaches past the end of the array.
            int vectorizedLength = Avx.IsSupported ? length - (length % vectorSize) : 0;

            fixed (double* num = data)
            {
                // Pass 1: the mean.
                double sum = 0d;
                if (vectorizedLength > 0)
                {
                    Vector256<double> sumVector = Vector256<double>.Zero;
                    for (int i = 0; i < vectorizedLength; i += vectorSize)
                    {
                        sumVector = Avx.Add(sumVector, Avx.LoadVector256(num + i));
                    }
                    sum = Sum(sumVector);
                }
                for (int i = vectorizedLength; i < length; i++)
                {
                    sum += num[i];
                }
                double mean = sum / length;

                // Pass 2: the sum of squared deviations from the mean.
                double squaredDiffSum = 0d;
                if (vectorizedLength > 0)
                {
                    Vector256<double> meanVector = Vector256.Create(mean);
                    Vector256<double> squaredDiffSumVector = Vector256<double>.Zero;
                    for (int i = 0; i < vectorizedLength; i += vectorSize)
                    {
                        Vector256<double> diffVector = Avx.Subtract(Avx.LoadVector256(num + i), meanVector);
                        squaredDiffSumVector = Avx.Add(squaredDiffSumVector, Avx.Multiply(diffVector, diffVector));
                    }
                    squaredDiffSum = Sum(squaredDiffSumVector);
                }
                // The tail is handled element by element, so no correction for
                // padded lanes is needed.
                for (int i = vectorizedLength; i < length; i++)
                {
                    double diff = num[i] - mean;
                    squaredDiffSum += diff * diff;
                }

                return Math.Sqrt(squaredDiffSum / (length - 1));
            }
        }

其中sum方法:

/// <summary>
/// Adds up the lanes of <paramref name="vector"/> in index order.
/// </summary>
/// <param name="vector">Vector whose elements are summed.</param>
/// <returns>The sum of all lanes.</returns>
private static double Sum(Vector256<double> vector)
        {
            // Read the lanes directly instead of copying them into a freshly
            // allocated, pinned array on every call.
            double sum = 0d;
            for (int i = 0; i < Vector256<double>.Count; i++)
            {
                sum += vector.GetElement(i);
            }
            return sum;
        }

注:

根据自己cpu优化选择Vector64、Vector256、Vector128、Vector512,我这里只用Vector256做示例

测试

可以看到模拟一张1920*1080的图片数据,速度大概是普通方法的6倍,指针方法的4倍

下面是指针方法与普通方法

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值