/// <summary>
/// SIMD-accelerated reductions (product and sum) over the leading elements of an array,
/// implemented with <see cref="Vector{T}"/>.
/// </summary>
/// <typeparam name="T">Numeric element type stored in the arrays being reduced.</typeparam>
public struct VectorOperation<T> where T : struct, INumber<T>
{
    // Optional backing array; stays null until assigned via the constructor or Data.
    private T[]? _data = null;

    /// <summary>Creates an instance with no backing array.</summary>
    public VectorOperation()
    {
    }

    /// <summary>Creates an instance that stores <paramref name="values"/> as its backing array.</summary>
    /// <param name="values">Array to store (the reference is kept, not copied).</param>
    public VectorOperation(T[] values)
    {
        _data = values;
    }

    /// <summary>
    /// Gets or sets the stored array. The value is <c>null</c> until one has been assigned.
    /// </summary>
    public T[]? Data
    {
        get => _data;
        set => _data = value;
    }

    /// <summary>
    /// Computes the product of the first <paramref name="count"/> elements of <paramref name="array"/>.
    /// </summary>
    /// <param name="array">Source elements.</param>
    /// <param name="count">Number of leading elements to multiply; must be in <c>[0, array.Length]</c>.</param>
    /// <returns>The product of the selected elements, or <c>T.One</c> when <paramref name="count"/> is 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or exceeds the array length.</exception>
    public T Multiply(T[] array, int count)
    {
        ArgumentNullException.ThrowIfNull(array);
        if (count < 0 || count > array.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count, "count must be between 0 and array.Length.");
        }

        int vectorSize = Vector<T>.Count;
        Vector<T> accVector = Vector<T>.One;
        int i;

        // Main loop: multiply lane-wise, one full vector of elements per iteration.
        for (i = 0; i <= count - vectorSize; i += vectorSize)
        {
            accVector *= new Vector<T>(array, i);
        }

        // Collapse the per-lane partial products into a single scalar.
        T result = T.One;
        for (int lane = 0; lane < vectorSize; lane++)
        {
            result *= accVector[lane];
        }

        // Tail: the elements that did not fill a whole vector.
        for (; i < count; i++)
        {
            result *= array[i];
        }

        return result;
    }

    /// <summary>
    /// Computes the sum of the first <paramref name="count"/> elements of <paramref name="array"/>.
    /// </summary>
    /// <param name="array">Source elements.</param>
    /// <param name="count">Number of leading elements to add; must be in <c>[0, array.Length]</c>.</param>
    /// <returns>The sum of the selected elements, or <c>T.Zero</c> when <paramref name="count"/> is 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or exceeds the array length.</exception>
    public T Sum(T[] array, int count)
    {
        ArgumentNullException.ThrowIfNull(array);
        if (count < 0 || count > array.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count, "count must be between 0 and array.Length.");
        }

        int vectorSize = Vector<T>.Count;
        Vector<T> accVector = Vector<T>.Zero;
        int i;

        // Main loop: accumulate lane-wise and defer the horizontal add until the end,
        // mirroring how Multiply accumulates. For floating-point T this changes the
        // order of additions, so the last bits of the result may differ from a
        // strictly sequential sum.
        for (i = 0; i <= count - vectorSize; i += vectorSize)
        {
            accVector += new Vector<T>(array, i);
        }

        T result = Vector.Sum(accVector);

        // Tail: stop at count (not array.Length) so elements the caller excluded
        // are never added.
        for (; i < count; i++)
        {
            result += array[i];
        }

        return result;
    }
}
上面简单实现了元素的累加与累乘,速度也很快。
// Sums the first n elements of buffer into sum4 using 256-bit AVX2 integer adds.
// NOTE(review): the vector accumulator holds 32-bit lanes, so a lane can wrap on
// large inputs even though sum4 is a long — confirm the value range fits.
int vectorSize = Vector256<int>.Count;
long sum4 = 0;
Vector256<int> sumV = Vector256<int>.Zero;
int j;
int m = n - vectorSize;
unsafe
{
    fixed (int* p = buffer)
    {
        // Main loop: one full vector of ints per iteration.
        for (j = 0; j <= m; j += vectorSize)
        {
            sumV += Avx2.LoadVector256(p + j);
        }

        // Fold the per-lane partial sums into the 64-bit total.
        for (int i = 0; i < vectorSize; i++)
        {
            sum4 += sumV.GetElement(i);
        }

        // Tail: bounded by n, the same limit the vector loop uses, so elements
        // beyond n are not added.
        for (; j < n; j++)
        {
            sum4 += *(p + j);
        }
    }
}
SIMD 是一种在现代 CPU 中广泛使用的并行计算技术。在 SIMD 中,单一的指令可以同时对多个数据进行操作。例如,如果你有两个包含四个元素的数组,你可以使用一条 SIMD 指令来同时对这两个数组的所有元素进行加法运算,而不是分别对每对元素进行加法运算。
int vectorSize = Vector<T>.Count; 表示一个向量中可以容纳的对应值类型元素的数量。
然后for循环每次对vectorSize个元素一次性进行操作,比如加或者乘;
for (; i < array.Length; i++) 后面这个循环是对剩余元素进行操作。比如int是32位,256位的向量一次可以操作8个元素;如果一共有17个元素,向量操作2次后还剩余1个元素,剩余的元素需要另外进行运算。
利用这种方式可以加速图像运算,即使在没有GPU的电脑上运行也非常快。
例子:
求和
/// <summary>
/// Sums all elements of <paramref name="data"/> using 256-bit AVX loads and adds.
/// </summary>
/// <param name="data">Values to add.</param>
/// <returns>The sum of every element; 0 for an empty array.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
public static double Sum(double[] data)
{
    ArgumentNullException.ThrowIfNull(data);

    int length = data.Length;
    int vectorSize = Vector256<double>.Count;
    // Only whole vectors are loaded; rounding the vector count up would make the
    // final load read past the end of the array.
    int vectorizedLength = length - length % vectorSize;

    fixed (double* num = data)
    {
        Vector256<double> sumVector = Vector256<double>.Zero;
        for (int i = 0; i < vectorizedLength; i += vectorSize)
        {
            sumVector = Avx.Add(sumVector, Avx.LoadVector256(num + i));
        }

        double sum = Sum(sumVector);

        // Tail: the elements that did not fill a whole vector.
        for (int i = vectorizedLength; i < length; i++)
        {
            sum += *(num + i);
        }

        return sum;
    }
}
计算标准偏差
/// <summary>
/// Computes the sample standard deviation (divisor <c>n - 1</c>) of <paramref name="data"/>
/// using 256-bit AVX arithmetic.
/// </summary>
/// <param name="data">Sample values.</param>
/// <returns>
/// The sample standard deviation, or <see cref="double.NaN"/> when fewer than two
/// values are supplied (the <c>n - 1</c> divisor is not meaningful there).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
public static double StDevForVector(double[] data)
{
    ArgumentNullException.ThrowIfNull(data);

    int length = data.Length;
    if (length < 2)
    {
        return double.NaN;
    }

    int vectorSize = Vector256<double>.Count;
    // Only whole vectors are loaded. Rounding the vector count up would read past
    // the end of the array, and the over-read lanes cannot be assumed to be zero.
    int vectorizedLength = length - length % vectorSize;

    fixed (double* num = data)
    {
        // Pass 1: mean of all elements.
        Vector256<double> sumVector = Vector256<double>.Zero;
        for (int i = 0; i < vectorizedLength; i += vectorSize)
        {
            sumVector = Avx.Add(sumVector, Avx.LoadVector256(num + i));
        }

        double sum = Sum(sumVector);
        for (int i = vectorizedLength; i < length; i++)
        {
            sum += *(num + i);
        }

        double mean = sum / length;

        // Pass 2: sum of squared deviations from the mean.
        Vector256<double> meanVector = Vector256.Create(mean);
        Vector256<double> squaredDiffSumVector = Vector256<double>.Zero;
        for (int i = 0; i < vectorizedLength; i += vectorSize)
        {
            Vector256<double> diffVector = Avx.Subtract(Avx.LoadVector256(num + i), meanVector);
            squaredDiffSumVector = Avx.Add(squaredDiffSumVector, Avx.Multiply(diffVector, diffVector));
        }

        double squaredDiffSum = Sum(squaredDiffSumVector);
        for (int i = vectorizedLength; i < length; i++)
        {
            double diff = *(num + i) - mean;
            squaredDiffSum += diff * diff;
        }

        return Math.Sqrt(squaredDiffSum / (length - 1));
    }
}
其中Sum方法:
/// <summary>
/// Adds together all lanes of <paramref name="vector"/> (a horizontal sum).
/// </summary>
/// <param name="vector">Vector whose elements are added.</param>
/// <returns>The sum of the vector's elements, added in lane order starting from 0.</returns>
private static double Sum(Vector256<double> vector)
{
    // Read the lanes directly instead of copying them to a temporary heap array
    // and pinning it; the lanes are added in the same order as before.
    double sum = 0.0;
    for (int i = 0; i < Vector256<double>.Count; i++)
    {
        sum += vector.GetElement(i);
    }

    return sum;
}
注:
根据自己cpu优化选择Vector64、Vector256、Vector128、Vector512,我这里只用Vector256做示例
测试
可以看到模拟一张1920*1080的图片数据,速度大概是普通方法的6倍,指针方法的4倍
下面是指针方法与普通方法