一个用SIMD实现的向量类_simd 向量算法库-CSDN博客

本文介绍了一个名为C_VECTOR4D的四维向量类，该类使用SIMD技术进行高效运算。文中详细展示了类的构造函数、成员变量、运算符重载方法以及一些关键的数学操作如长度计算和点乘。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

// C_VECTOR4D: four-component float vector (x, y, z, w) whose storage is
// shared with an SSE __m128 value, so add, subtract, dot product and length
// can be computed four lanes at a time.
//
// Build configuration (both macros are defined outside this class):
//   SIMD_ASM == 1 -> the MSVC x86 inline-assembly paths are compiled; they
//                    additionally rely on the external SIMD_SHUFFLE macro.
//   otherwise     -> the SSE-intrinsics paths are compiled, so every
//                    operation yields a correct result in any configuration.
//
// Conventions visible in the code: the default/zero vector is (0,0,0,1),
// operator+ and operator- always force w of the result to 1, and the dot
// product and length only use the x, y and z lanes.
class C_VECTOR4D
{
public:

    union
    {
        __m128 v;   // SIMD data type access

        float M[4]; // array indexed storage

        // explicit names
        struct
        {
            float x, y, z, w;
        }; // end struct

    }; // end union

    // note: no explicit alignment specifier is needed on the union or on
    // locals of this class; the __m128 member already forces the compiler
    // to place every C_VECTOR4D on a 16-byte boundary, which is what the
    // movaps instructions below require

    // CONSTRUCTORS //////////////////////////////////////////////////////////////

    // void constructor: initializes the vector to (0, 0, 0, 1)
    C_VECTOR4D()
    {
        zero();
    } // end C_VECTOR4D

    //////////////////////////////////////////////////////////////////////////////

    // initializes the vector to the sent values; w defaults to 1
    C_VECTOR4D(float _x, float _y, float _z, float _w = 1.0f)
    {
        init(_x, _y, _z, _w);
    } // end C_VECTOR4D

    // FUNCTIONS ////////////////////////////////////////////////////////////////

    // overwrites all four components with the sent values; w defaults to 1
    void init(float _x, float _y, float _z, float _w = 1.0f)
    {
        x = _x;
        y = _y;
        z = _z;
        w = _w;
    } // end init

    //////////////////////////////////////////////////////////////////////////////

    // resets the vector to (0, 0, 0, 1)
    void zero(void)
    {
        x = y = z = 0.0f;
        w = 1.0f;
    } // end zero

    //////////////////////////////////////////////////////////////////////////////

    // returns the length of the (x, y, z) part of the vector: sqrt(v * v)
    //
    // The dot product comes from operator*, so the inline-asm and the
    // intrinsic builds share one horizontal-add implementation. w never
    // contributes because operator* only sums the x, y and z lanes.
    float length(void) const
    {
        float len;

        // scalar SSE square root of the low lane (the sqrtss instruction)
        _mm_store_ss(&len, _mm_sqrt_ss(_mm_set_ss((*this) * (*this))));

        return(len);
    } // end length

    // OVERLOADED OPERATORS //////////////////////////////////////////////////////

    // returns a writable reference to the index-th component (0=x .. 3=w);
    // index is not range checked
    float& operator[](int index)
    {
        return(M[index]);
    } // end operator[]

    //////////////////////////////////////////////////////////////////////////////

    // read-only variant of operator[] for const vectors; index is not
    // range checked
    const float& operator[](int index) const
    {
        return(M[index]);
    } // end operator[] const

    //////////////////////////////////////////////////////////////////////////////

    // adds the "this" vector and the sent vector; w of the result is set to 1
    C_VECTOR4D operator+(const C_VECTOR4D &v) const
    {
        C_VECTOR4D vr; // used to hold result (16-byte aligned via __m128)

#if (SIMD_ASM==1)

        // inline asm version of SIMD add
        _asm
        {
            mov esi, this       // "this" points to the left operand
            mov edi, v          // v points to the right operand
            movaps xmm0, [esi]  // xmm0 = left operand
            addps xmm0, [edi]   // xmm0 += right operand, all four lanes
            movaps vr, xmm0     // move result into output vector
        } // end asm

#else

        // intrinsic library version
        vr.v = _mm_add_ps(this->v, v.v);

#endif

        // always set w=1
        vr.w = 1.0f;

        return(vr);
    } // end operator+

    //////////////////////////////////////////////////////////////////////////////

    // subtracts the sent vector from the "this" vector; w of the result is
    // set to 1
    C_VECTOR4D operator-(const C_VECTOR4D &v) const
    {
        C_VECTOR4D vr; // used to hold result (16-byte aligned via __m128)

#if (SIMD_ASM==1)

        // inline asm version of SIMD subtract
        _asm
        {
            mov esi, this       // "this" points to the left operand
            mov edi, v          // v points to the right operand
            movaps xmm0, [esi]  // xmm0 = left operand
            subps xmm0, [edi]   // xmm0 -= right operand, all four lanes
            movaps vr, xmm0     // move result into output vector
        } // end asm

#else

        // intrinsic library version
        vr.v = _mm_sub_ps(this->v, v.v);

#endif

        // always set w=1
        vr.w = 1.0f;

        return(vr);
    } // end operator-

    //////////////////////////////////////////////////////////////////////////////

    // dot product: * is used since the dot product is the more common
    // operation. Returns x1*x2 + y1*y2 + z1*z2; the w lane is ignored.
    float operator*(const C_VECTOR4D &v) const
    {
        C_VECTOR4D vr; // used to hold result (16-byte aligned via __m128)

#if (SIMD_ASM==1)

        // inline asm version of SIMD dot product
        //
        // Lane comments list registers in low-to-high order and write
        // x, y, z, w for the four per-lane products. They follow the
        // shuffle results documented with this routine and assume the
        // externally defined SIMD_SHUFFLE macro encodes exactly those
        // selections.
        _asm
        {
            mov esi, this       // "this" points to the left operand
            mov edi, v          // v points to the right operand
            movaps xmm0, [esi]  // move left operand into xmm0
            mulps xmm0, [edi]   // multiply operands vertically

            // xmm0: = [x,y,z,w]
            // xmm1: = [?,?,?,?]

            movaps xmm1, xmm0   // copy products into xmm1

            // xmm0: = [x,y,z,w]
            // xmm1: = [x,y,z,w]

            shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02)

            // xmm0: = [x,y,z,w]
            // xmm1: = [z,w,x,y]

            addps xmm1, xmm0

            // xmm0: = [x  ,y  ,z  ,w  ]
            // xmm1: = [x+z,y+w,x+z,y+w]

            shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)

            // xmm0: = [y  ,x  ,y+w,x+z]
            // xmm1: = [x+z,y+w,x+z,y+w]

            // finally we can add!
            addps xmm0, xmm1

            // xmm0: = [x+y+z,x+y+w,x+y+z+w,x+y+z+w]
            // the low lane holds the dot product; it never includes w, so
            // the result is the same whether w is 0 or 1

            movaps vr, xmm0     // save all lanes; only vr.x is returned
        } // end asm

        return(vr.x);

#else

        // intrinsic library version: multiply vertically, then sum x, y, z
        vr.v = _mm_mul_ps(this->v, v.v);

        return(vr.x + vr.y + vr.z);

#endif
    } // end operator*

    //////////////////////////////////////////////////////////////////////////////

    // prints the four components to stdout (no trailing newline)
    void print(void)
    {
        printf(" v = [%f, %f, %f, %f]", this->x, this->y, this->z, this->w);
    } // end print

    //////////////////////////////////////////////////////////////////////////////

}; // end class C_VECTOR4D