class C_VECTOR4D ...{ public: union ...{ __declspec(align(16)) __m128 v; // SIMD data type access float M[4]; // array indexed storage // explicit names struct ...{ float x,y,z,w; }; // end struct }; // end union // note: the declspec is redundant since in the type __m128 forces // the compiler to align in 16-byte boundaries, so as long as __m128 is // part of the union declspec is NOT needed :) But, it can't // hurt and when you are defining locals and globals, always put // declspec(align(16)) to KNOW data is on 16-byte boundaries // CONSTRUCTORS ////////////////////////////////////////////////////////////// C_VECTOR4D() ...{ // void constructor // initialize vector to 0.0.0.1 x=y=z=0; w=1.0; }// end C_VECTOR4D /**/////////////////////////////////////////////////////////////////////////////// C_VECTOR4D(float _x, float _y, float _z, float _w =1.0) ...{ // initialize vector to sent values x = _x; y = _y; z = _z; w = _w; }// end C_VECTOR4D // FUNCTIONS //////////////////////////////////////////////////////////////// void init(float _x, float _y, float _z, float _w =1.0) ...{ // initialize vector to sent values x = _x; y = _y; z = _z; w = _w; }// end init /**/////////////////////////////////////////////////////////////////////////////// void zero(void) ...{ // initialize vector to 0.0.0.1 x=y=z=0; w=1.0; }// end zero /**/////////////////////////////////////////////////////////////////////////////// float length(void) ...{ // computes the length of the vector C_VECTOR4D vr =*this; // set w=0 vr.w =0; //float a= SIMD_SHUFFLE(0x02,0x03,0x00,0x01) ; // compile pure asm version? #if (SIMD_ASM==1) // begin inline asm version of SIMD dot product since we need its // results for the length since length = sqrt(v*v) _asm ...{ // first we need dot product of this*this movaps xmm0, vr.v // move left operand into xmm0 mulps xmm0, xmm0 // multiply operands vertically // at this point, xmm0 = // [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ] // or more simply: let xmm0 = [x,y,z,1] = // [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ] // we need to sum the x,y,z components into a single scalar // to compute the final dot product of: // dp = x + y + z == x1*x2 + y1*y2 + z1*z2 // begin // xmm0: = [x,y,z,1] (note: all regs in low to hight order) // xmm1: = [?,?,?,?] movaps xmm1, xmm0 // copy result into xmm1 // xmm0: = [x,y,z,1] // xmm1: = [x,y,z,1] shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02) //y,x,1,z // xmm0: = [x,y,z,1] // xmm1: = [z,1,x,y] addps xmm1, xmm0 // xmm0: = [x ,y ,z ,1] // xmm1: = [x+z,y+1,x+z,y+1] shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01) //x+z,y+1,x+z,y+1//因为其实w是为0的,所以y+1=y // xmm0: = [y ,x ,y+1,x+z] // xmm1: = [x+z,y+1,x+z,y+1] // finally we can add! addps xmm0, xmm1 // xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1] // xmm1: = [x+z ,y+1 ,x+z ,y+1] // xmm0.x contains the dot product // xmm0.z, xmm0.w contains the dot+1 // now low double word contains dot product, let's take squaroot sqrtss xmm0, xmm0 movaps vr, xmm0 // save results }// end asm #endif// end use inline asm version // compile intrinsic version? #if (SIMD_INTRISIC==1) #endif// end use intrinsic library version // return result return(vr.x); }// end length // OVERLOADED OPERATORS ////////////////////////////////////////////////////// float&operator[](int index) ...{ // return the ith element from the array return(M[index]); }// end operator[] /**/////////////////////////////////////////////////////////////////////////////// C_VECTOR4D operator+(C_VECTOR4D &v) ...{ // adds the "this" vector and the sent vector __declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes // compile pure asm version? #if (SIMD_ASM==1) // begin inline asm version of SIMD add _asm ...{ mov esi, this// "this" contains a point to the left operand mov edi, v // v points to the right operand movaps xmm0, [esi] // esi points to first vector, move into xmm0 addps xmm0, [edi] // edi points to second vector, add it to xmm0 movaps vr, xmm0 // move result into output vector }// end asm #endif// end use inline asm version // compile intrinsic version? #if (SIMD_INTRISIC==1) vr.v = _mm_add_ps(this->v, v.v); #endif// end use intrinsic library version // always set w=1 vr.w =1.0; // return result return(vr); }// end operator+ /**/////////////////////////////////////////////////////////////////////////////// C_VECTOR4D operator-(C_VECTOR4D &v) ...{ // subtracts the "this" vector and the sent vector __declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes // compile pure asm version? #if (SIMD_ASM==1) // begin inline asm version of SIMD add _asm ...{ mov esi, this// "this" contains a point to the left operand mov edi, v // v points to the right operand movaps xmm0, [esi] // esi points to first vector, move into xmm0 subps xmm0, [edi] // edi points to second vector, subtract it from xmm0 movaps vr, xmm0 // move result into output vector }// end asm #endif// end use inline asm version // compile intrinsic version? #if (SIMD_INTRISIC==1) vr.v = _mm_sub_ps(this->v, v.v); #endif// end use intrinsic library version // always set w=1 vr.w =1.0; // return result return(vr); }// end operator- /**/////////////////////////////////////////////////////////////////////////////// floatoperator*(C_VECTOR4D &v) ...{ // the dot product will be * since dot product is a more common operation // computes the dot between between the "this" vector and the sent vector __declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes // compile pure asm version? #if (SIMD_ASM==1) // begin inline asm version of SIMD dot product _asm ...{ mov esi, this// "this" contains a point to the left operand mov edi, v // v points to the right operand movaps xmm0, [esi] // move left operand into xmm0 mulps xmm0, [edi] // multiply operands vertically // at this point, xmm0 = // [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ] // or more simply: let xmm0 = [x,y,z,1] = // [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ] // we need to sum the x,y,z components into a single scalar // to compute the final dot product of: // dp = x + y + z where x = x1*x2, y = y1*y2, z = z1*z2 // begin // xmm0: = [x,y,z,1] (note: all regs in low to hight order) // xmm1: = [?,?,?,?] movaps xmm1, xmm0 // copy result into xmm1 // xmm0: = [x,y,z,1] // xmm1: = [x,y,z,1] shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02) // xmm0: = [x,y,z,1] // xmm1: = [z,1,x,y] addps xmm1, xmm0 // xmm0: = [x ,y ,z ,1] // xmm1: = [x+z,y+1,x+z,y+1] shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01) // xmm0: = [y ,x ,y+1,x+z] // xmm1: = [x+z,y+1,x+z,y+1] // finally we can add! addps xmm0, xmm1 // xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1] // xmm1: = [x+z ,y+1 ,x+z ,y+1] // xmm0.x contains the dot product // xmm0.z, xmm0.w contains the dot+1 movaps vr, xmm0 }// end asm #endif// end use inline asm version // compile intrinsic version? #if (SIMD_INTRISIC==1) vr.v = _mm_mul_ps(this->v, v.v); return(vr.x + vr.y + vr.z); #endif// end use intrinsic library version // return result return(vr.x); }// end operator* /**/////////////////////////////////////////////////////////////////////////////// void print(void) ...{ // this member function prints out the vector printf(" v = [%f, %f, %f, %f]", this->x, this->y, this->z, this->w); }// end print /**/////////////////////////////////////////////////////////////////////////////// }; // end class C_VECTOR4D