//PC
1. #include "stdio.h"
2. #include <xmmintrin.h> // Need this for SSE compiler intrinsics
3. #include <math.h> // Needed for sqrt in CPU-only version
4. #include <time.h>
5.
6. int main(int argc, char* argv[])
7. {
8. printf("Starting calculation...\n");
9. const int length = 64000;
10.
11. // We will be calculating Y = SQRT(x) / x, for x = 1->64000
12. // If you do not properly align your data for SSE instructions, you may take a huge performance hit.
13. float *pResult = (float*) _aligned_malloc(length * sizeof(float), 16); // align to 16-byte for SSE
14. __m128 x;
15. __m128 xDelta = _mm_set1_ps(4.0f); // Set the xDelta to (4,4,4,4)
16. __m128 *pResultSSE = (__m128*) pResult;
17.
18. const int SSELength = length / 4;
19. clock_t clock1=clock();
20. #define TIME_SSE // Define this if you want to run with SSE
21. #ifdef TIME_SSE
22. // Repeat the whole calculation 1000 times so the elapsed time is large enough to measure with clock()
23. for (int stress = 0; stress < 1000; stress++)
24. {
25. // Set the initial values of x to (4,3,2,1); _mm_set_ps takes its arguments from the highest element to the lowest, so element 0 holds 1.0
26. x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
27. for (int i=0; i < SSELength; i++)
28. {
29. __m128 xSqrt = _mm_sqrt_ps(x);
30. // Note: division is slow. It is faster to take the approximate reciprocal of x (_mm_rcp_ps) and multiply.
31. // However, division is more accurate than taking the reciprocal and multiplying.
32.
33. #define USE_DIVISION_METHOD
34. #ifdef USE_FAST_METHOD
35. __m128 xRecip = _mm_rcp_ps(x);
36. pResultSSE[i] = _mm_mul_ps(xRecip, xSqrt);
37. #endif //USE_FAST_METHOD
38. #ifdef USE_DIVISION_METHOD
39. pResultSSE[i] = _mm_div_ps(xSqrt, x);
40. #endif // USE_DIVISION_METHOD
41. // Advance x to the next set of numbers
42. x = _mm_add_ps(x, xDelta);
43. }
44. }
45. clock_t clock2=clock();
46. printf("SIMDtime:%d ms\n",1000*(clock2-clock1)/CLOCKS_PER_SEC);
47. #endif // TIME_SSE
48.
49. #define TIME_NoSSE
50. #ifdef TIME_NoSSE
51. clock_t clock3=clock();
52. // Repeat the whole calculation 1000 times so the elapsed time is large enough to measure with clock()
53. for (int stress = 0; stress < 1000; stress++)
54. {
55. clock_t clock3=clock();
56. float xFloat = 1.0f;
57. for (int i=0 ; i < length; i++)
58.