高性能计算导论HPC实验三 CPU SIMD+GPU SIMD

文章介绍了CPU和GPU的SIMD技术,包括使用SSE和NEON进行向量计算的示例,以及CUDA的安装、deviceQuery功能和在CUDA中进行向量和PI计算的实践。作者强调了实践在学习SIMD和GPU加速计算中的重要性。

摘要生成于 C知道,由 DeepSeek-R1 满血版支持,前往体验 >

一、CPU SIMD

//PC
1.	#include "stdio.h"  
2.	#include <xmmintrin.h>    // Need this for SSE compiler intrinsics  
3.	#include <math.h>  // Needed for sqrt in CPU-only version  
4.	#include <time.h>  
5.	       
6.	int main(int argc, char* argv[])  
7.	{  
8.	     printf("Starting calculation...\n");  
9.	     const int length = 64000;  
10.	          
11.	     // We will be calculating Y = SQRT(x) / x, for x = 1->64000  
12.	// If you do not properly align your data for SSE instructions, you may take a huge performance hit.  
13.	float *pResult = (float*) _aligned_malloc(length * sizeof(float), 16);  // align to 16-byte for SSE  
14.	     __m128 x;  
15.	     __m128 xDelta = _mm_set1_ps(4.0f);     // Set the xDelta to (4,4,4,4)  
16.	     __m128 *pResultSSE = (__m128*) pResult;  
17.	       
18.	     const int SSELength = length / 4;  
19.	     clock_t clock1=clock();  
20.	     #define TIME_SSE   // Define this if you want to run with SSE  
21.	     #ifdef TIME_SSE  
22.	     // lots of stress loops so we can easily use a stopwatch  
23.	     for (int stress = 0; stress < 1000; stress++)     
24.	     {  
25.	            // Set the initial values of x to (4,3,2,1)  
26.	     x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);      
27.	        for (int i=0; i < SSELength; i++)  
28.	        {  
29.	            __m128 xSqrt = _mm_sqrt_ps(x);  
30.	// Note! Division is slow. It's actually faster to take the reciprocal of a number and multiply  
31.	// Also note that Division is more accurate than taking the reciprocal and multiplying  
32.	       
33.	            #define USE_DIVISION_METHOD  
34.	            #ifdef USE_FAST_METHOD  
35.	                __m128 xRecip = _mm_rcp_ps(x);  
36.	                pResultSSE[i] = _mm_mul_ps(xRecip, xSqrt);  
37.	            #endif //USE_FAST_METHOD  
38.	            #ifdef USE_DIVISION_METHOD  
39.	                pResultSSE[i] = _mm_div_ps(xSqrt, x);  
40.	            #endif  // USE_DIVISION_METHOD  
41.	            // Advance x to the next set of numbers  
42.	            x = _mm_add_ps(x, xDelta);    
43.	     }  
44.	     }  
45.	     clock_t clock2=clock();  
46.	     printf("SIMDtime:%d ms\n",1000*(clock2-clock1)/CLOCKS_PER_SEC);  
47.	     #endif // TIME_SSE  
48.	       
49.	     #define TIME_NoSSE  
50.	     #ifdef TIME_NoSSE  
51.	     clock_t clock3=clock();      
52.	     // lots of stress loops so we can easily use a stopwatch  
53.	     for (int stress = 0; stress < 1000; stress++)     
54.	     {  
55.	        clock_t clock3=clock();  
56.	        float xFloat = 1.0f;  
57.	        for (int i=0 ; i < length; i++)  
58.	   
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值