Contents
Parallel construct clauses:
1. if(): if the expression evaluates to true, a team of threads is created (otherwise the region runs with a single thread)
2. num_threads(): explicitly requests the number of threads to use for the parallel region
Figure 10-1:
// sample compile command: "gcc -fopenmp -c Fig_10.1_parClaw.c" to generate *.o object file
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
// initialization and transform functions
// (we will not show the function bodies)
extern void initMats(int N, float *A, float *T);
extern void transform(int N, int id, int Nthrds, float *A, float *T);
int main(int argc, char **argv)
{
    float trace = 0;
    int i, id, N, Nthrds;
    float *A, *T;

    // set matrix order N
    if (argc == 2)
        N = atoi(argv[1]);
    else
        N = 10;

    // allocate space for two N x N matrices and initialize them
    T = (float *) malloc(N * N * sizeof(float));
    A = (float *) malloc(N * N * sizeof(float));
    initMats(N, A, T);

    #pragma omp parallel if(N>100) num_threads(4) default(none) \
            shared(A,T,N) private(i,id,Nthrds) reduction(+:trace)
    {
        id = omp_get_thread_num();
        Nthrds = omp_get_num_threads();
        transform(N, id, Nthrds, T, A);

        // compute trace of A matrix
        // i.e., the sum of diagonal elements
        #pragma omp for
        for (i = 0; i < N; i++)
            trace += *(A + i * N + i);
    }
    printf(" transform complete with trace = %f\n", trace);
}
Worksharing-loop construct:
Clause: schedule(runtime)
From a bash shell, set the corresponding environment variable, e.g.: export OMP_SCHEDULE="dynamic,7"
Paired runtime routines: omp_set_schedule(kind, chunk_size) and omp_get_schedule(&kind, &chunk_size)
The kind argument takes values from the omp_sched_t enumeration (a sketch of setting the schedule from code follows the enum):
typedef enum omp_sched_t {
    omp_sched_static  = 1,
    omp_sched_dynamic = 2,
    omp_sched_guided  = 3,
    omp_sched_auto    = 4
} omp_sched_t;
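A minimal sketch (not from the book) of selecting the schedule from code with omp_set_schedule instead of the environment variable:
#include <omp.h>
#include <stdio.h>

int main(void)
{
    // equivalent to: export OMP_SCHEDULE="dynamic,7"
    omp_set_schedule(omp_sched_dynamic, 7);

    omp_sched_t kind;
    int chunk_size;
    omp_get_schedule(&kind, &chunk_size);   // read the setting back
    printf("kind = %d, chunk_size = %d\n", (int)kind, chunk_size);

    double sum = 0.0;
    #pragma omp parallel for schedule(runtime) reduction(+:sum)
    for (int i = 0; i < 1000; i++)
        sum += i * 0.5;                     // iterations divided per the runtime schedule
    printf("sum = %f\n", sum);
    return 0;
}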
Figure 10-3:
// sample compile command: "gcc -fopenmp -c Fig_10.3_runtimeEx.c" to generate *.o object file
#include <omp.h>
#include <stdio.h>
#define DEBUG 1
// map schedule kind enum values to strings for printing
static char* schdKind[] = { "ERR","static","dynamic","guided","auto"};
// external function for potential energy term
extern double pot(double dist);
void forces(int npart, double x[], double f[], double side, double rcoff)
{
    #pragma omp parallel for schedule(runtime)
    for (int i = 0; i < npart * 3; i += 3) {
        // zero force components on particle i
        double fxi = 0.0; double fyi = 0.0; double fzi = 0.0;

        // loop over all particles with index > i
        for (int j = i + 3; j < npart * 3; j += 3) {
            // compute distance between i and j with wraparound
            double xx = x[i] - x[j];
            double yy = x[i+1] - x[j+1];
            double zz = x[i+2] - x[j+2];
            if (xx < (-0.5 * side)) xx += side;  if (xx > (0.5 * side)) xx -= side;
            if (yy < (-0.5 * side)) yy += side;  if (yy > (0.5 * side)) yy -= side;
            if (zz < (-0.5 * side)) zz += side;  if (zz > (0.5 * side)) zz -= side;
            double rd = xx * xx + yy * yy + zz * zz;

            // if distance is inside cutoff radius, compute forces
            if (rd <= rcoff * rcoff) {
                double fcomp = pot(rd);
                fxi += xx * fcomp;  fyi += yy * fcomp;  fzi += zz * fcomp;
                f[j] -= xx * fcomp;  f[j+1] -= yy * fcomp;  f[j+2] -= zz * fcomp;
            }
        }
        // update forces on particle i
        f[i] += fxi;  f[i+1] += fyi;  f[i+2] += fzi;
    }

#ifdef DEBUG
    omp_sched_t kind;
    int chunk_size;
    omp_get_schedule(&kind, &chunk_size);
    printf("schedule(%s,%d)\n", schdKind[kind], chunk_size);
#endif
}
Clause collapse(n): collapses n perfectly nested loops into a single iteration space, which is then divided among the threads.
A triply nested loop can use collapse(3) (a small sketch follows), but watch out for data races. For more detail, see 雷洪《多核异构并行计算》 (Lei Hong, Multi-core and Heterogeneous Parallel Computing).
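A minimal collapse(3) sketch (not from the book); the loops are perfectly nested and every (i, j, k) element is written exactly once, so there is no race:
#include <omp.h>

// initialize a dense N x M x K array stored in a flat buffer
void init3d(int N, int M, int K, float *A)
{
    #pragma omp parallel for collapse(3)
    for (int i = 0; i < N; i++)
        for (int j = 0; j < M; j++)
            for (int k = 0; k < K; k++)
                A[(i * M + j) * K + k] = (float)(i + j + k);   // each element touched once
}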
Figure 10-4:
// sample compile command: "gcc -fopenmp -c Fig_10.4_loopCollapse.c" to generate *.o object file
#include <omp.h>
// apply a function (*MFUNC) to each element of an N by M array
void Apply(int N, int M, float *A, void (*MFUNC)(int, int, float *))
{
    #pragma omp parallel for num_threads(4) collapse(2) if(N*M>100)
    for (int i = 0; i < N; i++)
        for (int j = 0; j < M; j++)
            MFUNC(i, j, (A + i * M + j));
}
Task construct:
Clauses:
untied: tasks are tied by default (a task stays bound to the thread that first executes it); the untied clause explicitly marks a task as untied, so a suspended task may be resumed by any thread in the team.
priority(): sets a task priority hint (see the sketch after this list).
The environment variable OMP_MAX_TASK_PRIORITY sets the maximum priority value.
depend(): defines execution-order dependences between tasks.
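A minimal sketch (not from the book) of the priority clause; process_chunk is a hypothetical work routine:
#include <omp.h>

extern void process_chunk(int i);   // hypothetical work routine, not from the book

void run_tasks(int n)
{
    #pragma omp parallel
    {
        #pragma omp single
        {
            for (int i = 0; i < n; i++) {
                // priority is only a hint to the task scheduler, and values are
                // honored only up to OMP_MAX_TASK_PRIORITY
                #pragma omp task priority(i % 10) firstprivate(i)
                process_chunk(i);
            }
        }
    }
}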
Figure 10-6:
// sample compile command: "gcc -fopenmp -c Fig_10.6_taskDep.c" to generate *.o object file
#include <omp.h>
// functions Awork through Ework not shown
extern void Awork(float *);
extern void Bwork(float *);
extern void Cwork(float *);
extern void Dwork(float *);
extern void Ework(float *);

int main()
{
    float A, B, C, D, E;

    #pragma omp parallel shared(A, B, C, D, E)
    {
        #pragma omp single
        {
            #pragma omp task depend(out:A)
                Awork(&A);
            #pragma omp task depend(out:E)
                Ework(&E);
            #pragma omp task depend(in:A) depend(out:B)
                Bwork(&B);
            #pragma omp task depend(in:A) depend(out:C)
                Cwork(&C);
            #pragma omp task depend(in:B,C,E)
                Dwork(&D);
        }
    }
}
Additional multithreading features:
threadprivate: a directive that gives each thread its own persistent copy of a file-scope or static variable:
Figure 10-7:
// sample compile command: "gcc -fopenmp -c Fig_10.7_threadpriv.c" to generate *.o object file
// will get warning messages because functions init_list, processwork, and freeList are implicitly declared
#include <stdio.h>
#include <sys/time.h>
#include <omp.h>
struct node {
    int data;
    struct node *next;
};
int counter = 0;
#pragma omp threadprivate(counter)
void inc_count()
{
    counter++;
}
int main()
{
    struct node *p = NULL;
    struct node *head = NULL;

    p = init_list(p);
    head = p;

    #pragma omp parallel
    {
        #pragma omp single
        {
            p = head;
            while (p) {
                #pragma omp task firstprivate(p)
                {
                    inc_count();
                    processwork(p);
                }
                p = p->next;
            }
        }
        printf("thread %d ran %d tasks\n", omp_get_thread_num(), counter);
    }

    freeList(head);
    return 0;
}
Clause copyin(list): for threadprivate variables in the list, copies the master thread's value into every thread's private copy when the parallel region starts (see the sketch below).
master construct: the enclosed structured block is executed only by the master thread (no implied barrier).
atomic construct: protects the update of a single variable by performing it as an atomic operation.
Environment variable OMP_STACKSIZE: sets the stack size for each thread.
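A minimal sketch (not from the book) combining threadprivate with copyin, atomic, and master; the variable names are made up:
#include <omp.h>
#include <stdio.h>

int base = 0;                           // per-thread persistent copy
#pragma omp threadprivate(base)

int main(void)
{
    int total = 0;
    base = 42;                          // set in the master thread

    #pragma omp parallel copyin(base) shared(total)
    {
        // copyin: every thread's copy of base starts at 42

        #pragma omp atomic              // atomic: protect the shared update
        total += base + omp_get_thread_num();

        #pragma omp master              // master: only thread 0 runs this block
        printf("master sees base = %d\n", base);
    }
    printf("total = %d\n", total);
    return 0;
}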
Runtime library routines (a usage sketch follows the list):
omp_get_max_threads()  // upper bound on the number of threads for the next parallel region
omp_set_dynamic()      // enables dynamic mode: the runtime may adjust the team size from one parallel region to the next
omp_in_parallel()      // reports whether the caller is currently inside an active parallel region
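A small sketch (not from the book) exercising these routines:
#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_set_dynamic(1);                 // let the runtime pick smaller teams if it wants
    printf("max threads = %d\n", omp_get_max_threads());
    printf("in parallel?  %d\n", omp_in_parallel());       // 0 here: outside any parallel region

    #pragma omp parallel num_threads(2)
    {
        #pragma omp single
        printf("in parallel?  %d\n", omp_in_parallel());   // 1 here: inside an active region
    }
    return 0;
}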
Synchronization and the memory model
Figure 11-5 (fragment; A, produce, and consume come from the surrounding program): pairwise synchronization using flush and atomics: atomically write the flag, then atomically read it on the other side.
int flag = 0; // a flag to communicate when the consumer can start
omp_set_num_threads(2);
#pragma omp parallel shared(A, flag)
{
    int id = omp_get_thread_num();
    int nthrds = omp_get_num_threads();
    int flag_temp;

    // we need two or more threads for this program
    if ((id == 0) && (nthrds < 2)) exit(-1);

    if (id == 0) {
        produce(A);
        #pragma omp flush
        #pragma omp atomic write
        flag = 1;
    }
    if (id == 1) {
        while (1) {
            #pragma omp atomic read
            flag_temp = flag;
            if (flag_temp != 0) break;
        }
        #pragma omp flush
        consume(A);
    }
}
OpenMP locks:
Figure 11-6:
// sample compile command: "gcc -fopenmp -c Fig_11.6_hist.c" to generate *.o object file
#include <omp.h>
#include <math.h>
#include <stdio.h>
//#include "random.h" //seed() and drandom()
extern double drandom();
extern void seed(double low_in, double hi_in);
#define num_trials 1000000 // number of x values
#define num_bins 100 // number of bins in histogram
static double xlow = 0.0;   // low end of x range
static double xhi = 100.0;  // high end of x range
int main ()
{
    double x;
    long hist[num_bins];             // the histogram
    double bin_width;                // the width of each bin in the histogram
    omp_lock_t hist_lcks[num_bins];  // array of locks, one per bucket

    seed(xlow, xhi);                 // seed random generator over range of x
    bin_width = (xhi - xlow) / (double)num_bins;

    // initialize the histogram and the array of locks
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < num_bins; i++) {
        hist[i] = 0;
        omp_init_lock(&hist_lcks[i]);
    }

    // test uniform pseudorandom sequence by assigning values
    // to the right histogram bin
    #pragma omp parallel for schedule(static) private(x)
    for (int i = 0; i < num_trials; i++) {
        x = drandom();
        long ival = (long)((x - xlow) / bin_width);

        // protect histogram bins. Low overhead due to uncontended locks
        omp_set_lock(&hist_lcks[ival]);
        hist[ival]++;
        omp_unset_lock(&hist_lcks[ival]);
    }

    double sumh = 0.0, sumhsq = 0.0, ave, std_dev;

    // compute statistics (ave, std_dev) and destroy locks
    #pragma omp parallel for schedule(static) reduction(+:sumh,sumhsq)
    for (int i = 0; i < num_bins; i++) {
        sumh += (double) hist[i];
        sumhsq += (double) hist[i] * hist[i];
        omp_destroy_lock(&hist_lcks[i]);
    }
    ave = sumh / num_bins;
    std_dev = sqrt(sumhsq / ((double)num_bins) - ave * ave);
    printf("histogram: ave per bin = %f, std_dev = %f\n", ave, std_dev);
    return 0;
}
Critical sections are implemented with locks. An unnamed #pragma omp critical uses OpenMP's single default internal lock, and every unnamed critical construct in the program shares that same lock. So even if you create several unnamed critical sections, at any moment only one thread can be inside any of them; the others are blocked, because all of those critical sections contend for the same lock and only one thread can hold it.
To avoid this, give the critical construct a name: critical sections with different names use different locks and can execute concurrently (see the sketch below).
Reference: https://zhuanlan.zhihu.com/p/600324334
Summary: explicitly managed locks (e.g., one lock per data item, as in Figure 11-6) allow finer-grained protection and usually perform better than a single critical section.
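A minimal sketch of named critical sections (not from the book):
#include <omp.h>
#include <stdio.h>

int main(void)
{
    int a = 0, b = 0;
    #pragma omp parallel num_threads(4)
    {
        // unnamed critical sections all share one global lock; naming them
        // gives each name its own lock, so the updates to 'a' and 'b'
        // no longer serialize against each other
        #pragma omp critical(update_a)
        a++;

        #pragma omp critical(update_b)
        b++;
    }
    printf("a = %d, b = %d\n", a, b);
    return 0;
}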
Memory model
Memory-order clauses on the atomic construct: seq_cst (sequentially consistent), release, acquire, and acq_rel (a release/acquire variant of the flag handshake is sketched after Figure 11-7).
Figure 11-7:
#include <omp.h>
#include <stdlib.h>
// functions produce and consume not shown
int main()
{
    double *A;
    int flag = 0; // a flag to communicate when the consumer can start
    omp_set_num_threads(2);

    #pragma omp parallel shared(A, flag)
    {
        int id = omp_get_thread_num();
        int nthrds = omp_get_num_threads();
        int flag_temp;

        // we need two or more threads for this program
        if ((id == 0) && (nthrds < 2)) exit(-1);

        if (id == 0) {
            produce(A);
            #pragma omp atomic write seq_cst
            flag = 1;
        }
        if (id == 1) {
            while (1) {
                #pragma omp atomic read seq_cst
                flag_temp = flag;
                if (flag_temp != 0) break;
            }
            consume(A);
        }
    }
}
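The same producer/consumer handshake can be written with release/acquire ordering instead of seq_cst. A sketch (not from the book), assuming a compiler that supports the OpenMP 5.0 memory-order clauses and the same not-shown produce/consume routines:
#include <omp.h>
#include <stdlib.h>

extern void produce(double *A);
extern void consume(double *A);

int main()
{
    double *A = malloc(1000 * sizeof(double));  // buffer size is arbitrary for this sketch
    int flag = 0;
    omp_set_num_threads(2);

    #pragma omp parallel shared(A, flag)
    {
        int id = omp_get_thread_num();
        int flag_temp;

        if ((id == 0) && (omp_get_num_threads() < 2)) exit(-1);

        if (id == 0) {
            produce(A);
            #pragma omp atomic write release    // release: publishes everything written by produce()
            flag = 1;
        }
        if (id == 1) {
            while (1) {
                #pragma omp atomic read acquire // acquire: pairs with the release write above
                flag_temp = flag;
                if (flag_temp != 0) break;
            }
            consume(A);
        }
    }
    free(A);
    return 0;
}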