MPI其实是十分简单而又强大的并行库。只是这次让我花了半天的工夫才找出了一个微小的BUG,让我几乎崩溃。
原程序用于计算两个矩阵相乘。分配任务时,只把第一个矩阵分解传给若干个slave,第二个矩阵全传。虽然效率不高,但我的作业需要是把固定的矩阵大小改为可变的。
源代码:
#include
"
mpi.h
"
#include
<
stdio.h
>
#include
<
stdlib.h
>

#define
NRA 62 /* number of rows in matrix A */
#define
NCA 15 /* number of columns in matrix A */
#define
NCB 7 /* number of columns in matrix B */
#define
MASTER 0 /* taskid of first task */
#define
FROM_MASTER 1 /* setting a message type */
#define
FROM_WORKER 2 /* setting a message type */

int
main(argc,argv)
int
argc;
char
*
argv[];

...
{

int numtasks, /**//* number of tasks in partition */

taskid, /**//* a task identifier */

numworkers, /**//* number of worker tasks */

source, /**//* task id of message source */

dest, /**//* task id of message destination */

mtype, /**//* message type */

rows, /**//* rows of matrix A sent to each worker */

averow, extra, offset, /**//* used to determine rows sent to each worker */

i, j, k, rc; /**//* misc */

double a[NRA][NCA], /**//* matrix A to be multiplied */

b[NCA][NCB], /**//* matrix B to be multiplied */

c[NRA][NCB]; /**//* result matrix C */
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);

if (numtasks < 2 ) ...{
printf("Need at least two MPI tasks. Quitting... ");
MPI_Abort(MPI_COMM_WORLD, rc);
exit(1);
}
numworkers = numtasks-1;



/**//**************************** master task ************************************/
if (taskid == MASTER)

...{
printf("mpi_mm has started with %d tasks. ",numtasks);
printf("Initializing arrays... ");
for (i=0; i<NRA; i++)
for (j=0; j<NCA; j++)
a[i][j]= i+j;
for (i=0; i<NCA; i++)
for (j=0; j<NCB; j++)
b[i][j]= i*j;


/**//* Send matrix data to the worker tasks */
averow = NRA/numworkers;
extra = NRA%numworkers;
offset = 0;
mtype = FROM_MASTER;
for (dest=1; dest<=numworkers; dest++)

...{
rows = (dest <= extra) ? averow+1 : averow;
printf("Sending %d rows to task %d offset=%d ",rows,dest,offset);
MPI_Send(&offset, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
MPI_Send(&a[offset][0], rows*NCA, MPI_DOUBLE, dest, mtype,
MPI_COMM_WORLD);
MPI_Send(&b, NCA*NCB, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
offset = offset + rows;
}


/**//* Receive results from worker tasks */
mtype = FROM_WORKER;
for (i=1; i<=numworkers; i++)

...{
source = i;
MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&c[offset][0], rows*NCB, MPI_DOUBLE, source, mtype,
MPI_COMM_WORLD, &status);
printf("Received results from task %d ",source);
}


/**//* Print results */
printf("****************************************************** ");
printf("Result Matrix: ");
for (i=0; i<NRA; i++)

...{
printf(" ");
for (j=0; j<NCB; j++)
printf("%6.2f ", c[i][j]);
}
printf(" ****************************************************** ");
printf ("Done. ");
}



/**//**************************** worker task ************************************/
if (taskid > MASTER)

...{
mtype = FROM_MASTER;
MPI_Recv(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&a, rows*NCA, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&b, NCA*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);

for (k=0; k<NCB; k++)
for (i=0; i<rows; i++)

...{
c[i][k] = 0.0;
for (j=0; j<NCA; j++)
c[i][k] = c[i][k] + a[i][j] * b[j][k];
}
mtype = FROM_WORKER;
MPI_Send(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD);
}
MPI_Finalize();
}
改过的代码(已经加了很多调试输出):
#include
"
mpi.h
"
#include
<
stdio.h
>
#include
<
stdlib.h
>

#define
MASTER 0 /* taskid of first task */
#define
FROM_MASTER 1 /* setting a message type */
#define
FROM_WORKER 2 /* setting a message type */

int
main(
int
argc,
char
*
argv[])

...
{
int NRA;
int NCA;
int NCB;

int numtasks, /**//* number of tasks in partition */

taskid, /**//* a task identifier */

numworkers, /**//* number of worker tasks */

source, /**//* task id of message source */

dest, /**//* task id of message destination */

mtype, /**//* message type */

rows, /**//* rows of matrix A sent to each worker */

averow, extra, offset, /**//* used to determine rows sent to each worker */

i, j, k, rc; /**//* misc */

double * a, /**//* matrix A to be multiplied */

*b, /**//* matrix B to be multiplied */

*c; /**//* result matrix C */
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
if (numtasks < 2 )

...{
printf("Need at least two MPI tasks. Quitting... ");
MPI_Abort(MPI_COMM_WORLD, rc);
exit(1);
}
numworkers = numtasks-1;



/**//**************************** master task ************************************/
if (taskid == MASTER)

...{
scanf("%d %d %d", &NRA, &NCA, &NCB);
a= new double[NRA*NCA];
b= new double[NCA*NCB];
c= new double[NRA*NCB];
for (int dest=1; dest<= numworkers; ++ dest)

...{
MPI_Send(&NRA, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
MPI_Send(&NCA, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
MPI_Send(&NCB, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
}
printf("mpi_mm has started with %d tasks. ",numtasks);
printf("Initializing arrays... ");
for (i=0; i<NRA; i++)
for (j=0; j<NCA; j++)
a[i*NCA+j]= i+j;
for (i=0; i<NCA; i++)
for (j=0; j<NCB; j++)
b[i*NCB+j]= i*j;

printf("****************************************************** ");
printf("A Matrix: ");
for (i=0; i<NRA; i++)

...{
printf(" ");
for (j=0; j<NCA; j++)
printf("%6.2f ", a[i*NCA+j]);
}
printf(" B Matrix: ");
for (i=0; i<NCA; i++)

...{
printf(" ");
for (j=0; j<NCB; j++)
printf("%6.2f ", b[i*NCB+j]);
}
printf(" ****************************************************** ");


/**//* Send matrix data to the worker tasks */
averow = NRA/numworkers;
extra = NRA%numworkers;
offset = 0;
mtype = FROM_MASTER;
for (dest=1; dest<=numworkers; dest++)

...{
rows = (dest <= extra) ? averow+1 : averow;
printf("Sending %d rows to task %d offset=%d apos %d. ",rows,dest,offset, &(a[offset*NCA]) );
MPI_Send(&offset, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
MPI_Send(&(a[offset*NCA]), rows*NCA, MPI_DOUBLE, dest, mtype,
MPI_COMM_WORLD);
MPI_Send(&b, NCA*NCB, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
offset = offset + rows;
}


/**//* Receive results from worker tasks */
mtype = FROM_WORKER;
for (i=1; i<=numworkers; i++)

...{
source = i;
MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&(c[offset*NCB]), rows*NCB, MPI_DOUBLE, source, mtype,
MPI_COMM_WORLD, &status);
printf("Received results from task %d ",source);
}


/**//* Print results */
printf("****************************************************** ");
printf("Result Matrix: ");
for (i=0; i<NRA; i++)

...{
printf(" ");
for (j=0; j<NCB; j++)
printf("%6.2f ", c[i*NCB+j]);
}
printf(" ****************************************************** ");
printf ("Done. ");

delete []a;
delete []b;
delete []c;
}



/**//**************************** worker task ************************************/
if (taskid > MASTER)

...{
mtype = FROM_MASTER;

MPI_Recv(&NRA, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&NCA, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&NCB, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);

printf("processor %d : NRA %d, NCA %d, NCB %d. ", taskid, NRA, NCA, NCB);
a= new double[NRA*NCA];
b= new double[NCA*NCB];
c= new double[NRA*NCB];
printf("a addr : %d on procs %d. ", &a, taskid);
if (a==NULL || b==NULL || c==NULL)

...{
printf("Allocated error on procs %d. ", taskid);
}

MPI_Recv(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);

printf("processor %d : offset %d, rows %d. ", taskid, offset, rows);

MPI_Recv(&a, rows*NCA, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);

...{
int count;
MPI_Get_count(&status, MPI_DOUBLE, &count);
printf("recived %d data of a on procs %d, %d. ", count, taskid, *(a+2));
printf("a addr : %d on procs %d. ", &a, taskid);
}
MPI_Recv(&b, NCA*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);

...{
int count;
MPI_Get_count(&status, MPI_DOUBLE, &count);
printf("recived %d data of b on procs %d. ", count, taskid);
}

printf("******on processor %d ******************************** ", taskid);
printf("A Matrix: ");
for (i=0; i<NRA; i++)

...{
printf(" ");
for (j=0; j<NCA; j++)
printf("%6.2f ", a[i*NCA+j]);
}
printf(" B Matrix: ");
for (i=0; i<NCA; i++)

...{
printf(" ");
for (j=0; j<NCB; j++)
printf("%6.2f ", b[i*NCB+j]);
}
printf(" ****************************************************** ");

for (k=0; k<NCB; k++)
for (i=0; i<rows; i++)

...{
c[i*NCB+k] = 0.0;
for (j=0; j<NCA; j++)
c[i*NCB+k] = c[i*NCB+k] + a[i*NCA+j] * b[j*NCB+k];
}
mtype = FROM_WORKER;
MPI_Send(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD);

delete []a;
delete []b;
delete []c;
}
MPI_Finalize();
}
以上程序运行的唯一结果,就是segmentation fault(段错误)。
后在通过dbx工具在core中定位到printf("%6.2f ", a[i*NCA+j]);一句。经过分析,终于找到问题出现在由
double a[][] -> double *a = new [] 这样的转变中。
由于这样的转变,a变成了指针,因此使用Send或Receive时,就不能再使用 &a 作为第一个参数了,而是直接使用a。
程序这样修改后,终于能正常执行了。而我也可以继续下一个作业了。