Message Passing Interface Dr. Bo Yuan
MPI MPI is a library of functions and macros that can be used in C, Fortran and C++ for writing parallel programs exploiting multiple processors via message passing. Both point-to-point and collective communication are supported. It is the de facto standard for communication among processes that model a parallel program running on a distributed memory system. Reference – – – 2
Getting Started 3 #include <stdio.h> #include <string.h> /* For strlen */ #include <mpi.h> /* For MPI functions */ #define MAX_STRING 100 int main(int argc, char* argv[]) { char greeting[MAX_STRING]; int comm_sz; /* Number of processes */ int my_rank; /* My process rank */ int source; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); if (my_rank!=0) { sprintf(greeting, "Greetings from process %d of %d!", my_rank, comm_sz);
Getting Started 4 MPI_Send(greeting, strlen(greeting)+1, MPI_CHAR, 0, 0, MPI_COMM_WORLD); } else { printf("Greetings from process %d of %d!\n", my_rank, comm_sz); for (source=1; source<comm_sz; source++) { MPI_Recv(greeting, MAX_STRING, MPI_CHAR, source, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("%s\n", greeting); } } MPI_Finalize(); return 0; } /* main */ $ mpicc -g -o mpi_hello mpi_hello.c $ mpiexec -n 4 ./mpi_hello
MPI Programs A copy of the executable program is scheduled to run on each processor; each running copy is called a process. All processes are identified by a sequence of non-negative integers (ranks: 0, 1, …, n-1 for n processes). Different processes can execute different statements by branching within the program (usually based on process ranks). Single Program, Multiple Data (SPMD) mpi.h –Prototypes of MPI functions, macro definitions, type definitions … 5
General Structure 6 ... #include <mpi.h> /* For MPI functions */ ... int main(int argc, char* argv[]) { ... /* No MPI calls before this */ /* Must be called once and only once */ /* MPI_Init(NULL, NULL) */ MPI_Init(&argc, &argv); ... MPI_Finalize(); /* No MPI calls after this */ ... return 0; }
Communicator A collection of processes that can send messages to each other Used in all functions that involve communication. Default: MPI_COMM_WORLD To ensure messages are not accidentally received in the wrong place. 7 int MPI_Comm_size( MPI_Comm comm/* in */, int* comm_sz_p /* out */); int MPI_Comm_rank( MPI_Comm comm /* in */, int* my_rank_p /* out */);
Send & Receive 8 int MPI_Send( void* msg_buf_p /* in */, int msg_size /* in */, MPI_Datatype msg_type /* in */, int dest /* in */, int tag /* in */, MPI_Comm communicator /* in */); int MPI_Recv( void* msg_buf_p /* out */, int buf_size /* in */, MPI_Datatype buf_type /* in */, int source /* in */, int tag /* in */, MPI_Comm communicator /* in */, MPI_Status* status_p /* out */);
Send & Receive Message Matching (q → r) –recv_comm=send_comm, recv_tag=send_tag –dest=r, src=q, recv_type=send_type –recv_buf_sz≥send_buf_sz The tag argument –Non-negative integer –Used to distinguish messages that are otherwise identical. The status_p argument –MPI_Status status –status.MPI_SOURCE, status.MPI_TAG, status.MPI_ERROR –MPI_Get_count(&status, recv_type, &count) 9
Send & Receive Wildcard –MPI_ANY_SOURCE, MPI_ANY_TAG Only a receiver can use a wildcard argument. There is no wildcard for communicator arguments. 10 for (i=1; i<comm_sz; i++){ MPI_Recv(result, result_sz, result_type, MPI_ANY_SOURCE, result_tag, comm, MPI_STATUS_IGNORE); Process_result(result); }
Send & Receive Message = Data + Envelope MPI_Send –Buffer –Block –No overtaking MPI_Recv –Block Pitfalls –Hang –Deadlock 11 MPI Data Type | C Data Type: MPI_CHAR | signed char; MPI_INT | signed int; MPI_UNSIGNED | unsigned int; MPI_FLOAT | float; MPI_DOUBLE | double
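Trapezoidal Rule 12 [Figure: the area under the curve $y=f(x)$ between $x=a$ and $x=b$ is approximated by trapezoids. The trapezoid over the subinterval $[x_i, x_{i+1}]$ has width $h$ and parallel sides of length $f(x_i)$ and $f(x_{i+1})$.]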
Parallel Trapezoidal Rule 01. Get a, b, n; 02. h=(b-a)/n; 03. local_n=n/comm_sz; 04. local_a=a+my_rank*local_n*h; 05. local_b=local_a+local_n*h; 06. local_integral=Trap(local_a, local_b, local_n, h); 07. if (my_rank!=0) 08. Send local_integral to process 0; 09. else { /* my_rank==0 */ 10. total_integral=local_integral; 11. for (proc=1; proc<comm_sz; proc++) { 12. Receive local_integral from proc; 13. total_integral+=local_integral; 14. } 15. } 16. if (my_rank==0) 17. print total_integral;
MPI Trapezoidal Rule 14 int main(void) { int my_rank, comm_sz, n=1024, local_n; double a=0.0, b=3.0, h, local_a, local_b; double local_int, total_int; int source; MPI_Init(NULL, NULL); MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); h=(b-a)/n; local_n=n/comm_sz; local_a=a+my_rank*local_n*h; local_b=local_a+local_n*h; local_int=Trap(local_a, local_b, local_n, h);
MPI Trapezoidal Rule 15 if (my_rank!=0) { MPI_Send(&local_int, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); } else { total_int=local_int; for (source=1; source<comm_sz; source++) { MPI_Recv(&local_int, 1, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); total_int+=local_int; } } if (my_rank==0) { printf("With n=%d trapezoids, our estimate\n", n); printf("of the integral from %f to %f=%.15e\n", a, b, total_int); } MPI_Finalize(); return 0; }
Handling Inputs 16 void Get_input( int my_rank /* in */, int comm_sz /* in */, double* a_p /* out */, double* b_p /* out */, int* n_p /* out */) { int dest; if (my_rank==0) { printf("Enter a, b, and n\n"); scanf("%lf %lf %d", a_p, b_p, n_p); for (dest=1; dest<comm_sz; dest++) { MPI_Send(a_p, 1, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD); MPI_Send(b_p, 1, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD); MPI_Send(n_p, 1, MPI_INT, dest, 0, MPI_COMM_WORLD); } } else { MPI_Recv(a_p, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(b_p, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(n_p, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } Only process 0 can access stdin!
Back to the Trapezoidal Rule Which process is the busiest one? –The global-sum function –Load Balancing Given 1024 processes –How many receives and additions in total? –Can we improve it? How to code such a tree-structured global sum function? Collective Communications –All processes in a communicator are involved. –All processes in the communicator must call the same collective function. –Matched solely on the communicator and the order in which they are called. –Point-to-Point Communications: MPI_Send and MPI_Recv 21
MPI_Reduce 22 int MPI_Reduce( void* input_data_p /* in */, void* output_data_p /* out */, int count /* in */, MPI_Datatype datatype /* in */, MPI_Op operator /* in */, int dest_process /* in */, MPI_Comm comm /* in */); MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); double local_x[N], sum[N]; ... MPI_Reduce(local_x, sum, N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce 23 Columns: Time | Process 0 | Process 1 | Process 2. Time 0: a=1; c=2; (on every process). Time 1: MPI_Reduce(&a, &b,) | MPI_Reduce(&c, &d,) | MPI_Reduce(&a, &b,). Time 2: MPI_Reduce(&c, &d,) | MPI_Reduce(&a, &b,) | MPI_Reduce(&c, &d,). Operation Value | Meaning: MPI_MAX | Maximum; MPI_MIN | Minimum; MPI_SUM | Sum; MPI_PROD | Product; MPI_LAND | Logical and; MPI_BAND | Bitwise and; ... MPI_SUM, Destination Process: 0 b=1+2+1 d=2+1+2
MPI_Bcast 25 void Get_input( int my_rank /* in */, int comm_sz /* in */, double* a_p /* out */, double* b_p /* out */, int* n_p /* out */) { if (my_rank==0) { printf("Enter a, b, and n\n"); scanf("%lf %lf %d", a_p, b_p, n_p); } MPI_Bcast(a_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(b_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(n_p, 1, MPI_INT, 0, MPI_COMM_WORLD); } int MPI_Bcast( void* data_p /* in/out */, int count /* in */, MPI_Datatype datatype /* in */, int source_proc /* in */, MPI_Comm comm /* in */); How did we distribute the input data? How did we implement the global sum?
MPI_Scatter 26 void Read_vector( double local_a[] /* out */, int n /* in */, int my_rank /* in */, int comm_sz /* in */) { double* a=NULL; int i, local_n; local_n=n/comm_sz; if (my_rank==0) { a=malloc(n*sizeof(double)); for (i=0; i<n; i++) scanf("%lf", &a[i]); MPI_Scatter(a, local_n, MPI_DOUBLE, local_a, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); free(a); } else { MPI_Scatter(a, local_n, MPI_DOUBLE, local_a, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); } } Reading and distributing a vector
MPI_Gather 27 void Print_vector( double local_b[] /* in */, int n /* in */, int my_rank /* in */, int comm_sz /* in */) { double* b=NULL; int i, local_n; local_n=n/comm_sz; if (my_rank==0) { b=malloc(n*sizeof(double)); MPI_Gather(local_b, local_n, MPI_DOUBLE, b, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); for (i=0; i<n; i++) printf("%f ", b[i]); printf("\n"); free(b); } else { MPI_Gather(local_b, local_n, MPI_DOUBLE, b, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); } } Printing a distributed vector
MPI Derived Data Types 28 int MPI_Type_create_struct( int count /* in */, int array_of_blocklengths[] /* in */, MPI_Aint array_of_displacements[] /* in */, MPI_Datatype array_of_types[] /* in */, MPI_Datatype* new_type_p /* out */); int MPI_Get_address( void* location_p /* in */, MPI_Aint* address_p /* out */); Variable | Address: a | 24; b | 40; n | 48 {(MPI_DOUBLE, 0), (MPI_DOUBLE, 16), (MPI_INT, 24)}
MPI Derived Data Types 29 void Build_mpi_type( double* a_p /* in*/, double* b_p /* in*/, int* n_p /* in*/, MPI_Datatype*input_mpi_t_p/* out */) { int array_of_blocklengths[3]={1, 1, 1}; MPI_Datatype array_of_types[3]={MPI_DOUBLE, MPI_DOUBLE, MPI_INT}; MPI_Aint a_addr, b_addr, n_addr; MPI_Aint array_of_displacements[3]={0}; MPI_Get_address(a_p, &a_addr); MPI_Get_address(b_p, &b_addr); MPI_Get_address(n_p, &n_addr); array_of_displacements[1]=b_addr-a_addr; array_of_displacements[2]=n_addr-a_addr; MPI_Type_create_struct(3, array_of_blocklengths, array_of_displacements, array_of_types, input_mpi_t_p); MPI_Type_commit(input_mpi_t_p); }
Get_input with Derived Data Types 30 void Get_input( int my_rank /* in */, int comm_sz /* in */, double* a_p /* out */, double* b_p /* out */, int* n_p /* out */) { MPI_Datatype input_mpi_t; Build_mpi_type(a_p, b_p, n_p, &input_mpi_t); if (my_rank==0) { printf("Enter a, b, and n\n"); scanf("%lf %lf %d", a_p, b_p, n_p); } MPI_Bcast(a_p, 1, input_mpi_t, 0, MPI_COMM_WORLD); MPI_Type_free(&input_mpi_t); }
Timing 31 double MPI_Wtime (void); int MPI_Barrier (MPI_Comm comm /* in*/); /* The following code is used to time a block of MPI code. */ double local_start, local_finish, local_elapsed, elapsed;... MPI_Barrier(comm); local_start=MPI_Wtime(); /* Code to be timed */... local_finish=MPI_Wtime(); local_elapsed=local_finish-local_start; MPI_Reduce(&local_elapsed, &elapsed, 1, MPI_DOUBLE, MPI_MAX, 0, comm); if (my_rank==0) printf(“Elapsed time=%e seconds\n”, elapsed);
Performance Measure 32 comm_sz Order of Matrix Running Times of Matrix-Vector Multiplication
Performance Measure 33 comm_sz Order of Matrix comm_sz Order of Matrix Speedups Efficiencies
Review What is the general structure of MPI programs? What is the so-called SPMD? How to perform basic communication between processes? When will processes hang or deadlock? Which process is allowed to have access to stdin? What is collective communication? Name three MPI collective communication functions. What is an MPI derived data type? 35