Download presentation
Presentation is loading. Please wait.
Published by Clare Asp. Modified over 9 years ago
1
Parallel Processing1 Parallel Processing (CS 667) Lecture 9: Advanced Point to Point Communication Jeremy R. Johnson *Parts of this lecture were derived from chapter 13 in Pacheco
2
Parallel Processing2 Introduction Objective: To further examine message passing communication patterns. Topics –Implementing Allgather Ring Hypercube –Non-blocking send/recv MPI_Isend MPI_Wait MPI_Test
3
Parallel Processing3 Broadcast/Reduce Ring P3P2 P1 P0 P3P2 P1 P0 P3P2 P1 P0 P3P2 P1 P0
4
Parallel Processing4 Bi-directional Broadcast Ring P3P2 P1 P0 P3P2 P1 P0 P3P2 P1 P0
5
Parallel Processing5 Allgather Ring x3x2 x0x1 P3P2 P1 P0 x2,x3x1,x2 x0,x3x0,x1 P3P2 P1 P0 x1,x2,x3 x0,x2,x3 P3P2 P1 P0 x0,x1,x2,x3 P3P2 P1 P0 x0,x1,x2 x0,x1,x3 x0,x1,x2,x3
6
Parallel Processing6 AllGather int MPI_Allgather( void* send_data /* in */, int send_count /* in */, MPI_Datatype send_type /* in */, void* recv_data /* out */, int recv_count /* in */, MPI_Datatype recv_type /* in */, MPI_Comm communicator /* in */) Process 0 Process 1 Process 2 Process 3 x0 x1 x2 x3
7
Parallel Processing7 Allgather_ring void Allgather_ring(float x[], int blocksize, float y[], MPI_Comm comm) { int i, p, my_rank; int successor, predecessor; int send_offset, recv_offset; MPI_Status status; MPI_Comm_size(comm, &p); MPI_Comm_rank(comm, &my_rank); for (i=0; i < blocksize; i++) y[i + my_rank*blocksize] = x[i]; successor = (my_rank + 1) % p; predecessor = (my_rank - 1 + p) % p;
8
Parallel Processing8 Allgather_ring for (i=0; i < p-1; i++) { send_offset = ((my_rank - i + p) % p)*blocksize; recv_offset = ((my_rank - i - 1 + p) % p)*blocksize; MPI_Send(y + send_offset,blocksize,MPI_FLOAT, successor, 0, comm); MPI_Recv(y + recv_offset,blocksize,MPI_FLOAT,predecessor,0, comm,&status); }
9
Parallel Processing9 Hypercube Graph (recursively defined) n-dimensional cube has 2^n nodes with each node connected to n vertices Binary labels of adjacent nodes differ in one bit 000 001 101 100 010 011 110 111 00 01 10 11 0 1
10
Parallel Processing10 000 001 101 100 010 011 110 111 Broadcast/Reduce
11
Parallel Processing11 000 001 101 100 010 011 110 111 Allgather
12
Parallel Processing12 Allgather 0 12 3 4 56 7 0 12 3 4 56 7 0 12 3 4 56 7 0 12 3 4 56 7
13
Parallel Processing13 Allgather_cube void Allgather_cube(float x[], int blocksize, float y[], MPI_Comm comm) { int i, d, p, my_rank; unsigned eor_bit, and_bits; int stage, partner; MPI_Datatype hole_type; int send_offset, recv_offset; MPI_Status status; int log_base2(int p); MPI_Comm_size(comm, &p); MPI_Comm_rank(comm, &my_rank); for (i=0; i < blocksize; i++) y[i + my_rank*blocksize] = x[i]; d = log_base2(p); eor_bit = 1 << (d-1); and_bits = (1 << d) - 1;
14
Parallel Processing14 Allgather_cube for (stage = 0; stage < d; stage++) { partner = my_rank ^ eor_bit; send_offset = (my_rank & and_bits) * blocksize; recv_offset = (partner & and_bits)*blocksize; MPI_Type_vector(1 << stage, blocksize, (1 << (d-stage))*blocksize, MPI_FLOAT,&hole_type); MPI_Type_commit(&hole_type); MPI_Send(y+send_offset,1,hole_type,partner, 0, comm); MPI_Recv(y+recv_offset,1,hole_type,partner, 0, comm,&status); MPI_Type_free(&hole_type); eor_bit = eor_bit >> 1; and_bits = and_bits >> 1; }
15
Parallel Processing15 Buffering Assumption Previous code is not safe since it depends on sufficient system buffers being available so that deadlock does not occur. SendRecv can be used to guarantee that deadlock does not occur.
16
Parallel Processing16 SendRecv int MPI_Sendrecv( void* send_buf /* in */, int send_count /* in */, MPI_Datatype send_type /* in */, int dest /* in */, int send_tag /* in */, void* recv_buf /* out */, int recv_count /* in */, MPI_Datatype recv_type /* in */, int source /* in */, int recv_tag /* in */, MPI_Comm communicator /* in */, MPI_Status* status /* out */)
17
Parallel Processing17 SendRecvReplace int MPI_Sendrecv_replace( void* buffer /* in/out */, int count /* in */, MPI_Datatype datatype /* in */, int dest /* in */, int send_tag /* in */, int source /* in */, int recv_tag /* in */, MPI_Comm communicator /* in */, MPI_Status* status /* out */)
18
Parallel Processing18 Nonblocking Send/Recv Allow overlap of communication and computation. Does not wait for buffer to be copied or receive to occur. The communication is posted and can be tested later for completion int MPI_Isend( /* Immediate */ void* buffer /* in */, int count /* in */, MPI_Datatype datatype /* in */, int dest /* in */, int tag /* in */, MPI_Comm comm /* in */, MPI_Request* request /* out */)
19
Parallel Processing19 Nonblocking Send/Recv int MPI_Irecv( void* buffer /* in */, int count /* in */, MPI_Datatype datatype /* in */, int source /* in */, int tag /* in */, MPI_Comm comm /* in */, MPI_Request* request /* out */) int MPI_Wait( MPI_Request* request /* in/out */, MPI_Status* status /* out */) int MPI_Test(MPI_Request* request, int* flag, MPI_Status* status);
20
Parallel Processing20 Allgather_ring (Overlapped) send_offset = my_rank*blocksize; recv_offset = ((my_rank - 1 + p) % p)*blocksize; for (i=0; i < p-1; i++) { MPI_Isend(y + send_offset,blocksize,MPI_FLOAT, successor, 0, comm, &send_request); MPI_Irecv(y + recv_offset,blocksize,MPI_FLOAT,predecessor,0, comm,&recv_request); send_offset = ((my_rank - i - 1 + p) % p)*blocksize; recv_offset = ((my_rank - i - 2 + p) % p)*blocksize; MPI_Wait(&send_request, &status); MPI_Wait(&recv_request, &status); }
21
Parallel Processing21 AllGather int MPI_Allgather( void* send_data /* in */, int send_count /* in */, MPI_Datatype send_type /* in */, void* recv_data /* out */, int recv_count /* in */, MPI_Datatype recv_type /* in */, MPI_Comm communicator /* in */) Process 0 Process 1 Process 2 Process 3 x0 x1 x2 x3
22
Parallel Processing22 Alltoall int MPI_Alltoall( void* send_buffer /* in */, int send_count /* in */, MPI_Datatype send_type /* in */, void* recv_buffer /* out */, int recv_count /* in */, MPI_Datatype recv_type /* in */, MPI_Comm communicator /* in */) 00 10 20 30 Process 0 Process 1 Process 2 Process 3 01 11 21 31 02 12 22 32 03 13 23 33 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
23
Parallel Processing23 AlltoAll Sequence of permutations implemented with send_recv 01234567 70123456 67012345 56701234 45670123 34567012 23456701 12345670
24
Parallel Processing24 AlltoAll (2 way) Sequence of permutations implemented with send_recv 01234567 10325476 23016745 32107654 45670123 54761032 67452301 76543210
25
Parallel Processing25 Communication Modes Synchronous (wait for receive) Ready (make sure receive has been posted) Buffered (user provides buffer space)
Similar presentations
© 2025 SlidePlayer.com. Inc.
All rights reserved.