C $Header: /u/gcmpack/models/MITgcmUV/eesupp/src/exch.F,v 1.4.2.2 1998/06/22 02:11:15 cnh Exp $

#include "CPP_EEOPTIONS.h"

C--  File exch.F: Routines that perform atomic communication operations,
C    i.e. communication operations that are completed by the
C    time the routines return.
C    Note
C    ====
C    The code in here, although intricate, is fairly
C    straightforward.  There are four routines, one
C    for each of the data patterns we wish to do overlap
C    updates on - as listed below.  Each routine has two
C    parts.  The first part does overlap updates to and from
C    remote "processes", that is, processes which do not truly
C    share the address space of this process.  This part
C    requires a facility like MPI or CRAY shmem get and put
C    operations.  It is possible to switch this part on or
C    off using compiler directives.  In the case of a simple
C    serial execution nothing happens in this part.  This part
C    is also switched off when communication strategies
C    that allow the overlapping of communication and computation
C    are employed.
C    The second part of each routine does the true shared-memory
C    overlap copying, i.e. copying from one part of array phi
C    to another part.  This part is always active; in the case
C    of a single-threaded messaging code, however, this part
C    will not do any copies as all edges will be flagged as using,
C    for example, MPI or SHMPUT, SHMGET.
C    Contents
C    o exch_xy_r4  - Does overlap updates for REAL*4 2d XY fields
C    o exch_xy_r8  - Does overlap updates for REAL*8 2d XY fields
C    o exch_xyz_r4 - Does overlap updates for REAL*4 3d XYZ fields
C    o exch_xyz_r8 - Does overlap updates for REAL*8 3d XYZ fields

CStartOfInterface
      SUBROUTINE EXCH_XY_R4(
     U                       phi,
     I                       myThid )
C     /==========================================================\
C     | SUBROUTINE EXCH_XY_R4                                    |
C     | o Handle exchanges for real*4, two-dimensional arrays.   |
C     |==========================================================|
C     | Do true shared-memory data transfers and "messaging"     |
C     | transfers for the blocking case of data transfers.       |
C     | Applications call this routine using                     |
C     |  CALL EXCH..( x, myThid )                                |
C     | where x is a two-dimensional array with overlaps.        |
C     | This routine does true-shared-memory copies for blocks   |
C     | within a thread. It will also do MPI messaging between   |
C     | different processes.                                     |
C     | Note:                                                    |
C     | =====                                                    |
C     | If it is used, "asynchronous" messaging in which         |
C     | communication overlaps computation is handled elsewhere  |
C     | - see recv.F and send.F.                                 |
C     | In practice MPI implementations may not be completely    |
C     | thread safe. In principle this code is correct even for  |
C     | mixed MPI and multi-threaded execution.                  |
C     | In multi-thread execution data managed by other threads  |
C     | is only automatically visible to another thread if it    |
C     | existed before the thread was created. This means that   |
C     | for Fortran we declare arrays with overlap regions in    |
C     | COMMON blocks.                                           |
C     \==========================================================/

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"

C     === Routine arguments ===
C     phi    - Array whose overlap regions are to be exchanged.
C     myThid - My thread id.
      Real*4 phi(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      INTEGER myThid
CEndOfInterface

C     === Local variables ===
C     bi,bj - Outer (block) loop counters
C     I,J   - Inner loop counters
      INTEGER bi, bj
      INTEGER I, J
#ifdef ALLOW_USE_MPI
C     tagSend    - Tags used to mark and select messages
C     tagRecv
C     nReqs      - MPI request counter.
C     reqIds     - Outstanding requests to wait on.
C     mpiRC      - MPI return code
C     mpiStatArr - Multi-request status reporting array.
C     toPid      - Proc. id sending to
C     fromPid    - Proc. id receiving from
C     elCount    - Number of items to send or receive
C     elType     - Data type of elements
      INTEGER tagSend
      INTEGER tagRecv
      INTEGER nReqs
      INTEGER reqIds(4)
      INTEGER mpiRC
      INTEGER mpiStatArr(MPI_STATUS_SIZE,4)
      INTEGER toPid
      INTEGER fromPid
      INTEGER elCount
      INTEGER elType
#endif /* ALLOW_USE_MPI */
C     bE, bW - Block index to east, west, north and
C     bN, bS   south.
      INTEGER bW, bE, bS, bN

C--   Do "message-passing" data exchanges
#ifdef ALLOW_SYNC_COMMUNICATION
#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      IF ( usingSyncMessages ) THEN
#endif

C--   MPI based data exchanges
#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif

C     Here use the approach that all the X shifts are completed
C     before the Y shifts start. Then we can expand our Y shifts
C     to include X overlap regions. This means we don't have to
C     transfer corners separately.
C     Notes:
C     ======
C     1. If we are only using messages in Y and using "true shared-memory"
C     in X then we rely on the shared-memory code to be implemented so that
C     corner values will be set correctly. The true shared-memory code
C     is in the block below - so be careful when altering it!
C     2. In order to ensure we grab the right message we need to tag
C     messages. We tag the messages with the direction in which they
C     were sent out. Thus to receive from a processor to our west we
C     request the next message from that processor that was sent out
C     with a tag indicating an easterly send.
C     3. We could implement a "safe" code here by having everyone
C     sync and then getting a particular thread ( say thread 1 ) to
C     do the MPI_Isend sequentially. In this mode the "communication"
C     thread would loop over nThreads and do MPI_Sendrecv for each
C     one. This could be more efficient on some platforms as the messages
C     might be sent as one large unit rather than by separate threads.
C     It would also be "thread-safe" for MPI implementations that have
C     multi-threading problems with things like request ids.
C     4. We include overlap regions in both X and Y sends.
C     This means we can interchange the Y block with the X block.
C     Not sure whether this will ever be useful!
C     5. The generation of a request handle by MPI_Isend and
C     MPI_Irecv ought to involve some global state within MPI.
C     If this process is not thread safe it may be necessary to
C     enable a critical section around the MPI_Isend and
C     MPI_Irecv calls.

C     We need a barrier here to synchronise threads, otherwise
C     one thread might declare it is ready to send and receive
C     even though another thread is going to write to the first
C     thread's send or receive region. This won't happen, however,
C     if we are careful about which data a thread updates - but at
C     the moment we aren't careful!
      _BARRIER

C--   East-west messaging communication
      nReqs = 0
      bE = myBxLo(myThid)
      IF ( bE .EQ. 1 .AND. commW(myThid) .EQ. COMM_MPI ) THEN
C      My west face uses MPI. Get ready to receive data from
C      west and post that we are ready to send data to the west.
C      Data we receive has to come with tag of thread we know is
C      sending from our west; data we send is tagged with our thread id
C      and the send direction.
       nReqs = nReqs+1
       tagSend = mpiTagW*nThreads+myThid
       toPid = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xy_r4(myThid)
       CALL MPI_Isend(
     I                phi(1,1-OLy,bE,myByLo(myThid)),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
       nReqs = nReqs+1
       tagRecv = mpiTagE*nThreads+myThrW(myThid)
       fromPid = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xy_r4(myThid)
       CALL MPI_Irecv(
     U                phi(1-OLx,1-OLy,bE,myByLo(myThid)),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
      ENDIF
      bW = myBxHi(myThid)
      IF ( bW .EQ. nSx .AND. commE(MyThid) .EQ. COMM_MPI ) THEN
C      My east face uses MPI. Get ready to receive data from
C      east and post that we are ready to send data to the east.
C      Data we receive has to come with tag of thread we know is
C      sending from our east; data we send is tagged with our thread id.
       nReqs = nReqs+1
       tagSend = mpiTagE*nThreads+myThid
       toPid = mpiPidE
       elCount = 1
       elType = mpiTypeXFaceThread_xy_r4(myThid)
       CALL MPI_Isend(
     I                phi(sNx-OLx+1,1-OLy,bW,myByLo(myThid)),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
       nReqs = nReqs+1
       tagRecv = mpiTagW*nThreads+myThrE(myThid)
       fromPid = mpiPidE
       elCount = 1
       elType = mpiTypeXFaceThread_xy_r4(myThid)
       CALL MPI_Irecv(
     U                phi(sNx+1,1-OLy,bW,myByLo(myThid)),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
      ENDIF
C     Wait for this thread's east-west transactions to finish before
C     posting north-south transactions. We have to do this so that
C     the north-south transactions will send out the correct overlap
C     region values into the corner sections of our neighbors.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

CcnhDebugStarts
C     RETURN
CcnhDebugEnds

C--   North-south messaging communication
      nReqs = 0
      bN = myByLo(myTHid)
      IF ( bN .EQ. 1 .AND. commS(myThid) .EQ. COMM_MPI ) THEN
C      My south face uses MPI. Get ready to receive data from
C      south and post that I am ready to send data to the south.
       nReqs = nReqs + 1
       tagSend = mpiTagS*nThreads+myThid
       toPid = mpiPidS
       elCount = 1
       elType = mpiTypeYFaceThread_xy_r4(myThid)
       CALL MPI_Isend(
     I                phi(1-OLx,1,myBxLo(myThid),bN),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
       nReqs = nReqs + 1
       tagRecv = mpiTagN*nThreads+myThrS(myThid)
       fromPid = mpiPidS
       elCount = 1
       elType = mpiTypeYFaceThread_xy_r4(myThid)
       CALL MPI_Irecv(
     I                phi(1-OLx,1-OLy,myBxLo(myThid),bN),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
      ENDIF
C
      bS = myByHi(myTHid)
      IF ( bS .EQ. nSy .AND. commN(myThid) .EQ. COMM_MPI ) THEN
C      My north face uses MPI. Get ready to receive data from
C      north and post that I am ready to send data to the north.
       nReqs = nReqs + 1
       tagSend = mpiTagN*nThreads+myThid
       toPid = mpiPidN
       elCount = 1
       elType = mpiTypeYFaceThread_xy_r4(myThid)
       CALL MPI_Isend(
     I                phi(1-OLx,sNy-OLy+1,myBxLo(myThid),bS),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
       nReqs = nReqs + 1
       tagRecv = mpiTagS*nThreads+myThrN(myThid)
       fromPid = mpiPidN
       elCount = 1
       elType = mpiTypeYFaceThread_xy_r4(myThid)
       CALL MPI_Irecv(
     I                phi(1-OLx,sNy+1,myBxLo(myThid),bS),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
      ENDIF
C     Wait for this thread's north-south transactions to finish.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      ENDIF
#endif
#endif /* ALLOW_SYNC_COMMUNICATION */

C--   Do true shared-memory data exchanges
C     Note this is also doing the edge copies for a regular serial
C     code.
C--   First make sure all threads have reached here
      _BARRIER

C--   Now do copies
C     Notes:
C     ======
C     Here we copy from up to and including the overlap
C     regions for both the X shift and the Y shift. This
C     catches the "corners" of the data copied using messages
C     in the "synchronous communication" section above.
C     As coded here we also write to a remote region in
C     one direction. In some situations it may be better only
C     to write to our own overlaps - could be the case with
C     Wildfire replication. This should be changed once everything
C     is working OK.
C
C--   x-axis exchanges
C     First handle the wrap-around edge between this thread's lowest
C     x-block and its western neighbour (only when that edge is a
C     shared-memory edge or an interior block boundary).
      bE = myBxLo(myThid)
      IF ( bE .NE. 1 .OR.
     &     commW(myThid) .EQ. COMM_SHARED ) THEN
       bW = bE-1
       IF ( bW .LT. 1 ) bW = nSx
       DO bj=myByLo(myThid),myByHi(myThid)
        DO J=1-OLy,sNy+OLy
         DO I=1,OLx
          phi(sNx+I,J,bW,bj)=phi( 1+I-1,J,bE,bj)
          phi(1-I  ,J,bE,bj)=phi(sNx-I+1,J,bW,bj)
         ENDDO
        ENDDO
       ENDDO
      ENDIF
C     Then copy between each adjacent pair of x-blocks owned by this
C     thread.
      DO bE=myBxLo(myThid)+1,myBxHi(myThid)
       bW = bE-1
       DO bj=myByLo(myThid),myByHi(myThid)
        DO J=1-OLy,sNy+OLy
         DO I=1,OLx
          phi(sNx+I,J,bW,bj)=phi( 1+I-1,J,bE,bj)
          phi(1-I  ,J,bE,bj)=phi(sNx-I+1,J,bW,bj)
         ENDDO
        ENDDO
       ENDDO
      ENDDO

C     Need to ensure all threads have completed x-axis transfers before
C     we can do y-axis exchanges.
      _BARRIER

C--   y-axis exchanges
C     Same pattern as the x-axis but for south/north edges; the I range
C     includes the x-overlaps so corners are filled correctly.
      bN = myByLo(myThid)
      IF ( bN .NE. 1 .OR.
     &     commS(myThid) .EQ. COMM_SHARED ) THEN
       bS = bN - 1
       IF ( bS .LT. 1 ) bS = nSy
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO J=1,OLy
         DO I=1-OLx,sNx+OLx
          phi(I,1-J  ,bi,bN)=phi(I,sNy-J+1,bi,bS)
          phi(I,sNy+J ,bi,bS)=phi(I,1+J-1 ,bi,bN)
         ENDDO
        ENDDO
       ENDDO
      ENDIF
      DO bN=myByLo(myThid)+1,myByHi(myThid)
       bS = bN - 1
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO J=1,OLy
         DO I=1-OLx,sNx+OLx
          phi(I,1-J  ,bi,bN)=phi(I,sNy-J+1,bi,bS)
          phi(I,sNy+J ,bi,bS)=phi(I,1+J-1 ,bi,bN)
         ENDDO
        ENDDO
       ENDDO
      ENDDO

      _BARRIER

      RETURN
      END
377 |
|
378 |
CStartOfInterface
      SUBROUTINE EXCH_XYZ_R4(
     U                        phi,
     I                        myThid )
C     /==========================================================\
C     | SUBROUTINE EXCH_XYZ_R4                                   |
C     | o Handle exchanges for real*4, three-dimensional arrays. |
C     |==========================================================|
C     | Do true shared-memory data transfers and "messaging"     |
C     | transfers for the blocking case of data transfers.       |
C     | Applications call this routine using                     |
C     |  CALL EXCH..( x, myThid )                                |
C     | where x is a three-dimensional array with overlaps.      |
C     | This routine does true-shared-memory copies for blocks   |
C     | within a thread. It will also do MPI messaging between   |
C     | different processes.                                     |
C     | Note:                                                    |
C     | =====                                                    |
C     | If it is used, "asynchronous" messaging in which         |
C     | communication overlaps computation is handled elsewhere  |
C     | - see recv.F and send.F.                                 |
C     | In practice MPI implementations may not be completely    |
C     | thread safe. In principle this code is correct even for  |
C     | mixed MPI and multi-threaded execution.                  |
C     | In multi-thread execution data managed by other threads  |
C     | is only automatically visible to another thread if it    |
C     | existed before the thread was created. This means that   |
C     | for Fortran we declare arrays with overlap regions in    |
C     | COMMON blocks.                                           |
C     \==========================================================/

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"

C     === Routine arguments ===
C     phi    - Array whose overlap regions are to be exchanged.
C     myThid - My thread id.
      Real*4 phi(1-OLx:sNx+OLx,1-OLy:sNy+OLy,1:Nz,nSx,nSy)
      INTEGER myThid
CEndOfInterface

C     === Local variables ===
C     bi,bj - Outer (block) loop counters
C     I,J,K - Inner loop counters
      INTEGER bi, bj
      INTEGER I, J, K
#ifdef ALLOW_USE_MPI
C     tagSend    - Tags used to mark and select messages
C     tagRecv
C     nReqs      - MPI request counter.
C     reqIds     - Outstanding requests to wait on.
C     mpiRC      - MPI return code
C     mpiStatArr - Multi-request status reporting array.
C     toPid      - Proc. id sending to
C     fromPid    - Proc. id receiving from
C     elCount    - Number of items to send or receive
C     elType     - Data type of elements
      INTEGER tagSend
      INTEGER tagRecv
      INTEGER nReqs
      INTEGER reqIds(4)
      INTEGER mpiRC
      INTEGER mpiStatArr(MPI_STATUS_SIZE,4)
      INTEGER toPid
      INTEGER fromPid
      INTEGER elCount
      INTEGER elType
#endif /* ALLOW_USE_MPI */
C     bE, bW - Block index to east, west, north and
C     bN, bS   south.
      INTEGER bW, bE, bS, bN

C--   Do "message-passing" data exchanges
#ifdef ALLOW_SYNC_COMMUNICATION
#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      IF ( usingSyncMessages ) THEN
#endif

C--   MPI based data exchanges
#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif

C     Here use the approach that all the X shifts are completed
C     before the Y shifts start. Then we can expand our Y shifts
C     to include X overlap regions. This means we don't have to
C     transfer corners separately.
C     Notes:
C     ======
C     1. If we are only using messages in Y and using "true shared-memory"
C     in X then we rely on the shared-memory code to be implemented so that
C     corner values will be set correctly. The true shared-memory code
C     is in the block below - so be careful when altering it!
C     2. In order to ensure we grab the right message we need to tag
C     messages. We tag the messages with the direction in which they
C     were sent out. Thus to receive from a processor to our west we
C     request the next message from that processor that was sent out
C     with a tag indicating an easterly send.
C     3. We could implement a "safe" code here by having everyone
C     sync and then getting a particular thread ( say thread 1 ) to
C     do the MPI_Isend sequentially. In this mode the "communication"
C     thread would loop over nThreads and do MPI_Sendrecv for each
C     one. This could be more efficient on some platforms as the messages
C     might be sent as one large unit rather than by separate threads.
C     It would also be "thread-safe" for MPI implementations that have
C     multi-threading problems with things like request ids.
C     4. We include overlap regions in both X and Y sends.
C     This means we can interchange the Y block with the X block.
C     Not sure whether this will ever be useful!
C     5. The generation of a request handle by MPI_Isend and
C     MPI_Irecv ought to involve some global state within MPI.
C     If this process is not thread safe it may be necessary to
C     enable a critical section around the MPI_Isend and
C     MPI_Irecv calls.

C     We need a barrier here to synchronise threads, otherwise
C     one thread might declare it is ready to send and receive
C     even though another thread is going to write to the first
C     thread's send or receive region. This won't happen, however,
C     if we are careful about which data a thread updates - but at
C     the moment we aren't careful!
      _BARRIER

C--   East-west messaging communication
      nReqs = 0
      bE = myBxLo(myThid)
      IF ( bE .EQ. 1 .AND. commW(myThid) .EQ. COMM_MPI ) THEN
C      My west face uses MPI. Get ready to receive data from
C      west and post that we are ready to send data to the west.
C      Data we receive has to come with tag of thread we know is
C      sending from our west; data we send is tagged with our thread id
C      and the send direction.
       nReqs = nReqs+1
       tagSend = mpiTagW*nThreads+myThid
       toPid = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_r4(myThid)
       CALL MPI_Isend(
     I                phi(1,1-OLy,1,bE,myByLo(myThid)),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
       nReqs = nReqs+1
       tagRecv = mpiTagE*nThreads+myThrW(myThid)
       fromPid = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_r4(myThid)
       CALL MPI_Irecv(
     U                phi(1-OLx,1-OLy,1,bE,myByLo(myThid)),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
      ENDIF
      bW = myBxHi(myThid)
      IF ( bW .EQ. nSx .AND. commE(myThid) .EQ. COMM_MPI ) THEN
C      My east face uses MPI. Get ready to receive data from
C      east and post that we are ready to send data to the east.
C      Data we receive has to come with tag of thread we know is
C      sending from our east; data we send is tagged with our thread id.
       nReqs = nReqs+1
       tagSend = mpiTagE*nThreads+myThid
       toPid = mpiPidE
       elCount = 1
       elType = mpiTypeXFaceThread_xyz_r4(myThid)
       CALL MPI_Isend(
     I                phi(sNx-OLx+1,1-OLy,1,bW,myByLo(myThid)),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
       nReqs = nReqs+1
       tagRecv = mpiTagW*nThreads+myThrE(myThid)
       fromPid = mpiPidE
       elCount = 1
       elType = mpiTypeXFaceThread_xyz_r4(myThid)
       CALL MPI_Irecv(
     U                phi(sNx+1,1-OLy,1,bW,myByLo(myThid)),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs),mpiRC)
      ENDIF
C     Wait for this thread's east-west transactions to finish before
C     posting north-south transactions. We have to do this so that
C     the north-south transactions will send out the correct overlap
C     region values into the corner sections of our neighbors.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )
CcnhDebugStarts
C     RETURN
CcnhDebugEnds

C--   North-south messaging communication
      nReqs = 0
      bN = myByLo(myThid)
      IF ( bN .EQ. 1 .AND. commS(myThid) .EQ. COMM_MPI ) THEN
C      My south face uses MPI. Get ready to receive data from
C      south and post that I am ready to send data to the south.
       nReqs = nReqs + 1
       tagSend = mpiTagS*nThreads+myThid
       toPid = mpiPidS
       elCount = 1
       elType = mpiTypeYFaceThread_xyz_r4(myThid)
       CALL MPI_Isend(
     I                phi(1-OLx,1,1,myBxLo(myThid),bN),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
       nReqs = nReqs + 1
       tagRecv = mpiTagN*nThreads+myThrS(myThid)
       fromPid = mpiPidS
       elCount = 1
       elType = mpiTypeYFaceThread_xyz_r4(myThid)
       CALL MPI_Irecv(
     I                phi(1-OLx,1-OLy,1,myBxLo(myThid),bN),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
      ENDIF
C
      bS = myByHi(myThid)
C     Bug fix: this guard previously repeated the south-face test
C     ( bN .EQ. 1 .AND. commS(...) ), so the north-face send/receive
C     pair was posted under the wrong condition. It must mirror the
C     2-D routine: fire when our highest y-block is on the domain's
C     north edge and that edge uses MPI.
      IF ( bS .EQ. nSy .AND. commN(myThid) .EQ. COMM_MPI ) THEN
C      My north face uses MPI. Get ready to receive data from
C      north and post that I am ready to send data to the north.
       nReqs = nReqs + 1
       tagSend = mpiTagN*nThreads+myThid
       toPid = mpiPidN
       elCount = 1
       elType = mpiTypeYFaceThread_xyz_r4(myThid)
       CALL MPI_Isend(
     I                phi(1-OLx,sNy-OLy+1,1,myBxLo(myThid),bS),
     I                elCount, elType, toPid,
     I                tagSend,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
       nReqs = nReqs + 1
       tagRecv = mpiTagS*nThreads+myThrN(myThid)
       fromPid = mpiPidN
       elCount = 1
       elType = mpiTypeYFaceThread_xyz_r4(myThid)
       CALL MPI_Irecv(
     I                phi(1-OLx,sNy+1,1,myBxLo(myThid),bS),
     I                elCount, elType, fromPid,
     I                tagRecv,MPI_COMM_WORLD,
     I                reqIds(nReqs), mpiRC )
      ENDIF
C     Wait for this thread's north-south transactions to finish.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      ENDIF
#endif
#endif /* ALLOW_SYNC_COMMUNICATION */

C--   Do true shared-memory data exchanges
C     Note: This is also doing the overlap copies
C     for a single threaded code.
C--   First make sure all threads have reached here
      _BARRIER

C--   Now do copies
C     Notes:
C     ======
C     Here we copy from up to and including the overlap
C     regions for both the X shift and the Y shift. This
C     catches the "corners" of the data copied using messages
C     in the "synchronous communication" section above.
C--   x-axis exchanges
C     First handle the wrap-around edge between this thread's lowest
C     x-block and its western neighbour (only when that edge is a
C     shared-memory edge or an interior block boundary).
      bE = myBxLo(myThid)
      IF ( bE .NE. 1 .OR.
     &     commW(myThid) .EQ. COMM_SHARED ) THEN
       bW = bE-1
       IF ( bW .LT. 1 ) bW = nSx
       DO bj=myByLo(myThid),myByHi(myThid)
        DO K=1,Nz
         DO J=1-OLy,sNy+OLy
          DO I=1,OLx
           phi(sNx+I,J,K,bW,bj)=phi( 1+I-1,J,K,bE,bj)
           phi(1-I  ,J,K,bE,bj)=phi(sNx-I+1,J,K,bW,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDIF
C     Then copy between each adjacent pair of x-blocks owned by this
C     thread.
      DO bE=myBxLo(myThid)+1,myBxHi(myThid)
       bW = bE-1
       DO bj=myByLo(myThid),myByHi(myThid)
        DO K=1,Nz
         DO J=1-OLy,sNy+OLy
          DO I=1,OLx
           phi(sNx+I,J,K,bW,bj)=phi( 1+I-1,J,K,bE,bj)
           phi(1-I  ,J,K,bE,bj)=phi(sNx-I+1,J,K,bW,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDDO

C     Need to ensure all threads have completed x-axis transfers before
C     we can do y-axis exchanges.
      _BARRIER

C--   y-axis exchanges
C     Same pattern as the x-axis but for south/north edges; the I range
C     includes the x-overlaps so corners are filled correctly.
      bN = myByLo(myThid)
      IF ( bN .NE. 1 .OR.
     &     commS(myThid) .EQ. COMM_SHARED ) THEN
       bS = bN - 1
       IF ( bS .LT. 1 ) bS = nSy
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO K=1,Nz
         DO J=1,OLy
          DO I=1-OLx,sNx+OLx
           phi(I,1-J  ,K,bi,bN)=phi(I,sNy-J+1,K,bi,bS)
           phi(I,sNy+J ,K,bi,bS)=phi(I,1+J-1 ,K,bi,bN)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDIF
      DO bN=myByLo(myThid)+1,myByHi(myThid)
       bS = bN - 1
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO K=1,Nz
         DO J=1,OLy
          DO I=1-OLx,sNx+OLx
           phi(I,1-J  ,K,bi,bN)=phi(I,sNy-J+1,K,bi,bS)
           phi(I,sNy+J ,K,bi,bS)=phi(I,1+J-1 ,K,bi,bN)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDDO

      _BARRIER

      RETURN
      END
720 |
|
721 |
CStartOfInterface |
722 |
SUBROUTINE EXCH_XY_R8( |
723 |
U phi, |
724 |
I myThid ) |
725 |
C /==========================================================\ |
726 |
C | SUBROUTINE EXCH_XY_R8 | |
727 |
C | o Handle exchanges for real*8, two-dimensional arrays. | |
728 |
C |==========================================================| |
729 |
C | Do true shared-memory data transfers and "messaging" | |
730 |
C | tranfers for blocking case of data transfers. | |
731 |
C | Applications call this routine using | |
732 |
C | CALL EXCH..( x, myThid ) | |
733 |
C | where x is a two-dimensional array with overlaps. | |
734 |
C | This routine does true-shared-memory copies for blocks | |
735 |
C | within a thread. It will also do MPI meesaging between | |
736 |
C | different processes. | |
737 |
C | Note: | |
738 |
C | ===== | |
739 |
C | If it is used, "asynchronous" messaging in which | |
740 |
C | communication overlaps computation is handled elsewhere | |
741 |
C | - see recv.F and send.F. | |
742 |
C | In practice MPI implementations may not be completely | |
743 |
C | thread safe. In principle this code is correct even for | |
744 |
C | mixed MPI and multi-threaded execution. | |
745 |
C | In multi-thread execution data managed by other threads | |
746 |
C | is only automatically visible to another thread if it | |
747 |
C | existed before the thread was created. This means that | |
748 |
C | all Fortran we declare arrays with overlap regions in | |
749 |
C | COMMON blocks. | |
750 |
C \==========================================================/ |
751 |
|
752 |
C === Global data === |
753 |
#include "SIZE.h" |
754 |
#include "EEPARAMS.h" |
755 |
#include "EESUPPORT.h" |
756 |
|
757 |
C === Routine arguments === |
758 |
C phi - Array who's overlap regions are to be exchanged |
759 |
C myThid - My thread id. |
760 |
Real*8 phi(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy) |
761 |
INTEGER myThid |
762 |
CEndOfInterface |
763 |
|
764 |
C === Local variables === |
765 |
C bi,bj - Outer loop counters |
766 |
C I,J,K - Inner loop counters |
767 |
INTEGER bi, bj |
768 |
INTEGER I, J |
769 |
#ifdef ALLOW_USE_MPI |
770 |
C tagSend - Tags used to mark and select messages |
771 |
C tagRecv |
772 |
C nReqs - MPI request counter. |
773 |
C reqIds - Outstanding requests to wait on. |
774 |
C mpiRC - MPI return code |
775 |
C mpiStatArr - Multi-request status reporting array. |
776 |
C toPid - Proc. id sending to |
777 |
C fromPid - Proc. id receving from |
778 |
C elCount - Number of items to send or receive |
779 |
C elType - Datat type of elements |
780 |
INTEGER tagSend |
781 |
INTEGER tagRecv |
782 |
INTEGER nReqs |
783 |
INTEGER reqIds(4) |
784 |
INTEGER mpiRC |
785 |
INTEGER mpiStatArr(MPI_STATUS_SIZE,4) |
786 |
INTEGER toPid |
787 |
INTEGER fromPid |
788 |
INTEGER elCount |
789 |
INTEGER elType |
790 |
#endif /* ALLOW_USE_MPI */ |
791 |
C bE, bW - Block index to east, west, north and |
792 |
C bN, bS south. |
793 |
INTEGER bW, bE, bS, bN |
794 |
|
795 |
C-- Do "message-passing" data exchanges |
796 |
#ifdef ALLOW_SYNC_COMMUNICATION |
797 |
#ifndef ALWAYS_USE_SYNC_COMMUNICATION |
798 |
IF ( usingSyncMessages ) THEN |
799 |
#endif |
800 |
|
801 |
C-- MPI based data exchages |
802 |
#ifdef ALLOW_USE_MPI |
803 |
#ifndef ALWAYS_USE_MPI |
804 |
IF ( usingMPI ) THEN |
805 |
#endif |
806 |
|
807 |
C Here use the approach that all the X shifts are completed |
808 |
C before the Y shifts start. Then we can expand our Y shifts |
809 |
C to include X overlap regions. This means we don't have to |
810 |
C transfer corners separately. |
811 |
C Notes: |
812 |
C ====== |
813 |
C 1. If we are only using messages in Y and using "true shared-memory" |
814 |
C in X then we rely on the shared-memory code to be implemented so that |
815 |
C corner values will be set correctly. The true shared-memory code |
816 |
C is in the block below - so be careful when altering it! |
817 |
C 2. In order to ensure we grab the right message we need to tag |
818 |
C messages. We tag the messages with the direction in which they |
819 |
C were sent out. Thus to receive from a processor to our west we |
820 |
C request the next message from that processor that was sent out |
821 |
C with a tag indicating an easterly send. |
822 |
C 3. We could implement a "safe" code here by having everyone |
823 |
C sync and then getting a particular thread ( say thread 1 ) to |
824 |
C do the MPI_Isend sequentially. In this mode the "communication" |
825 |
C thread would loop over nThreads and do MPI_Sendrecv for each |
826 |
C one. This could be more efficient on some platforms as the messages |
827 |
C might be sent as one large unit rather than by separate threads. |
828 |
C It would also be "thread-safe" for MPI implementations that have |
829 |
C multi-threading problems with things like request ids. |
830 |
C 4. We include overlap regions in both X and Y sends. |
831 |
C This means we can interchange the Y block with the X block. |
832 |
C Not sure whether this will ever be useful! |
833 |
C 5. The generation of a request handle by MPI_Isend and |
834 |
C MPI_Irecv ought to involve some global state within MPI.
835 |
C If this process is not thread safe it may be necessary to |
836 |
C enable a critical section around the MPI_Isend and |
837 |
C MPI_Irecv calls. |
838 |
|
839 |
C We need a barrier here to synchronise threads otherwise |
840 |
C one thread might declare it is ready to send and receive
841 |
C even though another thread is going to write to the first |
842 |
C threads send or receive region. This won't happen, however, |
843 |
C if we are careful about which data a thread updates - but at |
844 |
C the moment we aren't careful! |
845 |
_BARRIER |
846 |
|
847 |
C-- East-west messaging communication |
848 |
nReqs = 0 |
849 |
bE = myBxLo(myThid) |
850 |
IF ( bE .EQ. 1 .AND. commW(myThid) .EQ. COMM_MPI ) THEN |
851 |
C My west face uses MPI. Get ready to receive data from |
852 |
C west and post that we are ready to send data to the west. |
853 |
C Data we receive has to come with tag of thread we know is |
854 |
C sending from our west, data we send is tagged with our thread id |
855 |
C and the send direction. |
856 |
nReqs = nReqs+1 |
857 |
tagSend = mpiTagW*nThreads+myThid |
858 |
toPid = mpiPidW |
859 |
elCount = 1 |
860 |
elType = mpiTypeXFaceThread_xy_R8(myThid) |
861 |
C I tagSend,MPI_COMM_WORLD, |
862 |
CALL MPI_Isend( |
863 |
I phi(1,1-OLy,bE,myByLo(myThid)), |
864 |
I elCount, elType, toPid, |
865 |
I tagSend,MPI_COMM_WORLD, |
866 |
I reqIds(nReqs),mpiRC) |
867 |
nReqs = nReqs+1 |
868 |
tagRecv = mpiTagE*nThreads+myThrW(myThid) |
869 |
fromPid = mpiPidW |
870 |
elCount = 1 |
871 |
elType = mpiTypeXFaceThread_xy_R8(myThid) |
872 |
CALL MPI_Irecv( |
873 |
U phi(1-OLx,1-OLy,bE,myByLo(myThid)), |
874 |
I elCount, elType, fromPid, |
875 |
I tagRecv,MPI_COMM_WORLD, |
876 |
I reqIds(nReqs),mpiRC) |
877 |
ENDIF |
878 |
bW = myBxHi(myThid) |
879 |
IF ( bW .EQ. nSx .AND. commE(MyThid) .EQ. COMM_MPI ) THEN |
880 |
C My east face uses MPI. Get ready to receive data from |
881 |
C east and post that we are ready to send data to the east. |
882 |
C Data we receive has to come with tag of thread we know is |
883 |
C sending from our west, data we send is tagged with our thread id. |
884 |
nReqs = nReqs+1 |
885 |
tagSend = mpiTagE*nThreads+myThid |
886 |
toPid = mpiPidE |
887 |
elCount = 1 |
888 |
elType = mpiTypeXFaceThread_xy_R8(myThid) |
889 |
CALL MPI_Isend( |
890 |
I phi(sNx-OLx+1,1-OLy,bW,myByLo(myThid)), |
891 |
I elCount, elType, toPid, |
892 |
I tagSend,MPI_COMM_WORLD, |
893 |
I reqIds(nReqs),mpiRC) |
894 |
nReqs = nReqs+1 |
895 |
tagRecv = mpiTagW*nThreads+myThrE(myThid) |
896 |
fromPid = mpiPidE |
897 |
elCount = 1 |
898 |
elType = mpiTypeXFaceThread_xy_R8(myThid) |
899 |
CALL MPI_Irecv( |
900 |
U phi(sNx+1,1-OLy,bW,myByLo(myThid)), |
901 |
I elCount, elType, fromPid, |
902 |
I tagRecv,MPI_COMM_WORLD, |
903 |
I reqIds(nReqs),mpiRC) |
904 |
ENDIF |
905 |
C Wait for this threads east-west transactions to finish before |
906 |
C posting north-south transactions. We have to do this so that |
907 |
C the north-south transactions will send out the correct overlap |
908 |
C region values into the corner sections of our neighbors. |
909 |
CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC ) |
910 |
|
911 |
CcnhDebugStarts |
912 |
C RETURN |
913 |
CcnhDebugEnds |
914 |
|
915 |
|
916 |
C-- North-south messaging communication |
917 |
nReqs = 0 |
918 |
bN = myByLo(myTHid) |
919 |
IF ( bN .EQ. 1 .AND. commS(myThid) .EQ. COMM_MPI ) THEN |
920 |
C My south face uses MPI. Get ready to receive data from |
921 |
C south and post that I am ready to send data to the south. |
922 |
nReqs = nReqs + 1 |
923 |
tagSend = mpiTagS*nThreads+myThid |
924 |
toPid = mpiPidS |
925 |
elCount = 1 |
926 |
elType = mpiTypeYFaceThread_xy_R8(myThid) |
927 |
CALL MPI_Isend( |
928 |
I phi(1-OLx,1,myBxLo(myThid),bN), |
929 |
I elCount, elType, toPid, |
930 |
I tagSend,MPI_COMM_WORLD, |
931 |
I reqIds(nReqs), mpiRC ) |
932 |
nReqs = nReqs + 1 |
933 |
tagRecv = mpiTagN*nThreads+myThrS(myThid) |
934 |
fromPid = mpiPidS |
935 |
elCount = 1 |
936 |
elType = mpiTypeYFaceThread_xy_R8(myThid) |
937 |
CALL MPI_Irecv( |
938 |
I phi(1-OLx,1-OLy,myBxLo(myThid),bN), |
939 |
I elCount, elType, fromPid, |
940 |
I tagRecv,MPI_COMM_WORLD, |
941 |
I reqIds(nReqs), mpiRC ) |
942 |
ENDIF |
943 |
C |
944 |
bS = myByHi(myTHid) |
945 |
IF ( bS .EQ. nSy .AND. commN(myThid) .EQ. COMM_MPI ) THEN |
946 |
C My north face uses MPI. Get ready to receive data from |
947 |
C north and post that I am ready to send data to the north. |
948 |
nReqs = nReqs + 1 |
949 |
tagSend = mpiTagN*nThreads+myThid |
950 |
toPid = mpiPidN |
951 |
elCount = 1 |
952 |
elType = mpiTypeYFaceThread_xy_R8(myThid) |
953 |
CALL MPI_Isend( |
954 |
I phi(1-OLx,sNy-OLy+1,myBxLo(myThid),bS), |
955 |
I elCount, elType, toPid, |
956 |
I tagSend,MPI_COMM_WORLD, |
957 |
I reqIds(nReqs), mpiRC ) |
958 |
nReqs = nReqs + 1 |
959 |
tagRecv = mpiTagS*nThreads+myThrN(myThid) |
960 |
fromPid = mpiPidN |
961 |
elCount = 1 |
962 |
elType = mpiTypeYFaceThread_xy_R8(myThid) |
963 |
CALL MPI_Irecv( |
964 |
I phi(1-OLx,sNy+1,myBxLo(myThid),bS), |
965 |
I elCount, elType, fromPid, |
966 |
I tagRecv,MPI_COMM_WORLD, |
967 |
I reqIds(nReqs), mpiRC ) |
968 |
ENDIF |
969 |
C Wait for this threads north-south transactions to finish. |
970 |
CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC ) |
971 |
|
972 |
#ifndef ALWAYS_USE_MPI |
973 |
ENDIF |
974 |
#endif |
975 |
#endif /* ALLOW_USE_MPI */ |
976 |
|
977 |
#ifndef ALWAYS_USE_SYNC_COMMUNICATION |
978 |
ENDIF |
979 |
#endif |
980 |
#endif /* ALLOW_SYNC_COMMUNICATION */ |
981 |
|
982 |
C-- Do true shared-memory data exchanges |
983 |
C Note: This section also does the overlap copies for serial |
984 |
C code overlap copies. |
985 |
C-- First make sure all threads have reached here |
986 |
_BARRIER |
987 |
|
988 |
C-- Now do copies |
989 |
C Notes: |
990 |
C ====== |
991 |
C Here we copy from up to and including the overlap |
992 |
C regions for both the X shift and the Y shift. This |
993 |
C catches the "corners" of the data copied using messages |
994 |
C in the "synchronous communication" section above. |
995 |
C-- x-axis exchanges |
996 |
bE = myBxLo(myThid) |
997 |
IF ( bE .NE. 1 .OR. |
998 |
& commW(myThid) .EQ. COMM_SHARED ) THEN |
999 |
bW = bE-1 |
1000 |
IF ( bW .LT. 1 ) bW = nSx |
1001 |
DO bj=myByLo(myThid),myByHi(myThid) |
1002 |
DO J=1-OLy,sNy+OLy |
1003 |
DO I=1,OLx |
1004 |
phi(sNx+I,J,bW,bj)=phi( 1+I-1,J,bE,bj) |
1005 |
phi(1-I ,J,bE,bj)=phi(sNx-I+1,J,bW,bj) |
1006 |
ENDDO |
1007 |
ENDDO |
1008 |
ENDDO |
1009 |
ENDIF |
1010 |
DO bE=myBxLo(myThid)+1,myBxHi(myThid) |
1011 |
bW = bE-1 |
1012 |
DO bj=myByLo(myThid),myByHi(myThid) |
1013 |
DO J=1-OLy,sNy+OLy |
1014 |
DO I=1,OLx |
1015 |
phi(sNx+I,J,bW,bj)=phi( 1+I-1,J,bE,bj) |
1016 |
phi(1-I ,J,bE,bj)=phi(sNx-I+1,J,bW,bj) |
1017 |
ENDDO |
1018 |
ENDDO |
1019 |
ENDDO |
1020 |
ENDDO |
1021 |
|
1022 |
C Need to ensure all threads have completed x-axis transfers before |
1023 |
C we can do y-axis exchanges. |
1024 |
_BARRIER |
1025 |
|
1026 |
C-- y-axis exchanges |
1027 |
bN = myByLo(myThid) |
1028 |
IF ( bN .NE. 1 .OR. |
1029 |
& commS(myThid) .EQ. COMM_SHARED ) THEN |
1030 |
bS = bN - 1 |
1031 |
IF ( bS .LT. 1 ) bS = nSy |
1032 |
DO bi=myBxLo(myThid),myBxHi(myThid) |
1033 |
DO J=1,OLy |
1034 |
DO I=1-OLx,sNx+OLx |
1035 |
phi(I,1-J ,bi,bN)=phi(I,sNy-J+1,bi,bS) |
1036 |
phi(I,sNy+J ,bi,bS)=phi(I,1+J-1 ,bi,bN) |
1037 |
ENDDO |
1038 |
ENDDO |
1039 |
ENDDO |
1040 |
ENDIF |
1041 |
DO bN=myByLo(myThid)+1,myByHi(myThid) |
1042 |
bS = bN - 1 |
1043 |
DO bi=myBxLo(myThid),myBxHi(myThid) |
1044 |
DO J=1,OLy |
1045 |
DO I=1-OLx,sNx+OLx |
1046 |
phi(I,1-J ,bi,bN)=phi(I,sNy-J+1,bi,bS) |
1047 |
phi(I,sNy+J ,bi,bS)=phi(I,1+J-1 ,bi,bN) |
1048 |
ENDDO |
1049 |
ENDDO |
1050 |
ENDDO |
1051 |
ENDDO |
1052 |
|
1053 |
_BARRIER |
1054 |
|
1055 |
RETURN |
1056 |
END |
1057 |
|
1058 |
CStartOfInterface
      SUBROUTINE EXCH_XYZ_R8(
     U           phi,
     I           myThid )
C     /==========================================================\
C     | SUBROUTINE EXCH_XYZ_R8                                   |
C     | o Handle exchanges for real*8, three-dimensional arrays. |
C     |==========================================================|
C     | Do true shared-memory data transfers and "messaging"     |
C     | transfers for blocking case of data transfers.           |
C     | Applications call this routine using                     |
C     |  CALL EXCH..( x, myThid )                                |
C     | where x is a three-dimensional array with overlaps.      |
C     | This routine does true-shared-memory copies for blocks   |
C     | within a thread. It will also do MPI messaging between   |
C     | different processes.                                     |
C     | Note:                                                    |
C     | =====                                                    |
C     | If it is used, "asynchronous" messaging in which         |
C     | communication overlaps computation is handled elsewhere  |
C     | - see recv.F and send.F.                                 |
C     | In practice MPI implementations may not be completely    |
C     | thread safe. In principle this code is correct even for  |
C     | mixed MPI and multi-threaded execution.                  |
C     | In multi-thread execution data managed by other threads  |
C     | is only automatically visible to another thread if it    |
C     | existed before the thread was created. This means that   |
C     | for Fortran we declare arrays with overlap regions in    |
C     | COMMON blocks.                                           |
C     \==========================================================/

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"

C     === Routine arguments ===
C     phi    - Array whose overlap regions are to be exchanged.
C              Indexed (x with OLx overlap, y with OLy overlap,
C              level 1:Nz, block bi 1:nSx, block bj 1:nSy).
C     myThid - My thread id.
      Real*8 phi(1-OLx:sNx+OLx,1-OLy:sNy+OLy,1:Nz,nSx,nSy)
      INTEGER myThid
CEndOfInterface

C     === Local variables ===
C     bi,bj  - Outer loop counters (block indices)
C     I,J,K  - Inner loop counters
      INTEGER bi, bj
      INTEGER I, J, K
#ifdef ALLOW_USE_MPI
C     tagSend    - Tags used to mark and select messages.
C     tagRecv      Tags encode direction*nThreads+thread so a
C                  receiver can select the message sent in a
C                  specific direction by a specific thread.
C     nReqs      - MPI request counter.
C     reqIds     - Outstanding requests to wait on.
C     mpiRC      - MPI return code.
C     mpiStatArr - Multi-request status reporting array.
C     toPid      - Proc. id sending to.
C     fromPid    - Proc. id receiving from.
C     elCount    - Number of items to send or receive.
C     elType     - Data type of elements. The mpiType*Thread_xyz_R8
C                  handles are MPI derived datatypes describing one
C                  face slab - NOTE(review): presumably set up in
C                  EESUPPORT.h/ini_procs; confirm there.
      INTEGER tagSend
      INTEGER tagRecv
      INTEGER nReqs
      INTEGER reqIds(4)
      INTEGER mpiRC
      INTEGER mpiStatArr(MPI_STATUS_SIZE,4)
      INTEGER toPid
      INTEGER fromPid
      INTEGER elCount
      INTEGER elType
#endif /* ALLOW_USE_MPI */
C     bE, bW - Block index to east, west, north and
C     bN, bS   south.
      INTEGER bW, bE, bS, bN

C--   Do "message-passing" data exchanges
#ifdef ALLOW_SYNC_COMMUNICATION
#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      IF ( usingSyncMessages ) THEN
#endif

C--   MPI based data exchanges
#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif

C     Here use the approach that all the X shifts are completed
C     before the Y shifts start. Then we can expand our Y shifts
C     to include X overlap regions. This means we don't have to
C     transfer corners separately.
C     Notes:
C     ======
C     1. If we are only using messages in Y and using "true shared-memory"
C     in X then we rely on the shared-memory code to be implemented so that
C     corner values will be set correctly. The true shared-memory code
C     is in the block below - so be careful when altering it!
C     2. In order to ensure we grab the right message we need to tag
C     messages. We tag the messages with the direction in which they
C     were sent out. Thus to receive from a processor to our west we
C     request the next message from that processor that was sent out
C     with a tag indicating an easterly send.
C     3. We could implement a "safe" code here by having everyone
C     sync and then getting a particular thread ( say thread 1 ) to
C     do the MPI_Isend sequentially. In this mode the "communication"
C     thread would loop over nThreads and do MPI_Sendrecv for each
C     one. This could be more efficient on some platforms as the messages
C     might be sent as one large unit rather than by separate threads.
C     It would also be "thread-safe" for MPI implementations that have
C     multi-threading problems with things like request ids.
C     4. We include overlap regions in both X and Y sends.
C     This means we can interchange the Y block with the X block.
C     Not sure whether this will ever be useful!
C     5. The generation of a request handle by MPI_Isend and
C     MPI_Irecv ought to involve some global state within MPI.
C     If this process is not thread safe it may be necessary to
C     enable a critical section around the MPI_Isend and
C     MPI_Irecv calls.

C     We need a barrier here to synchronise threads otherwise
C     one thread might declare it is ready to send and receive
C     even though another thread is going to write to the first
C     threads send or receive region. This won't happen, however,
C     if we are careful about which data a thread updates - but at
C     the moment we aren't careful!
      _BARRIER

C--   East-west messaging communication
C     bE is this thread's westernmost block, bW its easternmost.
C     Phase 1: send my west face westward, receive my east overlap
C     from the east. Phase 2 (after a Waitall): the mirror exchange.
      bE = myBxLo(myThid)
      bW = myBxHi(myThid)
      nReqs   = 0
      IF ( bE .EQ. 1 .AND. commW(myThid) .EQ. COMM_MPI ) THEN
C      My west face uses MPI. Get ready to receive data from
C      west and post that we are ready to send data to the west.
C      Data we receive has to come with tag of thread we know is
C      sending from our west, data we send is tagged with our thread id
C      and the send direction.
       nReqs   = nReqs+1
       tagSend = mpiTagW*nThreads+myThid
       toPid   = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_R8(myThid)
C      Send interior columns I=1,... (width set by the face datatype).
       CALL MPI_Isend(
     I   phi(1,1-OLy,1,bE,myByLo(myThid)),
     I   elCount, elType, toPid,
     I   tagSend,MPI_COMM_WORLD,
     I   reqIds(nReqs),mpiRC)
      ENDIF
      IF ( bW .EQ. nSx .AND. commE(MyThid) .EQ. COMM_MPI ) THEN
C      Receive from the east into my east overlap region, selecting
C      the message our eastern neighbour's thread sent westward.
       nReqs   = nReqs+1
       tagRecv = mpiTagW*nThreads+myThrE(myThid)
       fromPid = mpiPidE
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_R8(myThid)
       CALL MPI_Irecv(
     U   phi(sNx+1,1-OLy,1,bW,myByLo(myThid)),
     I   elCount, elType, fromPid,
     I   tagRecv,MPI_COMM_WORLD,
     I   reqIds(nReqs),mpiRC)
      ENDIF
C     Complete phase 1 before posting the mirror-image transfers.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

      nReqs = 0
      IF ( bW .EQ. nSx .AND. commE(MyThid) .EQ. COMM_MPI ) THEN
C      My east face uses MPI. Get ready to receive data from
C      east and post that we are ready to send data to the east.
C      Data we receive has to come with tag of thread we know is
C      sending from our west, data we send is tagged with our thread id.
       nReqs   = nReqs+1
       tagSend = mpiTagE*nThreads+myThid
       toPid   = mpiPidE
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_R8(myThid)
C      Send my eastern interior columns eastward.
       CALL MPI_Isend(
     I   phi(sNx-OLx+1,1-OLy,1,bW,myByLo(myThid)),
     I   elCount, elType, toPid,
     I   tagSend,MPI_COMM_WORLD,
     I   reqIds(nReqs),mpiRC)
      ENDIF
      IF ( bE .EQ. 1 .AND. commW(myThid) .EQ. COMM_MPI ) THEN
C      Receive from the west into my west overlap region.
       nReqs   = nReqs+1
       tagRecv = mpiTagE*nThreads+myThrW(myThid)
       fromPid = mpiPidW
       elCount = 1
       elType  = mpiTypeXFaceThread_xyz_R8(myThid)
       CALL MPI_Irecv(
     U   phi(1-OLx,1-OLy,1,bE,myByLo(myThid)),
     I   elCount, elType, fromPid,
     I   tagRecv,MPI_COMM_WORLD,
     I   reqIds(nReqs),mpiRC)
      ENDIF

C     Wait for this threads east-west transactions to finish before
C     posting north-south transactions. We have to do this so that
C     the north-south transactions will send out the correct overlap
C     region values into the corner sections of our neighbors.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

CcnhDebugStarts
C     RETURN
CcnhDebugEnds


C--   North-south messaging communication
C     bN is this thread's southernmost block, bS its northernmost.
C     Same two-phase pattern as east-west above; sends include the
C     X overlaps so neighbour corners are filled (see note 4).
      nReqs = 0
      bN = myByLo(myTHid)
      bS = myByHi(myTHid)

      IF ( bN .EQ. 1 .AND. commS(myThid) .EQ. COMM_MPI ) THEN
C      My south face uses MPI. Get ready to receive data from
C      south and post that I am ready to send data to the south.
       nReqs   = nReqs + 1
       tagSend = mpiTagS*nThreads+myThid
       toPid   = mpiPidS
       elCount = 1
       elType  = mpiTypeYFaceThread_xyz_R8(myThid)
       CALL MPI_Isend(
     I   phi(1-OLx,1,1,myBxLo(myThid),bN),
     I   elCount, elType, toPid,
     I   tagSend,MPI_COMM_WORLD,
     I   reqIds(nReqs), mpiRC )
      ENDIF
      IF ( bS .EQ. nSy .AND. commN(myThid) .EQ. COMM_MPI ) THEN
C      Receive from the north into my north overlap region.
       nReqs   = nReqs + 1
       tagRecv = mpiTagS*nThreads+myThrN(myThid)
       fromPid = mpiPidN
       elCount = 1
       elType  = mpiTypeYFaceThread_xyz_R8(myThid)
       CALL MPI_Irecv(
     I   phi(1-OLx,sNy+1,1,myBxLo(myThid),bS),
     I   elCount, elType, fromPid,
     I   tagRecv,MPI_COMM_WORLD,
     I   reqIds(nReqs), mpiRC )
      ENDIF
C     Wait for this threads north-south transactions to finish.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )
C
      nReqs = 0
      IF ( bS .EQ. nSy .AND. commN(myThid) .EQ. COMM_MPI ) THEN
C      My north face uses MPI. Get ready to receive data from
C      north and post that I am ready to send data to the north.
       nReqs   = nReqs + 1
       tagSend = mpiTagN*nThreads+myThid
       toPid   = mpiPidN
       elCount = 1
       elType  = mpiTypeYFaceThread_xyz_R8(myThid)
       CALL MPI_Isend(
     I   phi(1-OLx,sNy-OLy+1,1,myBxLo(myThid),bS),
     I   elCount, elType, toPid,
     I   tagSend,MPI_COMM_WORLD,
     I   reqIds(nReqs), mpiRC )
      ENDIF
      IF ( bN .EQ. 1 .AND. commS(myThid) .EQ. COMM_MPI ) THEN
C      Receive from the south into my south overlap region.
       nReqs   = nReqs + 1
       tagRecv = mpiTagN*nThreads+myThrS(myThid)
       fromPid = mpiPidS
       elCount = 1
       elType  = mpiTypeYFaceThread_xyz_R8(myThid)
       CALL MPI_Irecv(
     I   phi(1-OLx,1-OLy,1,myBxLo(myThid),bN),
     I   elCount, elType, fromPid,
     I   tagRecv,MPI_COMM_WORLD,
     I   reqIds(nReqs), mpiRC )
      ENDIF
C     Wait for this threads north-south transactions to finish.
      CALL MPI_Waitall( nReqs, reqIds, mpiStatArr, mpiRC )

#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

#ifndef ALWAYS_USE_SYNC_COMMUNICATION
      ENDIF
#endif
#endif /* ALLOW_SYNC_COMMUNICATION */

C--   Do true shared-memory data exchanges
C     Note: This section also does the overlap copies for serial
C     code overlap copies.
C--   First make sure all threads have reached here
      _BARRIER

C--   Now do copies
C     Notes:
C     ======
C     Here we copy from up to and including the overlap
C     regions for both the X shift and the Y shift. This
C     catches the "corners" of the data copied using messages
C     in the "synchronous communication" section above.
C--   x-axis exchanges
C     First handle the cyclic wrap between this thread's westernmost
C     block and its western neighbour (when that edge is shared
C     memory), then the interior block-to-block edges.
      bE = myBxLo(myThid)
      IF ( bE .NE. 1 .OR.
     &     commW(myThid) .EQ. COMM_SHARED ) THEN
       bW = bE-1
       IF ( bW .LT. 1 ) bW = nSx
       DO bj=myByLo(myThid),myByHi(myThid)
        DO K=1,Nz
         DO J=1-OLy,sNy+OLy
          DO I=1,OLx
           phi(sNx+I,J,K,bW,bj)=phi(  1+I-1,J,K,bE,bj)
           phi(1-I  ,J,K,bE,bj)=phi(sNx-I+1,J,K,bW,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDIF
      DO bE=myBxLo(myThid)+1,myBxHi(myThid)
       bW = bE-1
       DO bj=myByLo(myThid),myByHi(myThid)
        DO K=1,Nz
         DO J=1-OLy,sNy+OLy
          DO I=1,OLx
           phi(sNx+I,J,K,bW,bj)=phi(  1+I-1,J,K,bE,bj)
           phi(1-I  ,J,K,bE,bj)=phi(sNx-I+1,J,K,bW,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDDO

C     Need to ensure all threads have completed x-axis transfers before
C     we can do y-axis exchanges.
      _BARRIER

C--   y-axis exchanges
C     Same pattern as the x-axis: wrap edge first, then interior
C     edges. I spans the full overlap width so corners are filled.
      bN = myByLo(myThid)
      IF ( bN .NE. 1 .OR.
     &     commS(myThid) .EQ. COMM_SHARED ) THEN
       bS = bN - 1
       IF ( bS .LT. 1 ) bS = nSy
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO K=1,Nz
         DO J=1,OLy
          DO I=1-OLx,sNx+OLx
           phi(I,1-J  ,K,bi,bN)=phi(I,sNy-J+1,K,bi,bS)
           phi(I,sNy+J,K,bi,bS)=phi(I,1+J-1  ,K,bi,bN)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDIF
      DO bN=myByLo(myThid)+1,myByHi(myThid)
       bS = bN - 1
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO K=1,Nz
         DO J=1,OLy
          DO I=1-OLx,sNx+OLx
           phi(I,1-J  ,K,bi,bN)=phi(I,sNy-J+1,K,bi,bS)
           phi(I,sNy+J,K,bi,bS)=phi(I,1+J-1  ,K,bi,bN)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDDO

C--   Make sure all threads finish filling overlaps before returning.
      _BARRIER

      RETURN
      END