1 |
C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.2 2009/06/07 15:19:41 jmc Exp $ |
2 |
C $Name: $ |
3 |
|
4 |
#include "CPP_EEOPTIONS.h" |
5 |
|
6 |
C-- File global_sum_tile.F: Routines that perform global sum |
7 |
C on a tile array |
8 |
C Contents |
9 |
C o GLOBAL_SUM_TILE_RL |
10 |
C o GLOBAL_SUM_TILE_RS <- not yet coded |
11 |
|
12 |
C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----| |
13 |
CBOP |
14 |
C !ROUTINE: GLOBAL_SUM_TILE_RL |
15 |
|
16 |
C !INTERFACE: |
17 |
SUBROUTINE GLOBAL_SUM_TILE_RL( |
18 |
I phiTile, |
19 |
O sumPhi, |
20 |
I myThid ) |
21 |
|
22 |
C !DESCRIPTION: |
23 |
C *==========================================================* |
24 |
C | SUBROUTINE GLOBAL\_SUM\_TILE\_RL |
25 |
C | o Handle sum for _RL data. |
26 |
C *==========================================================* |
27 |
C | Apply sum on an array of one value per tile |
28 |
C | and operate over all tiles & all the processes. |
29 |
C *==========================================================* |
30 |
|
31 |
C !USES: |
32 |
IMPLICIT NONE |
33 |
|
34 |
C == Global data == |
35 |
#include "SIZE.h" |
36 |
#include "EEPARAMS.h" |
37 |
#include "EESUPPORT.h" |
38 |
#include "GLOBAL_SUM.h" |
39 |
|
40 |
C !INPUT/OUTPUT PARAMETERS: |
41 |
C == Routine arguments == |
42 |
C phiTile :: Input array with one value per tile |
43 |
C sumPhi :: Result of sum. |
44 |
C myThid :: My thread id. |
45 |
_RL phiTile(nSx,nSy) |
46 |
_RL sumPhi |
47 |
INTEGER myThid |
48 |
|
49 |
C !LOCAL VARIABLES: |
50 |
C == Local variables == |
51 |
C bi,bj :: Loop counters |
52 |
C mpiRC - MPI return code |
53 |
C- type declaration of: sumMyPr, sumAllP, localBuf and shareBufGSR8 : |
54 |
C all 4 needs to have the same length as MPI_DOUBLE_PRECISION |
55 |
INTEGER bi,bj |
56 |
#ifdef ALLOW_USE_MPI |
57 |
#ifdef GLOBAL_SUM_SEND_RECV |
58 |
INTEGER biG, bjG, npe, np1 |
59 |
INTEGER lbuff, idest, itag, ready_to_receive |
60 |
INTEGER istatus(MPI_STATUS_SIZE), ierr |
61 |
Real*8 localBuf (nSx,nSy) |
62 |
Real*8 globalBuf(nSx*nPx,nSy*nPy) |
63 |
#else |
64 |
INTEGER mpiRC |
65 |
Real*8 sumMyPr |
66 |
#endif |
67 |
#else /* ALLOW_USE_MPI */ |
68 |
Real*8 sumMyPr |
69 |
#endif /* ALLOW_USE_MPI */ |
70 |
Real*8 sumAllP |
71 |
CEOP |
72 |
|
73 |
C this barrier is not necessary: |
74 |
c CALL BAR2( myThid ) |
75 |
|
76 |
C-- write local sum into shared-buffer array |
77 |
DO bj = myByLo(myThid), myByHi(myThid) |
78 |
DO bi = myBxLo(myThid), myBxHi(myThid) |
79 |
shareBufGSR8(bi,bj) = phiTile(bi,bj) |
80 |
ENDDO |
81 |
ENDDO |
82 |
|
83 |
C-- Master thread cannot start until everyone is ready: |
84 |
CALL BAR2( myThid ) |
85 |
_BEGIN_MASTER( myThid ) |
86 |
|
87 |
#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) ) |
88 |
|
89 |
lbuff = nSx*nSy |
90 |
idest = 0 |
91 |
itag = 0 |
92 |
ready_to_receive = 0 |
93 |
|
94 |
IF ( mpiMyId.NE.0 ) THEN |
95 |
|
96 |
C-- All proceses except 0 wait to be polled then send local array |
97 |
#ifndef DISABLE_MPI_READY_TO_RECEIVE |
98 |
CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER, |
99 |
& idest, itag, MPI_COMM_MODEL, istatus, ierr) |
100 |
#endif |
101 |
CALL MPI_SEND (shareBufGSR8, lbuff, MPI_DOUBLE_PRECISION, |
102 |
& idest, itag, MPI_COMM_MODEL, ierr) |
103 |
|
104 |
C-- All proceses except 0 receive result from process 0 |
105 |
CALL MPI_RECV (sumAllP, 1, MPI_DOUBLE_PRECISION, |
106 |
& idest, itag, MPI_COMM_MODEL, istatus, ierr) |
107 |
|
108 |
ELSE |
109 |
|
110 |
C-- Process 0 fills-in its local data |
111 |
np1 = 1 |
112 |
DO bj=1,nSy |
113 |
DO bi=1,nSx |
114 |
biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi |
115 |
bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj |
116 |
globalBuf(biG,bjG) = shareBufGSR8(bi,bj) |
117 |
ENDDO |
118 |
ENDDO |
119 |
|
120 |
C-- Process 0 polls and receives data from each process in turn |
121 |
DO npe = 1, numberOfProcs-1 |
122 |
#ifndef DISABLE_MPI_READY_TO_RECEIVE |
123 |
CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER, |
124 |
& npe, itag, MPI_COMM_MODEL, ierr) |
125 |
#endif |
126 |
CALL MPI_RECV (localBuf, lbuff, MPI_DOUBLE_PRECISION, |
127 |
& npe, itag, MPI_COMM_MODEL, istatus, ierr) |
128 |
|
129 |
C-- Process 0 gathers the local arrays into a global array. |
130 |
np1 = npe + 1 |
131 |
DO bj=1,nSy |
132 |
DO bi=1,nSx |
133 |
biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi |
134 |
bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj |
135 |
globalBuf(biG,bjG) = localBuf(bi,bj) |
136 |
ENDDO |
137 |
ENDDO |
138 |
ENDDO |
139 |
|
140 |
C-- Sum over all tiles: |
141 |
sumAllP = 0. |
142 |
DO bjG = 1,nSy*nPy |
143 |
DO biG = 1,nSx*nPx |
144 |
sumAllP = sumAllP + globalBuf(biG,bjG) |
145 |
ENDDO |
146 |
ENDDO |
147 |
|
148 |
C-- Process 0 sends result to all other processes |
149 |
lbuff = 1 |
150 |
DO npe = 1, numberOfProcs-1 |
151 |
CALL MPI_SEND (sumAllP, 1, MPI_DOUBLE_PRECISION, |
152 |
& npe, itag, MPI_COMM_MODEL, ierr) |
153 |
ENDDO |
154 |
|
155 |
|
156 |
ENDIF |
157 |
|
158 |
#else /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */ |
159 |
|
160 |
C-- Sum over all tiles (of the same process) first |
161 |
sumMyPr = 0. |
162 |
DO bj = 1,nSy |
163 |
DO bi = 1,nSx |
164 |
sumMyPr = sumMyPr + shareBufGSR8(bi,bj) |
165 |
ENDDO |
166 |
ENDDO |
167 |
|
168 |
C in case MPI is not used: |
169 |
sumAllP = sumMyPr |
170 |
|
171 |
#ifdef ALLOW_USE_MPI |
172 |
#ifndef ALWAYS_USE_MPI |
173 |
IF ( usingMPI ) THEN |
174 |
#endif |
175 |
CALL MPI_Allreduce(sumMyPr,sumAllP,1,MPI_DOUBLE_PRECISION, |
176 |
& MPI_SUM,MPI_COMM_MODEL,mpiRC) |
177 |
#ifndef ALWAYS_USE_MPI |
178 |
ENDIF |
179 |
#endif |
180 |
#endif /* ALLOW_USE_MPI */ |
181 |
|
182 |
#endif /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */ |
183 |
|
184 |
C-- Write solution to shared buffer (all threads can see it) |
185 |
c shareBufGSR8(1,1) = sumAllP |
186 |
phiGSR8(1,0) = sumAllP |
187 |
|
188 |
_END_MASTER( myThid ) |
189 |
C-- Everyone wait for Master thread to be ready |
190 |
CALL BAR2( myThid ) |
191 |
|
192 |
C-- set result for every threads |
193 |
c sumPhi = shareBufGSR8(1,1) |
194 |
sumPhi = phiGSR8(1,0) |
195 |
|
196 |
C-- A barrier was needed here to prevent thread 1 to modify shareBufGSR8(1,1) |
197 |
C (as it would in the following call to this S/R) before all threads get |
198 |
C their global-sum result out. |
199 |
C No longer needed since a dedicated shared var. is used to share the output |
200 |
c CALL BAR2( myThid ) |
201 |
|
202 |
RETURN |
203 |
END |