1 |
C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.3 2009/06/10 03:47:03 jmc Exp $ |
2 |
C $Name: $ |
3 |
|
4 |
#include "CPP_EEOPTIONS.h" |
5 |
|
6 |
C-- File global_sum_tile.F: Routines that perform global sum |
7 |
C on a tile array |
8 |
C Contents |
9 |
C o GLOBAL_SUM_TILE_RL |
10 |
C o GLOBAL_SUM_TILE_RS <- not yet coded |
11 |
|
12 |
C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----| |
13 |
CBOP |
14 |
C !ROUTINE: GLOBAL_SUM_TILE_RL |
15 |
|
16 |
C !INTERFACE: |
17 |
SUBROUTINE GLOBAL_SUM_TILE_RL( |
18 |
I phiTile, |
19 |
O sumPhi, |
20 |
I myThid ) |
21 |
|
22 |
C !DESCRIPTION: |
23 |
C *==========================================================* |
24 |
C | SUBROUTINE GLOBAL\_SUM\_TILE\_RL |
25 |
C | o Handle sum for _RL data. |
26 |
C *==========================================================* |
27 |
C | Apply sum on an array of one value per tile |
28 |
C | and operate over all tiles & all the processes. |
29 |
C *==========================================================* |
30 |
|
31 |
C !USES: |
32 |
IMPLICIT NONE |
33 |
|
34 |
C == Global data == |
35 |
#include "SIZE.h" |
36 |
#include "EEPARAMS.h" |
37 |
#include "EESUPPORT.h" |
38 |
#include "GLOBAL_SUM.h" |
39 |
|
40 |
C !INPUT/OUTPUT PARAMETERS: |
41 |
C == Routine arguments == |
42 |
C phiTile :: Input array with one value per tile |
43 |
C sumPhi :: Result of sum. |
44 |
C myThid :: My thread id. |
45 |
_RL phiTile(nSx,nSy) |
46 |
_RL sumPhi |
47 |
INTEGER myThid |
48 |
|
49 |
C !LOCAL VARIABLES: |
50 |
C == Local variables == |
51 |
C bi,bj :: Loop counters |
52 |
C mpiRC :: MPI return code |
53 |
C- type declaration of: sumMyPr, sumAllP, localBuf and shareBufGSR8 : |
54 |
C all 4 needs to have the same length as MPI_DOUBLE_PRECISION |
55 |
INTEGER bi,bj |
56 |
#ifdef ALLOW_USE_MPI |
57 |
#ifdef GLOBAL_SUM_SEND_RECV |
58 |
INTEGER biG, bjG, np, pId |
59 |
INTEGER lbuff, idest, itag, ready_to_receive |
60 |
INTEGER istatus(MPI_STATUS_SIZE), ierr |
61 |
Real*8 localBuf (nSx,nSy) |
62 |
Real*8 globalBuf(nSx*nPx,nSy*nPy) |
63 |
#endif |
64 |
INTEGER mpiRC |
65 |
#endif /* ALLOW_USE_MPI */ |
66 |
Real*8 sumMyPr |
67 |
Real*8 sumAllP |
68 |
CEOP |
69 |
|
70 |
C this barrier is not necessary: |
71 |
c CALL BAR2( myThid ) |
72 |
|
73 |
C-- write local sum into shared-buffer array |
74 |
DO bj = myByLo(myThid), myByHi(myThid) |
75 |
DO bi = myBxLo(myThid), myBxHi(myThid) |
76 |
shareBufGSR8(bi,bj) = phiTile(bi,bj) |
77 |
ENDDO |
78 |
ENDDO |
79 |
|
80 |
C-- Master thread cannot start until everyone is ready: |
81 |
CALL BAR2( myThid ) |
82 |
_BEGIN_MASTER( myThid ) |
83 |
|
84 |
#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) ) |
85 |
IF ( usingMPI ) THEN |
86 |
|
87 |
lbuff = nSx*nSy |
88 |
idest = 0 |
89 |
itag = 0 |
90 |
ready_to_receive = 0 |
91 |
|
92 |
IF ( mpiMyId.NE.0 ) THEN |
93 |
|
94 |
C-- All proceses except 0 wait to be polled then send local array |
95 |
#ifndef DISABLE_MPI_READY_TO_RECEIVE |
96 |
CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER, |
97 |
& idest, itag, MPI_COMM_MODEL, istatus, ierr) |
98 |
#endif |
99 |
CALL MPI_SEND (shareBufGSR8, lbuff, MPI_DOUBLE_PRECISION, |
100 |
& idest, itag, MPI_COMM_MODEL, ierr) |
101 |
|
102 |
C-- All proceses except 0 receive result from process 0 |
103 |
CALL MPI_RECV (sumAllP, 1, MPI_DOUBLE_PRECISION, |
104 |
& idest, itag, MPI_COMM_MODEL, istatus, ierr) |
105 |
|
106 |
ELSE |
107 |
C- case mpiMyId = 0 |
108 |
|
109 |
C-- Process 0 fills-in its local data |
110 |
np = 1 |
111 |
DO bj=1,nSy |
112 |
DO bi=1,nSx |
113 |
biG = (mpi_myXGlobalLo(np)-1)/sNx+bi |
114 |
bjG = (mpi_myYGlobalLo(np)-1)/sNy+bj |
115 |
globalBuf(biG,bjG) = shareBufGSR8(bi,bj) |
116 |
ENDDO |
117 |
ENDDO |
118 |
|
119 |
C-- Process 0 polls and receives data from each process in turn |
120 |
DO np = 2, nPx*nPy |
121 |
pId = np - 1 |
122 |
#ifndef DISABLE_MPI_READY_TO_RECEIVE |
123 |
CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER, |
124 |
& pId, itag, MPI_COMM_MODEL, ierr) |
125 |
#endif |
126 |
CALL MPI_RECV (localBuf, lbuff, MPI_DOUBLE_PRECISION, |
127 |
& pId, itag, MPI_COMM_MODEL, istatus, ierr) |
128 |
|
129 |
C-- Process 0 gathers the local arrays into a global array. |
130 |
DO bj=1,nSy |
131 |
DO bi=1,nSx |
132 |
biG = (mpi_myXGlobalLo(np)-1)/sNx+bi |
133 |
bjG = (mpi_myYGlobalLo(np)-1)/sNy+bj |
134 |
globalBuf(biG,bjG) = localBuf(bi,bj) |
135 |
ENDDO |
136 |
ENDDO |
137 |
C- end loop on np |
138 |
ENDDO |
139 |
|
140 |
C-- Sum over all tiles: |
141 |
sumAllP = 0. |
142 |
DO bjG = 1,nSy*nPy |
143 |
DO biG = 1,nSx*nPx |
144 |
sumAllP = sumAllP + globalBuf(biG,bjG) |
145 |
ENDDO |
146 |
ENDDO |
147 |
|
148 |
C-- Process 0 sends result to all other processes |
149 |
lbuff = 1 |
150 |
DO np = 2, nPx*nPy |
151 |
pId = np - 1 |
152 |
CALL MPI_SEND (sumAllP, 1, MPI_DOUBLE_PRECISION, |
153 |
& pId, itag, MPI_COMM_MODEL, ierr) |
154 |
ENDDO |
155 |
|
156 |
C End if/else mpiMyId = 0 |
157 |
ENDIF |
158 |
|
159 |
ELSE |
160 |
#else /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */ |
161 |
IF ( .TRUE. ) THEN |
162 |
#endif /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */ |
163 |
|
164 |
C-- Sum over all tiles (of the same process) first |
165 |
sumMyPr = 0. |
166 |
DO bj = 1,nSy |
167 |
DO bi = 1,nSx |
168 |
sumMyPr = sumMyPr + shareBufGSR8(bi,bj) |
169 |
ENDDO |
170 |
ENDDO |
171 |
|
172 |
C in case MPI is not used: |
173 |
sumAllP = sumMyPr |
174 |
|
175 |
#ifdef ALLOW_USE_MPI |
176 |
IF ( usingMPI ) THEN |
177 |
CALL MPI_Allreduce(sumMyPr,sumAllP,1,MPI_DOUBLE_PRECISION, |
178 |
& MPI_SUM,MPI_COMM_MODEL,mpiRC) |
179 |
ENDIF |
180 |
#endif /* ALLOW_USE_MPI */ |
181 |
|
182 |
ENDIF |
183 |
|
184 |
C-- Write solution to shared buffer (all threads can see it) |
185 |
c shareBufGSR8(1,1) = sumAllP |
186 |
phiGSR8(1,0) = sumAllP |
187 |
|
188 |
_END_MASTER( myThid ) |
189 |
C-- Everyone wait for Master thread to be ready |
190 |
CALL BAR2( myThid ) |
191 |
|
192 |
C-- set result for every threads |
193 |
c sumPhi = shareBufGSR8(1,1) |
194 |
sumPhi = phiGSR8(1,0) |
195 |
|
196 |
C-- A barrier was needed here to prevent thread 1 to modify shareBufGSR8(1,1) |
197 |
C (as it would in the following call to this S/R) before all threads get |
198 |
C their global-sum result out. |
199 |
C No longer needed since a dedicated shared var. is used to share the output |
200 |
c CALL BAR2( myThid ) |
201 |
|
202 |
RETURN |
203 |
END |