C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.2 2009/06/07 15:19:41 jmc Exp $
C $Name: $

#include "CPP_EEOPTIONS.h"

C--   File global_sum_tile.F: Routines that perform global sum
C                             on a tile array
C      Contents
C      o GLOBAL_SUM_TILE_RL
C      o GLOBAL_SUM_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: GLOBAL_SUM_TILE_RL

C     !INTERFACE:
      SUBROUTINE GLOBAL_SUM_TILE_RL(
     I                       phiTile,
     O                       sumPhi,
     I                       myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE GLOBAL\_SUM\_TILE\_RL
C     | o Handle sum for _RL data.
C     *==========================================================*
C     | Apply sum on an array of one value per tile
C     |  and operate over all tiles & all the processes.
C     *==========================================================*
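C
C     Typical use, as a sketch (tileSum & globSum are hypothetical
C     names; any _RL scalar and (nSx,nSy) array will do):
C       _RL tileSum(nSx,nSy), globSum
C       ... store one partial sum per tile in tileSum ...
C       CALL GLOBAL_SUM_TILE_RL( tileSum, globSum, myThid )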

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "GLOBAL_SUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     phiTile :: Input array with one value per tile
C     sumPhi  :: Result of sum.
C     myThid  :: My thread id.
      _RL     phiTile(nSx,nSy)
      _RL     sumPhi
      INTEGER myThid

C     !LOCAL VARIABLES:
C     == Local variables ==
C     bi,bj :: Loop counters
C     mpiRC :: MPI return code
C-    type declaration of sumMyPr, sumAllP, localBuf and shareBufGSR8:
C     all four need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
#ifdef ALLOW_USE_MPI
#ifdef GLOBAL_SUM_SEND_RECV
      INTEGER biG, bjG, npe, np1
      INTEGER lbuff, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  localBuf (nSx,nSy)
      Real*8  globalBuf(nSx*nPx,nSy*nPy)
#else
      INTEGER mpiRC
      Real*8  sumMyPr
#endif
#else /* ALLOW_USE_MPI */
      Real*8  sumMyPr
#endif /* ALLOW_USE_MPI */
      Real*8  sumAllP
CEOP

C     this barrier is not necessary:
c     CALL BAR2( myThid )

C--   Write local sums into shared-buffer array
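C     (shareBufGSR8, declared in GLOBAL_SUM.h, is shared by all threads,
C      so the master thread can later read every thread's tile values)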
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufGSR8(bi,bj) = phiTile(bi,bj)
       ENDDO
      ENDDO

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )
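C     (only the master thread executes the code from here to _END_MASTER)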

#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) )

      lbuff = nSx*nSy
      idest = 0
      itag  = 0
      ready_to_receive = 0
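C-    Handshake: unless DISABLE_MPI_READY_TO_RECEIVE is defined, each
C     process waits to be polled by process 0 before sending its tile
C     values, so that process 0 receives the arrays one at a time.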

      IF ( mpiMyId.NE.0 ) THEN

C--   All processes except 0 wait to be polled, then send their local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
        CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &        idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
        CALL MPI_SEND (shareBufGSR8, lbuff, MPI_DOUBLE_PRECISION,
     &        idest, itag, MPI_COMM_MODEL, ierr)

C--   All processes except 0 receive the result from process 0
        CALL MPI_RECV (sumAllP, 1, MPI_DOUBLE_PRECISION,
     &        idest, itag, MPI_COMM_MODEL, istatus, ierr)

      ELSE

C--   Process 0 fills in its local data
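C     ( (biG,bjG) is the global tile index of local tile (bi,bj),
C       derived from the process-domain origin mpi_myXGlobalLo /
C       mpi_myYGlobalLo )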
        np1 = 1
        DO bj=1,nSy
         DO bi=1,nSx
          biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
          bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
          globalBuf(biG,bjG) = shareBufGSR8(bi,bj)
         ENDDO
        ENDDO

C--   Process 0 polls and receives data from each process in turn
        DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
         CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &            npe, itag, MPI_COMM_MODEL, ierr)
#endif
         CALL MPI_RECV (localBuf, lbuff, MPI_DOUBLE_PRECISION,
     &            npe, itag, MPI_COMM_MODEL, istatus, ierr)

C--   Process 0 gathers the local arrays into a global array.
         np1 = npe + 1
         DO bj=1,nSy
          DO bi=1,nSx
           biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
           bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
           globalBuf(biG,bjG) = localBuf(bi,bj)
          ENDDO
         ENDDO
        ENDDO

C--   Sum over all tiles:
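C     (accumulating in a fixed global-tile order keeps the result
C      independent of how tiles are distributed over processes)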
        sumAllP = 0.
        DO bjG = 1,nSy*nPy
         DO biG = 1,nSx*nPx
          sumAllP = sumAllP + globalBuf(biG,bjG)
         ENDDO
        ENDDO

C--   Process 0 sends result to all other processes
        lbuff = 1
        DO npe = 1, numberOfProcs-1
         CALL MPI_SEND (sumAllP, 1, MPI_DOUBLE_PRECISION,
     &            npe, itag, MPI_COMM_MODEL, ierr)
        ENDDO

      ENDIF

#else /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */

C--   Sum over all tiles (of the same process) first
      sumMyPr = 0.
      DO bj = 1,nSy
       DO bi = 1,nSx
         sumMyPr = sumMyPr + shareBufGSR8(bi,bj)
       ENDDO
      ENDDO

C     in case MPI is not used:
      sumAllP = sumMyPr

#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif
      CALL MPI_Allreduce(sumMyPr,sumAllP,1,MPI_DOUBLE_PRECISION,
     &                   MPI_SUM,MPI_COMM_MODEL,mpiRC)
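C     (MPI_Allreduce sums the per-process partial sums and returns the
C      result to every process in a single collective call)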
#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

#endif /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */

C--   Write solution to shared buffer (all threads can see it)
c     shareBufGSR8(1,1) = sumAllP
      phiGSR8(1,0) = sumAllP

      _END_MASTER( myThid )
C--   Everyone waits for the master thread to be ready
      CALL BAR2( myThid )

C--   Set result for every thread
c     sumPhi = shareBufGSR8(1,1)
      sumPhi = phiGSR8(1,0)

C--   A barrier was needed here to prevent thread 1 from modifying
C     shareBufGSR8(1,1) (as it would in the following call to this S/R)
C     before all threads get their global-sum result out. It is no
C     longer needed since a dedicated shared variable is used to share
C     the output.
c     CALL BAR2( myThid )

      RETURN
      END