eesupp/src/global_sum_tile.F

C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.3 2009/06/10 03:47:03 jmc Exp $
C $Name:  $

#include "CPP_EEOPTIONS.h"

C--   File global_sum_tile.F: Routines that perform global sum
C                             on a tile array
C      Contents
C      o GLOBAL_SUM_TILE_RL
C      o GLOBAL_SUM_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: GLOBAL_SUM_TILE_RL

C     !INTERFACE:
      SUBROUTINE GLOBAL_SUM_TILE_RL(
     I                       phiTile,
     O                       sumPhi,
     I                       myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE GLOBAL\_SUM\_TILE\_RL
C     | o Global sum of a per-tile array of _RL values.
C     *==========================================================*
C     | Every thread copies the values of the tiles in its
C     |  myBxLo:myBxHi, myByLo:myByHi range into a shared buffer.
C     |  The master thread then adds up all tiles, combines the
C     |  result across processes when MPI is in use, and stores
C     |  the total in a shared variable from which every thread
C     |  picks up the same result.
C     *==========================================================*

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "GLOBAL_SUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     phiTile :: (input)  array holding one value per tile
C     sumPhi  :: (output) sum of phiTile over tiles and processes
C     myThid  :: (input)  thread number of the calling thread
      _RL     phiTile(nSx,nSy)
      _RL     sumPhi
      INTEGER myThid

C     !LOCAL VARIABLES:
C     == Local variables ==
C     bi,bj     :: tile loop indices within this process
C     iG,jG     :: tile loop indices within the gathered tile buffer
C     iOff,jOff :: offset of the tiles of one process inside the
C                  gathered tile buffer
C     iProc     :: process counter (1 .. nPx*nPy)
C     peerRank  :: MPI rank that process 0 talks to ( = iProc-1 )
C     nTileVals :: number of values in one per-process message
C     rootRank  :: MPI rank that collects, sums and sends back
C     msgTag    :: MPI message tag used by every send and receive
C     goAhead   :: dummy integer sent by process 0 as polling message
C     rcvStat   :: MPI status returned by the receive calls
C     recvBuf   :: tile values received from one process
C     tileBuf   :: tile values gathered from all processes
C     mpiRC     :: MPI return code (not checked)
C     procSum   :: sum over the tiles of this process
C     totalSum  :: sum over all tiles of all processes
C-    procSum, totalSum, recvBuf and shareBufGSR8 must all have the
C     same length as MPI_DOUBLE_PRECISION
      INTEGER bi, bj
#ifdef ALLOW_USE_MPI
#ifdef GLOBAL_SUM_SEND_RECV
      INTEGER iG, jG, iOff, jOff
      INTEGER iProc, peerRank
      INTEGER nTileVals, rootRank, msgTag, goAhead
      INTEGER rcvStat(MPI_STATUS_SIZE)
      Real*8  recvBuf(nSx,nSy)
      Real*8  tileBuf(nSx*nPx,nSy*nPy)
#endif
      INTEGER mpiRC
#endif /* ALLOW_USE_MPI */
      Real*8  procSum
      Real*8  totalSum
CEOP

C--   Each thread deposits the values of its tiles in the shared
C     buffer (no barrier is required ahead of this step)
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
        shareBufGSR8(bi,bj) = phiTile(bi,bj)
       ENDDO
      ENDDO

C--   All deposits must be complete before the master thread reads
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) )
      IF ( usingMPI ) THEN

        nTileVals = nSx*nSy
        rootRank  = 0
        msgTag    = 0
        goAhead   = 0

        IF ( mpiMyId.EQ.0 ) THEN
C-      case mpiMyId = 0

C--   Process 0 starts with the values of its own tiles
          iProc = 1
          iOff = (mpi_myXGlobalLo(iProc)-1)/sNx
          jOff = (mpi_myYGlobalLo(iProc)-1)/sNy
          DO bj = 1,nSy
           DO bi = 1,nSx
            tileBuf(iOff+bi,jOff+bj) = shareBufGSR8(bi,bj)
           ENDDO
          ENDDO

C--   Process 0 polls each other process in turn, receives its tile
C     values and stores them at their place in the gathered buffer
          DO iProc = 2, nPx*nPy
           peerRank = iProc - 1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
           CALL MPI_SEND( goAhead, 1, MPI_INTEGER,
     &          peerRank, msgTag, MPI_COMM_MODEL, mpiRC )
#endif
           CALL MPI_RECV( recvBuf, nTileVals,
     &          MPI_DOUBLE_PRECISION,
     &          peerRank, msgTag, MPI_COMM_MODEL, rcvStat, mpiRC )

           iOff = (mpi_myXGlobalLo(iProc)-1)/sNx
           jOff = (mpi_myYGlobalLo(iProc)-1)/sNy
           DO bj = 1,nSy
            DO bi = 1,nSx
             tileBuf(iOff+bi,jOff+bj) = recvBuf(bi,bj)
            ENDDO
           ENDDO
C-       end loop on iProc
          ENDDO

C--   Sum over all gathered tiles, always in the same index order
          totalSum = 0.D0
          DO jG = 1,nSy*nPy
           DO iG = 1,nSx*nPx
            totalSum = totalSum + tileBuf(iG,jG)
           ENDDO
          ENDDO

C--   Process 0 sends the result to every other process
          DO iProc = 2, nPx*nPy
           peerRank = iProc - 1
           CALL MPI_SEND( totalSum, 1, MPI_DOUBLE_PRECISION,
     &          peerRank, msgTag, MPI_COMM_MODEL, mpiRC )
          ENDDO

        ELSE
C-      case mpiMyId not 0

C--   Wait to be polled by process 0, then send the local tile values
#ifndef DISABLE_MPI_READY_TO_RECEIVE
          CALL MPI_RECV( goAhead, 1, MPI_INTEGER,
     &         rootRank, msgTag, MPI_COMM_MODEL, rcvStat, mpiRC )
#endif
          CALL MPI_SEND( shareBufGSR8, nTileVals,
     &         MPI_DOUBLE_PRECISION,
     &         rootRank, msgTag, MPI_COMM_MODEL, mpiRC )

C--   Receive the result computed by process 0
          CALL MPI_RECV( totalSum, 1, MPI_DOUBLE_PRECISION,
     &         rootRank, msgTag, MPI_COMM_MODEL, rcvStat, mpiRC )

C       End if/else mpiMyId = 0
        ENDIF

      ELSE
#endif /* GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI */

C--   Sum over all tiles of this process first
        procSum = 0.D0
        DO bj = 1,nSy
         DO bi = 1,nSx
          procSum = procSum + shareBufGSR8(bi,bj)
         ENDDO
        ENDDO

C       this is already the final result when MPI is not used
        totalSum = procSum

#ifdef ALLOW_USE_MPI
C       otherwise combine the per-process sums
        IF ( usingMPI ) THEN
         CALL MPI_Allreduce( procSum, totalSum, 1,
     &                       MPI_DOUBLE_PRECISION,
     &                       MPI_SUM, MPI_COMM_MODEL, mpiRC )
        ENDIF
#endif /* ALLOW_USE_MPI */

#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) )
      ENDIF
#endif /* GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI */

C--   Store the result in the dedicated shared variable (visible to
C     all threads); shareBufGSR8(1,1) is no longer used for this.
      phiGSR8(1,0) = totalSum

      _END_MASTER( myThid )
C--   Every thread waits until the master thread has stored the result
      CALL BAR2( myThid )

C--   Each thread returns the shared result
      sumPhi = phiGSR8(1,0)

C--   No barrier is needed here: the result is passed through a
C     dedicated shared variable, so a thread that enters this routine
C     again and refills shareBufGSR8 cannot disturb threads that have
C     not yet read their result.

      RETURN
      END
1	C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.3 2009/06/10 03:47:03 jmc Exp $
2	C $Name: $
3
4	#include "CPP_EEOPTIONS.h"
5
6	C-- File global_sum_tile.F: Routines that perform global sum
7	C on a tile array
8	C Contents
9	C o GLOBAL_SUM_TILE_RL
10	C o GLOBAL_SUM_TILE_RS <- not yet coded
11
12	C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
13	CBOP
14	C !ROUTINE: GLOBAL_SUM_TILE_RL
15
16	C !INTERFACE:
17	SUBROUTINE GLOBAL_SUM_TILE_RL(
18	I phiTile,
19	O sumPhi,
20	I myThid )
21
22	C !DESCRIPTION:
23	C *==========================================================*
24	C | SUBROUTINE GLOBAL\_SUM\_TILE\_RL
25	C | o Handle sum for _RL data.
26	C *==========================================================*
27	C | Apply sum on an array of one value per tile
28	C | and operate over all tiles & all the processes.
29	C *==========================================================*
30
31	C !USES:
32	IMPLICIT NONE
33
34	C == Global data ==
35	#include "SIZE.h"
36	#include "EEPARAMS.h"
37	#include "EESUPPORT.h"
38	#include "GLOBAL_SUM.h"
39
40	C !INPUT/OUTPUT PARAMETERS:
41	C == Routine arguments ==
42	C phiTile :: Input array with one value per tile
43	C sumPhi :: Result of sum.
44	C myThid :: My thread id.
45	_RL phiTile(nSx,nSy)
46	_RL sumPhi
47	INTEGER myThid
48
49	C !LOCAL VARIABLES:
50	C == Local variables ==
51	C bi,bj :: Loop counters
52	C mpiRC :: MPI return code
53	C- type declaration of: sumMyPr, sumAllP, localBuf and shareBufGSR8 :
54	C all 4 needs to have the same length as MPI_DOUBLE_PRECISION
55	INTEGER bi,bj
56	#ifdef ALLOW_USE_MPI
57	#ifdef GLOBAL_SUM_SEND_RECV
58	INTEGER biG, bjG, np, pId
59	INTEGER lbuff, idest, itag, ready_to_receive
60	INTEGER istatus(MPI_STATUS_SIZE), ierr
61	Real*8 localBuf (nSx,nSy)
62	Real*8 globalBuf(nSx*nPx,nSy*nPy)
63	#endif
64	INTEGER mpiRC
65	#endif /* ALLOW_USE_MPI */
66	Real*8 sumMyPr
67	Real*8 sumAllP
68	CEOP
69
70	C this barrier is not necessary:
71	c CALL BAR2( myThid )
72
73	C-- write local sum into shared-buffer array
74	DO bj = myByLo(myThid), myByHi(myThid)
75	DO bi = myBxLo(myThid), myBxHi(myThid)
76	shareBufGSR8(bi,bj) = phiTile(bi,bj)
77	ENDDO
78	ENDDO
79
80	C-- Master thread cannot start until everyone is ready:
81	CALL BAR2( myThid )
82	_BEGIN_MASTER( myThid )
83
84	#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) )
85	IF ( usingMPI ) THEN
86
87	lbuff = nSx*nSy
88	idest = 0
89	itag = 0
90	ready_to_receive = 0
91
92	IF ( mpiMyId.NE.0 ) THEN
93
94	C-- All proceses except 0 wait to be polled then send local array
95	#ifndef DISABLE_MPI_READY_TO_RECEIVE
96	CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
97	& idest, itag, MPI_COMM_MODEL, istatus, ierr)
98	#endif
99	CALL MPI_SEND (shareBufGSR8, lbuff, MPI_DOUBLE_PRECISION,
100	& idest, itag, MPI_COMM_MODEL, ierr)
101
102	C-- All proceses except 0 receive result from process 0
103	CALL MPI_RECV (sumAllP, 1, MPI_DOUBLE_PRECISION,
104	& idest, itag, MPI_COMM_MODEL, istatus, ierr)
105
106	ELSE
107	C- case mpiMyId = 0
108
109	C-- Process 0 fills-in its local data
110	np = 1
111	DO bj=1,nSy
112	DO bi=1,nSx
113	biG = (mpi_myXGlobalLo(np)-1)/sNx+bi
114	bjG = (mpi_myYGlobalLo(np)-1)/sNy+bj
115	globalBuf(biG,bjG) = shareBufGSR8(bi,bj)
116	ENDDO
117	ENDDO
118
119	C-- Process 0 polls and receives data from each process in turn
120	DO np = 2, nPx*nPy
121	pId = np - 1
122	#ifndef DISABLE_MPI_READY_TO_RECEIVE
123	CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
124	& pId, itag, MPI_COMM_MODEL, ierr)
125	#endif
126	CALL MPI_RECV (localBuf, lbuff, MPI_DOUBLE_PRECISION,
127	& pId, itag, MPI_COMM_MODEL, istatus, ierr)
128
129	C-- Process 0 gathers the local arrays into a global array.
130	DO bj=1,nSy
131	DO bi=1,nSx
132	biG = (mpi_myXGlobalLo(np)-1)/sNx+bi
133	bjG = (mpi_myYGlobalLo(np)-1)/sNy+bj
134	globalBuf(biG,bjG) = localBuf(bi,bj)
135	ENDDO
136	ENDDO
137	C- end loop on np
138	ENDDO
139
140	C-- Sum over all tiles:
141	sumAllP = 0.
142	DO bjG = 1,nSy*nPy
143	DO biG = 1,nSx*nPx
144	sumAllP = sumAllP + globalBuf(biG,bjG)
145	ENDDO
146	ENDDO
147
148	C-- Process 0 sends result to all other processes
149	lbuff = 1
150	DO np = 2, nPx*nPy
151	pId = np - 1
152	CALL MPI_SEND (sumAllP, 1, MPI_DOUBLE_PRECISION,
153	& pId, itag, MPI_COMM_MODEL, ierr)
154	ENDDO
155
156	C End if/else mpiMyId = 0
157	ENDIF
158
159	ELSE
160	#else /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */
161	IF ( .TRUE. ) THEN
162	#endif /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */
163
164	C-- Sum over all tiles (of the same process) first
165	sumMyPr = 0.
166	DO bj = 1,nSy
167	DO bi = 1,nSx
168	sumMyPr = sumMyPr + shareBufGSR8(bi,bj)
169	ENDDO
170	ENDDO
171
172	C in case MPI is not used:
173	sumAllP = sumMyPr
174
175	#ifdef ALLOW_USE_MPI
176	IF ( usingMPI ) THEN
177	CALL MPI_Allreduce(sumMyPr,sumAllP,1,MPI_DOUBLE_PRECISION,
178	& MPI_SUM,MPI_COMM_MODEL,mpiRC)
179	ENDIF
180	#endif /* ALLOW_USE_MPI */
181
182	ENDIF
183
184	C-- Write solution to shared buffer (all threads can see it)
185	c shareBufGSR8(1,1) = sumAllP
186	phiGSR8(1,0) = sumAllP
187
188	_END_MASTER( myThid )
189	C-- Everyone wait for Master thread to be ready
190	CALL BAR2( myThid )
191
192	C-- set result for every threads
193	c sumPhi = shareBufGSR8(1,1)
194	sumPhi = phiGSR8(1,0)
195
196	C-- A barrier was needed here to prevent thread 1 to modify shareBufGSR8(1,1)
197	C (as it would in the following call to this S/R) before all threads get
198	C their global-sum result out.
199	C No longer needed since a dedicated shared var. is used to share the output
200	c CALL BAR2( myThid )
201
202	RETURN
203	END