Contents of /MITgcm/eesupp/src/cumulsum_z_tile.F

Revision 1.1
Fri Jul 1 18:18:31 2011 UTC by jmc
Branch: MAIN
new S/R to calculate cumulated sum on tiled array, corner grid-cell location

C $Header: /u/gcmpack/MITgcm/eesupp/src/global_sum_tile.F,v 1.3 2009/06/10 03:47:03 jmc Exp $
C $Name: $

#include "PACKAGES_CONFIG.h"
#include "CPP_EEOPTIONS.h"

C--   File cumulsum_z_tile.F: Routines that perform cumulated sum
C                             on a tiled array, corner grid-cell location
C     Contents
C     o CUMULSUM_Z_TILE_RL
C     o CUMULSUM_Z_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: CUMULSUM_Z_TILE_RL

C     !INTERFACE:
      SUBROUTINE CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE CUMULSUM\_Z\_TILE\_RL
C     | o Handle cumulated sum for _RL tile data.
C     *==========================================================*
C     | Cumulate sum on tiled array, corner grid-cell location:
C     | Starts from the first tile and, going through all tiles
C     | & all the processes, adds the increments in both directions
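C     | For illustration, in the single-process, non-cubed-sphere
C     | case this reduces to the serial recurrence coded below:
C     |   psiZ(1+bi, 1) = psiZ(bi, 1) + dPsiX(bi, 1)  (X sweep, bj=1)
C     |   psiZ(bi,1+bj) = psiZ(bi,bj) + dPsiY(bi,bj)  (Y sweep, all bi)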
C     *==========================================================*

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
c#include "CUMULSUM.h"
      COMMON / CUMULSUM_R8 / shareBufCS1_R8, shareBufCS2_R8
      Real*8  shareBufCS1_R8  (nSx,nSy)
      Real*8  shareBufCS2_R8(2,nSx,nSy)

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     psiZ   :: results of cumulated sum, corresponds to tile South-East corner
C     psiLoc :: cumulated sum at special locations
C     dPsiX  :: tile increment in X direction
C     dPsiY  :: tile increment in Y direction
C     myThid :: my Thread Id. number
      _RL     psiZ  (nSx,nSy)
      _RL     psiLoc(2)
      _RL     dPsiX (nSx,nSy)
      _RL     dPsiY (nSx,nSy)
      INTEGER myThid

C     !LOCAL VARIABLES:
#ifndef ALLOW_EXCH2
C     == Local variables ==
C     bi,bj :: tile indices
C-    type declaration of: loc[1,2]Buf and shareBufCS[1,2]_R8 :
C     all 4 need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
      INTEGER nf
#ifdef ALLOW_USE_MPI
      INTEGER biG, bjG, npe, np1
      INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  loc1Buf  (nSx,nSy)
      Real*8  loc2Buf(2,nSx,nSy)
      Real*8  globalBuf(3,nSx*nPx,nSy*nPy)
#endif /* ALLOW_USE_MPI */
#else
      CHARACTER*(MAX_LEN_MBUF) msgBuf
#endif /* ALLOW_EXCH2 */
CEOP

#ifdef ALLOW_EXCH2
c     CALL EXCH2_CUMULSUM_Z_TILE_RL(
c    O                       psiZ, psiLoc,
c    I                       dPsiX, dPsiY, myThid )
      WRITE(msgBuf,'(A)')
     &  'CUMULSUM_Z_TILE_RL: missing call to EXCH2 S/R'
      CALL PRINT_ERROR( msgBuf, myThid )
      RETURN
#else /* ALLOW_EXCH2 */

C--   write input into shared-buffer array
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
         shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
       ENDDO
      ENDDO
      psiLoc(1) = 0.
      psiLoc(2) = 0.

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#ifdef ALLOW_USE_MPI
#ifdef ALWAYS_USE_MPI
      IF ( .TRUE. ) THEN
#else
      IF ( usingMPI ) THEN
#endif

       lbuf1 = nSx*nSy
       lbuf2 = 2*lbuf1
       idest = 0
       itag  = 0
       ready_to_receive = 0

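C--    Gather all tile increments onto process 0, cumulate the sum over
C      the global tile grid there, then send each process back its own
C      portion of the result: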
       IF ( mpiMyId.NE.0 ) THEN

C--    All processes except 0 wait to be polled, then send local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
         CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &        idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
         CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
     &        idest, itag, MPI_COMM_MODEL, ierr)

C--    All processes except 0 receive the result from process 0
         CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
     &        idest, itag, MPI_COMM_MODEL, istatus, ierr)

       ELSE

C--    Process 0 fills in its local data
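C      (biG,bjG = global tile indices: the process's grid-point origin
C       mpi_my[X,Y]GlobalLo converted to a tile offset, plus bi,bj)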
         np1 = 1
         DO bj=1,nSy
          DO bi=1,nSx
           biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
           bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
           globalBuf(1,biG,bjG) = shareBufCS2_R8(1,bi,bj)
           globalBuf(2,biG,bjG) = shareBufCS2_R8(2,bi,bj)
          ENDDO
         ENDDO

C--    Process 0 polls and receives data from each process in turn
         DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
           CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &          npe, itag, MPI_COMM_MODEL, ierr)
#endif
           CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
     &          npe, itag, MPI_COMM_MODEL, istatus, ierr)

C--    Process 0 gathers the local arrays into a global array.
           np1 = npe + 1
           DO bj=1,nSy
            DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             globalBuf(1,biG,bjG) = loc2Buf(1,bi,bj)
             globalBuf(2,biG,bjG) = loc2Buf(2,bi,bj)
            ENDDO
           ENDDO
         ENDDO

C--    Cumulate Sum over all tiles:
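C      (sweep in X along the first tile row adding globalBuf(1,:,:), i.e.
C       dPsiX, then in Y up each tile column adding globalBuf(2,:,:), i.e.
C       dPsiY; globalBuf(3,:,:) then holds the sum at every tile corner)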
         globalBuf(3,1,1) = 0.
         bj = 1
         DO bi = 1,nSx*nPx-1
           globalBuf(3,1+bi,bj) = globalBuf(3,bi,bj)
     &                          + globalBuf(1,bi,bj)
         ENDDO
         DO bj = 1,nSy*nPy-1
          DO bi = 1,nSx*nPx
           globalBuf(3,bi,1+bj) = globalBuf(3,bi,bj)
     &                          + globalBuf(2,bi,bj)
          ENDDO
         ENDDO

C--    Process 0 fills in its local portion of the result
         np1 = 1
         DO bj=1,nSy
          DO bi=1,nSx
           biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
           bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
           shareBufCS1_R8(bi,bj) = globalBuf(3,biG,bjG)
          ENDDO
         ENDDO

C--    Process 0 sends result to all other processes
         DO npe = 1, numberOfProcs-1
C-     fill local array with relevant portion of global array
           np1 = npe + 1
           DO bj=1,nSy
            DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             loc1Buf(bi,bj) = globalBuf(3,biG,bjG)
            ENDDO
           ENDDO
           CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
     &          npe, itag, MPI_COMM_MODEL, ierr)

         ENDDO

       ENDIF

      ELSEIF (useCubedSphereExchange) THEN
#else /* not USE_MPI */
      IF (useCubedSphereExchange) THEN
#endif /* ALLOW_USE_MPI */

C--   assume 1 tile / face, from bi=1 to 6, no MPI
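C     (the increment added from one face to the next alternates between
C      dPsiX (nf=1) and dPsiY (nf=2), following the cube-face ordering)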
       shareBufCS1_R8(1,1) = 0.
       bj = 1
       DO bi = 1,nSx-1
         nf = 1 + MOD(1+bi,2)
         shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                           + shareBufCS2_R8(nf,bi,bj)
       ENDDO
C-    fill in missing corner: 1 = North-West corner of face 1
C-                            2 = South-East corner of face 2
       bi = 1
       psiLoc(1) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(2,bi,bj)
       bi = MIN(2,nSx)
       psiLoc(2) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(1,bi,bj)

      ELSE

C--   Cumulate Sum over all tiles:
       shareBufCS1_R8(1,1) = 0.
       bj = 1
       DO bi = 1,nSx-1
         shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                           + shareBufCS2_R8(1,bi,bj)
       ENDDO
       DO bj = 1,nSy-1
        DO bi = 1,nSx
         shareBufCS1_R8(bi,1+bj) = shareBufCS1_R8(bi,bj)
     &                           + shareBufCS2_R8(2,bi,bj)
        ENDDO
       ENDDO

      ENDIF

      _END_MASTER( myThid )
C--   Everyone waits for the Master thread to be ready
      CALL BAR2( myThid )

C--   set result for every thread
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
       ENDDO
      ENDDO

      RETURN
#endif /* ALLOW_EXCH2 */
      END
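
For reference, below is a minimal stand-alone sketch (not part of the MITgcm source) of the serial recurrence used in the non-MPI, non-cubed-sphere branch above. The program name, array sizes and increment values are made up for illustration, chosen only to show how psiZ is accumulated from the tile increments dPsiX and dPsiY.

      PROGRAM CUMULSUM_Z_DEMO
C     Illustrative serial analogue of the non-MPI branch of
C     CUMULSUM_Z_TILE_RL: cumulate tile increments in X along the
C     first tile row, then in Y up every tile column.
      IMPLICIT NONE
      INTEGER nSx, nSy
      PARAMETER ( nSx = 3, nSy = 2 )
      REAL*8 psiZ(nSx,nSy), dPsiX(nSx,nSy), dPsiY(nSx,nSy)
      INTEGER bi, bj

C     arbitrary tile increments
      DO bj = 1,nSy
       DO bi = 1,nSx
         dPsiX(bi,bj) =  1.D0
         dPsiY(bi,bj) = 10.D0
       ENDDO
      ENDDO

C     same recurrence as the serial (ELSE) branch above
      psiZ(1,1) = 0.D0
      bj = 1
      DO bi = 1,nSx-1
        psiZ(1+bi,bj) = psiZ(bi,bj) + dPsiX(bi,bj)
      ENDDO
      DO bj = 1,nSy-1
       DO bi = 1,nSx
         psiZ(bi,1+bj) = psiZ(bi,bj) + dPsiY(bi,bj)
       ENDDO
      ENDDO

C     prints:  0.0  1.0  2.0  then  10.0  11.0  12.0
      DO bj = 1,nSy
        WRITE(*,'(3F6.1)') (psiZ(bi,bj), bi=1,nSx)
      ENDDO
      END

The MPI branch applies the same recurrence to the gathered global array of increments (of size nSx*nPx by nSy*nPy) on process 0 before sending each process back its own tiles.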
