C $Header: /u/gcmpack/MITgcm/eesupp/src/cumulsum_z_tile.F,v 1.1 2011/07/01 18:18:31 jmc Exp $
C $Name: $

#include "PACKAGES_CONFIG.h"
#include "CPP_EEOPTIONS.h"

C--   File cumulsum_z_tile.F: Routines that perform cumulated sum
C                             on a tiled array, corner grid-cell location
C--   Contents
C--   o CUMULSUM_Z_TILE_RL
C--   o CUMULSUM_Z_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: CUMULSUM_Z_TILE_RL

C     !INTERFACE:
      SUBROUTINE CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE CUMULSUM\_Z\_TILE\_RL
C     | o Handle cumulated sum for _RL tile data.
C     *==========================================================*
C     | Cumulate sum on tiled array, corner grid-cell location:
C     | Starts from the 1st tile and, going through all tiles & all
C     | the processes, adds the increments in both directions.
C     *==========================================================*
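C     For illustration (simplest configuration, assuming a single row
C     of tiles, no EXCH2 and no cubed-sphere exchange), the result is
C     just a running sum of the X-increments:
C        psiZ(1,1)    = 0.
C        psiZ(bi+1,1) = psiZ(bi,1) + dPsiX(bi,1)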

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "CUMULSUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     psiZ   :: result of the cumulated sum, at the tile South-East corner
C     psiLoc :: cumulated sum at special locations
C     dPsiX  :: tile increment in X direction
C     dPsiY  :: tile increment in Y direction
C     myThid :: my Thread Id. number
      _RL     psiZ  (nSx,nSy)
      _RL     psiLoc(2)
      _RL     dPsiX (nSx,nSy)
      _RL     dPsiY (nSx,nSy)
      INTEGER myThid

C     !LOCAL VARIABLES:
#ifndef ALLOW_EXCH2
C     == Local variables ==
C     bi,bj :: tile indices
C-    type declaration of: loc[1,2]Buf and shareBufCS[1,2]_R8 :
C     all 4 need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
      INTEGER nf
#ifdef ALLOW_USE_MPI
      INTEGER biG, bjG, npe, np1
      INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  loc1Buf  (nSx,nSy)
      Real*8  loc2Buf(2,nSx,nSy)
      Real*8  globalBuf(3,nSx*nPx,nSy*nPy)
#endif /* ALLOW_USE_MPI */
#endif /* ALLOW_EXCH2 */
CEOP

#ifdef ALLOW_EXCH2
      CALL W2_CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )
      RETURN
#else /* ALLOW_EXCH2 */

C--   write input into shared-buffer array
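C     (shareBufCS1_R8 and shareBufCS2_R8 are shared buffers from
C      CUMULSUM.h, visible to all threads, so that the master thread
C      below can access every tile's increments)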
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
         shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
       ENDDO
      ENDDO
      psiLoc(1) = 0.
      psiLoc(2) = 0.
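C     note: psiLoc is only updated in the cubed-sphere branch below;
C           in the other cases it is returned as zero.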

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#ifdef ALLOW_USE_MPI
#ifdef ALWAYS_USE_MPI
      IF ( .TRUE. ) THEN
#else
      IF ( usingMPI ) THEN
#endif
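
C--   MPI case: gather / cumulate / scatter through process 0:
C     each process sends its tile increments to process 0, which forms
C     the global cumulated sum and returns to each process its own part.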

        lbuf1 = nSx*nSy
        lbuf2 = 2*lbuf1
        idest = 0
        itag  = 0
        ready_to_receive = 0

        IF ( mpiMyId.NE.0 ) THEN

C--   All processes except 0 wait to be polled then send local array
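C     (with the ready_to_receive handshake, each process holds its send
C      until process 0 polls it in turn; define
C      DISABLE_MPI_READY_TO_RECEIVE to skip this handshake)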
#ifndef DISABLE_MPI_READY_TO_RECEIVE
          CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
          CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, ierr)

C--   All processes except 0 receive result from process 0
          CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)

        ELSE

C--   Process 0 fills-in its local data
          np1 = 1
          DO bj=1,nSy
           DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             globalBuf(1,biG,bjG) = shareBufCS2_R8(1,bi,bj)
             globalBuf(2,biG,bjG) = shareBufCS2_R8(2,bi,bj)
           ENDDO
          ENDDO

C--   Process 0 polls and receives data from each process in turn
          DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
            CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &           npe, itag, MPI_COMM_MODEL, ierr)
#endif
            CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
     &           npe, itag, MPI_COMM_MODEL, istatus, ierr)

C--   Process 0 gathers the local arrays into a global array.
            np1 = npe + 1
            DO bj=1,nSy
             DO bi=1,nSx
               biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
               bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
               globalBuf(1,biG,bjG) = loc2Buf(1,bi,bj)
               globalBuf(2,biG,bjG) = loc2Buf(2,bi,bj)
             ENDDO
            ENDDO
          ENDDO

C--   Cumulate Sum over all tiles:
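C     (first pass: integrate the X-increments along the first tile row,
C      bjG=1; second pass: add the Y-increments row by row, so that
C      globalBuf(3,biG,bjG) holds the cumulated sum for every tile)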
          globalBuf(3,1,1) = 0.
          bj = 1
          DO bi = 1,nSx*nPx-1
            globalBuf(3,1+bi,bj) = globalBuf(3,bi,bj)
     &                           + globalBuf(1,bi,bj)
          ENDDO
          DO bj = 1,nSy*nPy-1
           DO bi = 1,nSx*nPx
            globalBuf(3,bi,1+bj) = globalBuf(3,bi,bj)
     &                           + globalBuf(2,bi,bj)
           ENDDO
          ENDDO

C--   Process 0 fills-in its local data
          np1 = 1
          DO bj=1,nSy
           DO bi=1,nSx
             biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
             shareBufCS1_R8(bi,bj) = globalBuf(3,biG,bjG)
           ENDDO
          ENDDO

C--   Process 0 sends result to all other processes
          DO npe = 1, numberOfProcs-1
C-    fill local array with relevant portion of global array
            np1 = npe + 1
            DO bj=1,nSy
             DO bi=1,nSx
               biG = (mpi_myXGlobalLo(np1)-1)/sNx+bi
               bjG = (mpi_myYGlobalLo(np1)-1)/sNy+bj
               loc1Buf(bi,bj) = globalBuf(3,biG,bjG)
             ENDDO
            ENDDO
            CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
     &           npe, itag, MPI_COMM_MODEL, ierr)

          ENDDO

        ENDIF

      ELSEIF (useCubedSphereExchange) THEN
#else /* not USE_MPI */
      IF (useCubedSphereExchange) THEN
#endif /* ALLOW_USE_MPI */

C--   assume 1 tile / face, from bi=1 to 6, no MPI
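C     step from face to face (bi=1..6), alternating between the X- and
C     the Y-increment (nf = 1,2,1,2,...), consistent with consecutive
C     cube faces connecting alternately along X and Y edges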
        shareBufCS1_R8(1,1) = 0.
        bj = 1
        DO bi = 1,nSx-1
          nf = 1 + MOD(1+bi,2)
          shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(nf,bi,bj)
        ENDDO
C-    fill in missing corner: 1 = North-West corner of face 1
C-                            2 = South-East corner of face 2
        bi = 1
        psiLoc(1) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(2,bi,bj)
        bi = MIN(2,nSx)
        psiLoc(2) = shareBufCS1_R8(bi,bj) + shareBufCS2_R8(1,bi,bj)

      ELSE

C--   Cumulate Sum over all tiles:
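C     (same two-pass scheme as the MPI case, but applied directly to
C      the shared buffer: sum dPsiX along the first tile row, then add
C      dPsiY row by row)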
        shareBufCS1_R8(1,1) = 0.
        bj = 1
        DO bi = 1,nSx-1
          shareBufCS1_R8(1+bi,bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(1,bi,bj)
        ENDDO
        DO bj = 1,nSy-1
         DO bi = 1,nSx
          shareBufCS1_R8(bi,1+bj) = shareBufCS1_R8(bi,bj)
     &                            + shareBufCS2_R8(2,bi,bj)
         ENDDO
        ENDDO

      ENDIF

      _END_MASTER( myThid )
C--   Everyone waits for the Master thread to be ready
      CALL BAR2( myThid )

C--   set result for every thread
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
       ENDDO
      ENDDO

      RETURN
#endif /* ALLOW_EXCH2 */
      END