C $Header: /home/ubuntu/mnt/e9_copy/MITgcm/pkg/exch2/w2_cumulsum_z_tile.F,v 1.1 2011/07/09 22:08:37 jmc Exp $
C $Name: $

#include "CPP_EEOPTIONS.h"
#include "W2_OPTIONS.h"

C--  File w2_cumulsum_z_tile.F: Routines that perform cumulated sum
C     on a tiled array, corner grid-cell location
C     Contents
C     o W2_CUMULSUM_Z_TILE_RL
C     o W2_CUMULSUM_Z_TILE_RS <- not yet coded

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

CBOP
C     !ROUTINE: W2_CUMULSUM_Z_TILE_RL

C     !INTERFACE:
      SUBROUTINE W2_CUMULSUM_Z_TILE_RL(
     O                       psiZ, psiLoc,
     I                       dPsiX, dPsiY, myThid )

C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE W2\_CUMULSUM\_Z\_TILE\_RL
C     | o Handle cumulated sum for _RL tile data.
C     *==========================================================*
C     | Cumulate sum on tiled array, corner grid-cell location:
C     | Starts from the first tile and, going through all tiles
C     | & all the processes, adds the increments in both directions
C     *==========================================================*

C     !USES:
      IMPLICIT NONE

C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "W2_EXCH2_SIZE.h"
#include "W2_EXCH2_TOPOLOGY.h"
#include "CUMULSUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     psiZ   :: result of cumulated sum, valid at tile South-East corner
C     psiLoc :: cumulated sum at special locations
C     dPsiX  :: tile increment in X direction
C     dPsiY  :: tile increment in Y direction
C     myThid :: my Thread Id. number
      _RL     psiZ  (nSx,nSy)
      _RL     psiLoc(2)
      _RL     dPsiX (nSx,nSy)
      _RL     dPsiY (nSx,nSy)
      INTEGER myThid

C     !LOCAL VARIABLES:
C     == Local variables ==
C     bi,bj :: tile indices
C-    type declaration of: loc[1,2]Buf and shareBufCS[1,2]_R8 :
C          all 4 need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
      INTEGER tN, tS
      Real*8  globalBuf(3,W2_maxNbTiles)
#ifdef ALLOW_USE_MPI
      INTEGER npe, np1
      INTEGER lbuf1, lbuf2, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  loc1Buf (nSx,nSy)
      Real*8  loc2Buf(2,nSx,nSy)
#endif /* ALLOW_USE_MPI */
CEOP

C--   Initialise to zero:
      psiLoc(1) = 0.
      psiLoc(2) = 0.
      DO tN = 1,exch2_nTiles
        globalBuf(1,tN) = 0.
        globalBuf(2,tN) = 0.
        globalBuf(3,tN) = 0.
      ENDDO

C--   write input into shared-buffer array
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufCS2_R8(1,bi,bj) = dPsiX(bi,bj)
         shareBufCS2_R8(2,bi,bj) = dPsiY(bi,bj)
       ENDDO
      ENDDO

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif

        lbuf1 = nSx*nSy
        lbuf2 = 2*lbuf1
        idest = 0
        itag  = 0
        ready_to_receive = 0
        IF ( mpiMyId.NE.0 ) THEN

C--   All processes except 0 wait to be polled then send local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
          CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
          CALL MPI_SEND (shareBufCS2_R8, lbuf2, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, ierr)

C--   All processes except 0 receive result from process 0
          CALL MPI_RECV (shareBufCS1_R8, lbuf1, MPI_DOUBLE_PRECISION,
     &         idest, itag, MPI_COMM_MODEL, istatus, ierr)

        ELSE

C--   Process 0 polls and receives data from each process in turn
          DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
            CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &           npe, itag, MPI_COMM_MODEL, ierr)
#endif
            CALL MPI_RECV (loc2Buf, lbuf2, MPI_DOUBLE_PRECISION,
     &           npe, itag, MPI_COMM_MODEL, istatus, ierr)

C--   Process 0 gathers the local arrays into a global array.
            np1 = npe + 1
            DO bj=1,nSy
             DO bi=1,nSx
               tN = W2_procTileList(bi,bj,np1)
               globalBuf(1,tN) = loc2Buf(1,bi,bj)
               globalBuf(2,tN) = loc2Buf(2,bi,bj)
             ENDDO
            ENDDO
          ENDDO

C--   end if process not 0 / else = 0
        ENDIF

#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

      IF ( myProcId.EQ.0 ) THEN
C--   Process 0 fills in its local data
        DO bj=1,nSy
         DO bi=1,nSx
           tN = W2_myTileList(bi,bj)
           globalBuf(1,tN) = shareBufCS2_R8(1,bi,bj)
           globalBuf(2,tN) = shareBufCS2_R8(2,bi,bj)
         ENDDO
        ENDDO

C--   Cumulate Sum over all tiles:
        DO tN = 1,exch2_nTiles
          globalBuf(3,tN) = 0.
          DO tS = 1,exch2_nTiles
            globalBuf(3,tN) = globalBuf(3,tN)
     &            + W2_cumSum_tiles(1,tS,tN)*globalBuf(1,tS)
     &            + W2_cumSum_tiles(2,tS,tN)*globalBuf(2,tS)
          ENDDO
        ENDDO

C-    Value at Special location (e.g., Missing-Corner values)
        IF ( W2_tMC1.GE.1 )
     &    psiLoc(1) = globalBuf(3,W2_tMC1) + globalBuf(2,W2_tMC1)
        IF ( W2_tMC2.GE.1 )
     &    psiLoc(2) = globalBuf(3,W2_tMC2) + globalBuf(1,W2_tMC2)

C--   Process 0 copies its local result into the shared buffer
        DO bj=1,nSy
         DO bi=1,nSx
           tN = W2_myTileList(bi,bj)
           shareBufCS1_R8(bi,bj) = globalBuf(3,tN)
         ENDDO
        ENDDO

#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
       IF ( usingMPI ) THEN
#endif

C--   Process 0 sends result to all other processes
        DO npe = 1, numberOfProcs-1
C-    fill local array with relevant portion of global array
          np1 = npe + 1
          DO bj=1,nSy
           DO bi=1,nSx
             tN = W2_procTileList(bi,bj,np1)
             loc1Buf(bi,bj) = globalBuf(3,tN)
           ENDDO
          ENDDO

          CALL MPI_SEND (loc1Buf, lbuf1, MPI_DOUBLE_PRECISION,
     &         npe, itag, MPI_COMM_MODEL, ierr)

        ENDDO

#ifndef ALWAYS_USE_MPI
       ENDIF
#endif
#endif /* ALLOW_USE_MPI */

C--   end if process 0
      ENDIF

      _END_MASTER( myThid )
C--   Everyone waits for Master thread to be ready
      CALL BAR2( myThid )

C--   set result for every thread
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         psiZ(bi,bj) = shareBufCS1_R8(bi,bj)
       ENDDO
      ENDDO

      RETURN
      END
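
C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|

C--   Illustrative usage sketch (not part of the original routine; how the
C     increments dPsiX/dPsiY are obtained is an assumption, shown only to
C     indicate how a caller could drive this routine): each tile supplies
C     its net increment across the tile in X and Y; on return, psiZ(bi,bj)
C     holds the cumulated sum valid at the South-East corner of tile (bi,bj)
C     and psiLoc(1:2) the values at the special (missing-corner) locations.
C
C       DO bj = myByLo(myThid), myByHi(myThid)
C        DO bi = myBxLo(myThid), myBxHi(myThid)
C          dPsiX(bi,bj) = ...  ( net X-increment of this tile, assumed input )
C          dPsiY(bi,bj) = ...  ( net Y-increment of this tile, assumed input )
C        ENDDO
C       ENDDO
C       CALL W2_CUMULSUM_Z_TILE_RL( psiZ, psiLoc, dPsiX, dPsiY, myThid )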