model/src/cg2d.F

C $Header: /u/gcmpack/MITgcm/model/src/cg2d.F,v 1.51 2009/04/28 18:01:14 jmc Exp $
C $Name:  $

#include "CPP_OPTIONS.h"
#ifdef TARGET_NEC_SX
C     set a sensible default for the outer loop unrolling parameter that can
C     be overriden in the Makefile with the DEFINES macro or in CPP_OPTIONS.h
#ifndef CG2D_OUTERLOOPITERS
# define CG2D_OUTERLOOPITERS 10
#endif
#endif /* TARGET_NEC_SX */

CBOP
C     !ROUTINE: CG2D
C     !INTERFACE:
      SUBROUTINE CG2D(
     I                cg2d_b,
     U                cg2d_x,
     O                firstResidual,
     O                lastResidual,
     U                numIters,
     I                myThid )
C     !DESCRIPTION: \bv
C     *==========================================================*
C     | SUBROUTINE CG2D
C     | o Two-dimensional grid problem conjugate-gradient
C     |   inverter (with preconditioner).
C     *==========================================================*
C     | Con. grad is an iterative procedure for solving Ax = b.
C     | It requires the A be symmetric.
C     | This implementation assumes A is a five-diagonal
C     | matrix of the form that arises in the discrete
C     | representation of the del^2 operator in a
C     | two-dimensional space.
C     | Notes:
C     | ======
C     | This implementation can support shared-memory
C     | multi-threaded execution. In order to do this COMMON
C     | blocks are used for many of the arrays - even ones that
C     | are only used for intermedaite results. This design is
C     | OK if you want to all the threads to collaborate on
C     | solving the same problem. On the other hand if you want
C     | the threads to solve several different problems
C     | concurrently this implementation will not work.
C     *==========================================================*
C     \ev

C     !USES:
      IMPLICIT NONE
C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "PARAMS.h"
#include "CG2D.h"
c#include "GRID.h"
c#include "SURFACE.h"

C     !INPUT/OUTPUT PARAMETERS:
C     === Routine arguments ===
C     myThid    :: Thread on which I am working.
C     cg2d_b    :: The source term or "right hand side"
C     cg2d_x    :: The solution
C     firstResidual :: the initial residual before any iterations
C     lastResidual  :: the actual residual reached
C     numIters  :: Entry: the maximum number of iterations allowed
C                  Exit:  the actual number of iterations used
      _RL  cg2d_b(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL  cg2d_x(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL  firstResidual
      _RL  lastResidual
      INTEGER numIters
      INTEGER myThid

C     !LOCAL VARIABLES:
C     === Local variables ====
C     actualIts      :: Number of iterations taken
C     actualResidual :: residual
C     bi, bj     :: Block index in X and Y.
C     eta_qrN    :: Used in computing search directions
C     eta_qrNM1     suffix N and NM1 denote current and
C     cgBeta        previous iterations respectively.
C     alpha
C     sumRHS     :: Sum of right-hand-side. Sometimes this is a
C                   useful debuggin/trouble shooting diagnostic.
C                   For neumann problems sumRHS needs to be ~0.
C                   or they converge at a non-zero residual.
C     err        :: Measure of residual of Ax - b, usually the norm.
C     I, J, it2d :: Loop counters ( it2d counts CG iterations )
      INTEGER actualIts
      _RL    actualResidual
      INTEGER bi, bj
      INTEGER I, J, it2d
c     INTEGER ks
      _RL    err,    errTile(nSx,nSy)
      _RL    eta_qrN,eta_qrNtile(nSx,nSy)
      _RL    eta_qrNM1
      _RL    cgBeta
      _RL    alpha,  alphaTile(nSx,nSy)
      _RL    sumRHS, sumRHStile(nSx,nSy)
      _RL    rhsMax
      _RL    rhsNorm
#ifdef CG2D_SINGLECPU_SUM
      _RL    localBuf(1:sNx,1:sNy,nSx,nSy)
#endif
      CHARACTER*(MAX_LEN_MBUF) msgBuf
CEOP


CcnhDebugStarts
C     CHARACTER*(MAX_LEN_FNAM) suff
CcnhDebugEnds


C--   Initialise inverter
      eta_qrNM1 = 1. _d 0

CcnhDebugStarts
C     _EXCH_XY_RL( cg2d_b, myThid )
C     CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.0 CG2D_B' , 1, myThid )
C     suff = 'unnormalised'
C     CALL WRITE_FLD_XY_RL (  'cg2d_b.',suff,    cg2d_b, 1, myThid)
C     STOP
CcnhDebugEnds

C--   Normalise RHS
      rhsMax = 0. _d 0
      DO bj=myByLo(myThid),myByHi(myThid)
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO J=1,sNy
         DO I=1,sNx
          cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*cg2dNorm
          rhsMax = MAX(ABS(cg2d_b(I,J,bi,bj)),rhsMax)
         ENDDO
        ENDDO
       ENDDO
      ENDDO

      IF (cg2dNormaliseRHS) THEN
C-  Normalise RHS :
#ifdef LETS_MAKE_JAM
C     _GLOBAL_MAX_RL( rhsMax, myThid )
      rhsMax=1.
#else
      _GLOBAL_MAX_RL( rhsMax, myThid )
#endif
      rhsNorm = 1. _d 0
      IF ( rhsMax .NE. 0. ) rhsNorm = 1. _d 0 / rhsMax
      DO bj=myByLo(myThid),myByHi(myThid)
       DO bi=myBxLo(myThid),myBxHi(myThid)
        DO J=1,sNy
         DO I=1,sNx
          cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*rhsNorm
          cg2d_x(I,J,bi,bj) = cg2d_x(I,J,bi,bj)*rhsNorm
         ENDDO
        ENDDO
       ENDDO
      ENDDO
C- end Normalise RHS
      ENDIF

C--   Update overlaps
      _EXCH_XY_RL( cg2d_b, myThid )
      _EXCH_XY_RL( cg2d_x, myThid )
CcnhDebugStarts
C     CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.1 CG2D_B' , 1, myThid )
C     suff = 'normalised'
C     CALL WRITE_FLD_XY_RL (  'cg2d_b.',suff,    cg2d_b, 1, myThid)
CcnhDebugEnds

C--   Initial residual calculation
      DO bj=myByLo(myThid),myByHi(myThid)
       DO bi=myBxLo(myThid),myBxHi(myThid)
        sumRHStile(bi,bj) = 0. _d 0
        errTile(bi,bj)    = 0. _d 0
#ifdef TARGET_NEC_SX
!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
#endif /* TARGET_NEC_SX */
        DO J=1,sNy
         DO I=1,sNx
c         ks = ksurfC(I,J,bi,bj)
          cg2d_s(I,J,bi,bj) = 0.
          cg2d_r(I,J,bi,bj) = cg2d_b(I,J,bi,bj) -
     &    (aW2d(I  ,J  ,bi,bj)*cg2d_x(I-1,J  ,bi,bj)
     &    +aW2d(I+1,J  ,bi,bj)*cg2d_x(I+1,J  ,bi,bj)
     &    +aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J-1,bi,bj)
     &    +aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J+1,bi,bj)
     &    +aC2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
     &    )
c    &    -aW2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
c    &    -aW2d(I+1,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
c    &    -aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
c    &    -aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
c    &    -freeSurfFac*_rA(i,j,bi,bj)*deepFac2F(ks)*recip_Bo(i,j,bi,bj)
c    &                        *cg2d_x(I  ,J  ,bi,bj)
c    &                        /deltaTMom/deltaTfreesurf*cg2dNorm
c    &    )
#ifdef CG2D_SINGLECPU_SUM
          localBuf(I,J,bi,bj) = cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
#else
          errTile(bi,bj)    = errTile(bi,bj)
     &                      + cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
          sumRHStile(bi,bj) = sumRHStile(bi,bj) + cg2d_b(I,J,bi,bj)
#endif
         ENDDO
        ENDDO
       ENDDO
      ENDDO
#ifdef LETS_MAKE_JAM
      CALL EXCH_XY_O1_R8_JAM( cg2d_r )
#else
      CALL EXCH_XY_RL( cg2d_r, myThid )
#endif
#ifdef LETS_MAKE_JAM
      CALL EXCH_XY_O1_R8_JAM( cg2d_s )
#else
      CALL EXCH_XY_RL( cg2d_s, myThid )
#endif
#ifdef CG2D_SINGLECPU_SUM
      CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, err, 0, 0, myThid)
      CALL GLOBAL_SUM_SINGLECPU_RL(cg2d_b, sumRHS, OLx, OLy, myThid)
#else
       CALL GLOBAL_SUM_TILE_RL( errTile,    err,    myThid )
       CALL GLOBAL_SUM_TILE_RL( sumRHStile, sumRHS, myThid )
#endif
       err = SQRT(err)
       actualIts      = 0
       actualResidual = err

       IF ( debugLevel .GE. debLevZero ) THEN
        _BEGIN_MASTER( myThid )
        WRITE(standardmessageunit,'(A,1P2E22.14)')
     &  ' cg2d: Sum(rhs),rhsMax = ', sumRHS,rhsMax
        _END_MASTER( myThid )
       ENDIF
C     _BARRIER
c     _BEGIN_MASTER( myThid )
c      WRITE(standardmessageunit,'(A,I6,1PE30.14)')
c    & ' CG2D iters, err = ',
c    & actualIts, actualResidual
c     _END_MASTER( myThid )
      firstResidual=actualResidual

      IF ( err .LT. cg2dTolerance ) GOTO 11

C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
      DO 10 it2d=1, numIters

CcnhDebugStarts
C      WRITE(*,*) ' CG2D: Iteration ',it2d-1,' residual = ',
C    &  actualResidual
CcnhDebugEnds
C--    Solve preconditioning equation and update
C--    conjugate direction vector "s".
       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         eta_qrNtile(bi,bj) = 0. _d 0
#ifdef TARGET_NEC_SX
!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
#endif /* TARGET_NEC_SX */
         DO J=1,sNy
          DO I=1,sNx
           cg2d_q(I,J,bi,bj) =
     &      pC(I  ,J  ,bi,bj)*cg2d_r(I  ,J  ,bi,bj)
     &     +pW(I  ,J  ,bi,bj)*cg2d_r(I-1,J  ,bi,bj)
     &     +pW(I+1,J  ,bi,bj)*cg2d_r(I+1,J  ,bi,bj)
     &     +pS(I  ,J  ,bi,bj)*cg2d_r(I  ,J-1,bi,bj)
     &     +pS(I  ,J+1,bi,bj)*cg2d_r(I  ,J+1,bi,bj)
CcnhDebugStarts
C          cg2d_q(I,J,bi,bj) = cg2d_r(I  ,J  ,bi,bj)
CcnhDebugEnds
#ifdef CG2D_SINGLECPU_SUM
          localBuf(I,J,bi,bj) =
     &      cg2d_q(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
#else
           eta_qrNtile(bi,bj) = eta_qrNtile(bi,bj)
     &     +cg2d_q(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
#endif
          ENDDO
         ENDDO
        ENDDO
       ENDDO

#ifdef CG2D_SINGLECPU_SUM
       CALL GLOBAL_SUM_SINGLECPU_RL( localBuf,eta_qrN,0,0,myThid )
#else
       CALL GLOBAL_SUM_TILE_RL( eta_qrNtile,eta_qrN,myThid )
#endif
CcnhDebugStarts
C      WRITE(*,*) ' CG2D: Iteration ',it2d-1,' eta_qrN = ',eta_qrN
CcnhDebugEnds
       cgBeta   = eta_qrN/eta_qrNM1
CcnhDebugStarts
C      WRITE(*,*) ' CG2D: Iteration ',it2d-1,' beta = ',cgBeta
CcnhDebugEnds
       eta_qrNM1 = eta_qrN

       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         DO J=1,sNy
          DO I=1,sNx
           cg2d_s(I,J,bi,bj) = cg2d_q(I,J,bi,bj)
     &                       + cgBeta*cg2d_s(I,J,bi,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO

C--    Do exchanges that require messages i.e. between
C--    processes.
C      _EXCH_XY_RL( cg2d_s, myThid )
#ifdef LETS_MAKE_JAM
      CALL EXCH_XY_O1_R8_JAM( cg2d_s )
#else
      CALL EXCH_XY_RL( cg2d_s, myThid )
#endif

C==    Evaluate laplace operator on conjugate gradient vector
C==    q = A.s
       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         alphaTile(bi,bj) = 0. _d 0
#ifdef TARGET_NEC_SX
!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
#endif /* TARGET_NEC_SX */
         DO J=1,sNy
          DO I=1,sNx
c          ks = ksurfC(I,J,bi,bj)
           cg2d_q(I,J,bi,bj) =
     &     aW2d(I  ,J  ,bi,bj)*cg2d_s(I-1,J  ,bi,bj)
     &    +aW2d(I+1,J  ,bi,bj)*cg2d_s(I+1,J  ,bi,bj)
     &    +aS2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J-1,bi,bj)
     &    +aS2d(I  ,J+1,bi,bj)*cg2d_s(I  ,J+1,bi,bj)
     &    +aC2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
c    &    -aW2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
c    &    -aW2d(I+1,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
c    &    -aS2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
c    &    -aS2d(I  ,J+1,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
c    &    -freeSurfFac*_rA(i,j,bi,bj)*deepFac2F(ks)*recip_Bo(i,j,bi,bj)
c    &                        *cg2d_s(I  ,J  ,bi,bj)
c    &                        /deltaTMom/deltaTfreesurf*cg2dNorm
#ifdef CG2D_SINGLECPU_SUM
          localBuf(I,J,bi,bj) = cg2d_s(I,J,bi,bj)*cg2d_q(I,J,bi,bj)
#else
          alphaTile(bi,bj) = alphaTile(bi,bj)
     &                     + cg2d_s(I,J,bi,bj)*cg2d_q(I,J,bi,bj)
#endif
          ENDDO
         ENDDO
        ENDDO
       ENDDO
#ifdef CG2D_SINGLECPU_SUM
       CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, alpha, 0, 0, myThid)
#else
       CALL GLOBAL_SUM_TILE_RL( alphaTile,  alpha,  myThid )
#endif
CcnhDebugStarts
C      WRITE(*,*) ' CG2D: Iteration ',it2d-1,' SUM(s*q)= ',alpha
CcnhDebugEnds
       alpha = eta_qrN/alpha
CcnhDebugStarts
C      WRITE(*,*) ' CG2D: Iteration ',it2d-1,' alpha= ',alpha
CcnhDebugEnds

C==    Update solution and residual vectors
C      Now compute "interior" points.
       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         errTile(bi,bj) = 0. _d 0
         DO J=1,sNy
          DO I=1,sNx
           cg2d_x(I,J,bi,bj)=cg2d_x(I,J,bi,bj)+alpha*cg2d_s(I,J,bi,bj)
           cg2d_r(I,J,bi,bj)=cg2d_r(I,J,bi,bj)-alpha*cg2d_q(I,J,bi,bj)
#ifdef CG2D_SINGLECPU_SUM
           localBuf(I,J,bi,bj) = cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
#else
           errTile(bi,bj) = errTile(bi,bj)
     &                    + cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
#endif
          ENDDO
         ENDDO
        ENDDO
       ENDDO

#ifdef CG2D_SINGLECPU_SUM
       CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, err, 0, 0, myThid)
#else
       CALL GLOBAL_SUM_TILE_RL( errTile,    err,    myThid )
#endif
       err = SQRT(err)
       actualIts      = it2d
       actualResidual = err
       IF ( debugLevel.GT.debLevB ) THEN
c       IF ( DIFFERENT_MULTIPLE(monitorFreq,myTime,deltaTClock)
c    &     ) THEN
        _BEGIN_MASTER( myThid )
        WRITE(msgBuf,'(A,I6,A,1PE21.14)')
     &    ' cg2d: iter=', actualIts, ' ; resid.= ', actualResidual
        CALL PRINT_MESSAGE(msgBuf,standardMessageUnit,SQUEEZE_RIGHT,1)
        _END_MASTER( myThid )
c       ENDIF
       ENDIF
       IF ( err .LT. cg2dTolerance ) GOTO 11

#ifdef LETS_MAKE_JAM
      CALL EXCH_XY_O1_R8_JAM( cg2d_r )
#else
      CALL EXCH_XY_RL( cg2d_r, myThid )
#endif

   10 CONTINUE
   11 CONTINUE

      IF (cg2dNormaliseRHS) THEN
C--   Un-normalise the answer
        DO bj=myByLo(myThid),myByHi(myThid)
         DO bi=myBxLo(myThid),myBxHi(myThid)
          DO J=1,sNy
           DO I=1,sNx
            cg2d_x(I  ,J  ,bi,bj) = cg2d_x(I  ,J  ,bi,bj)/rhsNorm
           ENDDO
          ENDDO
         ENDDO
        ENDDO
      ENDIF

C     The following exchange was moved up to solve_for_pressure
C     for compatibility with TAMC.
C     _EXCH_XY_RL(cg2d_x, myThid )
c     _BEGIN_MASTER( myThid )
c      WRITE(*,'(A,I6,1PE30.14)') ' CG2D iters, err = ',
c    & actualIts, actualResidual
c     _END_MASTER( myThid )

C--   Return parameters to caller
      lastResidual=actualResidual
      numIters=actualIts

CcnhDebugStarts
C     CALL PLOT_FIELD_XYRL( cg2d_x, 'CALC_MOM_RHS CG2D_X' , 1, myThid )
C     err    = 0. _d 0
C     DO bj=myByLo(myThid),myByHi(myThid)
C      DO bi=myBxLo(myThid),myBxHi(myThid)
C       DO J=1,sNy
C        DO I=1,sNx
C         cg2d_r(I,J,bi,bj) = cg2d_b(I,J,bi,bj) -
C    &    (aW2d(I  ,J  ,bi,bj)*cg2d_x(I-1,J  ,bi,bj)
C    &    +aW2d(I+1,J  ,bi,bj)*cg2d_x(I+1,J  ,bi,bj)
C    &    +aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J-1,bi,bj)
C    &    +aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J+1,bi,bj)
C    &    -aW2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
C    &    -aW2d(I+1,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
C    &    -aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
C    &    -aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J  ,bi,bj))
C         err            = err            +
C    &     cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
C        ENDDO
C       ENDDO
C      ENDDO
C     ENDDO
C     _GLOBAL_SUM_RL( err   , myThid )
C     write(*,*) 'cg2d: Ax - b = ',SQRT(err)
CcnhDebugEnds

      RETURN
      END
1	C $Header: /u/gcmpack/MITgcm/model/src/cg2d.F,v 1.51 2009/04/28 18:01:14 jmc Exp $
2	C $Name: $
3
4	#include "CPP_OPTIONS.h"
5	#ifdef TARGET_NEC_SX
6	C set a sensible default for the outer loop unrolling parameter that can
7	C be overriden in the Makefile with the DEFINES macro or in CPP_OPTIONS.h
8	#ifndef CG2D_OUTERLOOPITERS
9	# define CG2D_OUTERLOOPITERS 10
10	#endif
11	#endif /* TARGET_NEC_SX */
12
13	CBOP
14	C !ROUTINE: CG2D
15	C !INTERFACE:
16	SUBROUTINE CG2D(
17	I cg2d_b,
18	U cg2d_x,
19	O firstResidual,
20	O lastResidual,
21	U numIters,
22	I myThid )
23	C !DESCRIPTION: \bv
24	C ==========================================================
25	C \| SUBROUTINE CG2D
26	C \| o Two-dimensional grid problem conjugate-gradient
27	C \| inverter (with preconditioner).
28	C ==========================================================
29	C \| Con. grad is an iterative procedure for solving Ax = b.
30	C \| It requires the A be symmetric.
31	C \| This implementation assumes A is a five-diagonal
32	C \| matrix of the form that arises in the discrete
33	C \| representation of the del^2 operator in a
34	C \| two-dimensional space.
35	C \| Notes:
36	C \| ======
37	C \| This implementation can support shared-memory
38	C \| multi-threaded execution. In order to do this COMMON
39	C \| blocks are used for many of the arrays - even ones that
40	C \| are only used for intermedaite results. This design is
41	C \| OK if you want to all the threads to collaborate on
42	C \| solving the same problem. On the other hand if you want
43	C \| the threads to solve several different problems
44	C \| concurrently this implementation will not work.
45	C ==========================================================
46	C \ev
47
48	C !USES:
49	IMPLICIT NONE
50	C === Global data ===
51	#include "SIZE.h"
52	#include "EEPARAMS.h"
53	#include "PARAMS.h"
54	#include "CG2D.h"
55	c#include "GRID.h"
56	c#include "SURFACE.h"
57
58	C !INPUT/OUTPUT PARAMETERS:
59	C === Routine arguments ===
60	C myThid :: Thread on which I am working.
61	C cg2d_b :: The source term or "right hand side"
62	C cg2d_x :: The solution
63	C firstResidual :: the initial residual before any iterations
64	C lastResidual :: the actual residual reached
65	C numIters :: Entry: the maximum number of iterations allowed
66	C Exit: the actual number of iterations used
67	_RL cg2d_b(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
68	_RL cg2d_x(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
69	_RL firstResidual
70	_RL lastResidual
71	INTEGER numIters
72	INTEGER myThid
73
74	C !LOCAL VARIABLES:
75	C === Local variables ====
76	C actualIts :: Number of iterations taken
77	C actualResidual :: residual
78	C bi, bj :: Block index in X and Y.
79	C eta_qrN :: Used in computing search directions
80	C eta_qrNM1 suffix N and NM1 denote current and
81	C cgBeta previous iterations respectively.
82	C alpha
83	C sumRHS :: Sum of right-hand-side. Sometimes this is a
84	C useful debuggin/trouble shooting diagnostic.
85	C For neumann problems sumRHS needs to be ~0.
86	C or they converge at a non-zero residual.
87	C err :: Measure of residual of Ax - b, usually the norm.
88	C I, J, it2d :: Loop counters ( it2d counts CG iterations )
89	INTEGER actualIts
90	_RL actualResidual
91	INTEGER bi, bj
92	INTEGER I, J, it2d
93	c INTEGER ks
94	_RL err, errTile(nSx,nSy)
95	_RL eta_qrN,eta_qrNtile(nSx,nSy)
96	_RL eta_qrNM1
97	_RL cgBeta
98	_RL alpha, alphaTile(nSx,nSy)
99	_RL sumRHS, sumRHStile(nSx,nSy)
100	_RL rhsMax
101	_RL rhsNorm
102	#ifdef CG2D_SINGLECPU_SUM
103	_RL localBuf(1:sNx,1:sNy,nSx,nSy)
104	#endif
105	CHARACTER*(MAX_LEN_MBUF) msgBuf
106	CEOP
107
108
109	CcnhDebugStarts
110	C CHARACTER*(MAX_LEN_FNAM) suff
111	CcnhDebugEnds
112
113
114	C-- Initialise inverter
115	eta_qrNM1 = 1. _d 0
116
117	CcnhDebugStarts
118	C _EXCH_XY_RL( cg2d_b, myThid )
119	C CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.0 CG2D_B' , 1, myThid )
120	C suff = 'unnormalised'
121	C CALL WRITE_FLD_XY_RL ( 'cg2d_b.',suff, cg2d_b, 1, myThid)
122	C STOP
123	CcnhDebugEnds
124
125	C-- Normalise RHS
126	rhsMax = 0. _d 0
127	DO bj=myByLo(myThid),myByHi(myThid)
128	DO bi=myBxLo(myThid),myBxHi(myThid)
129	DO J=1,sNy
130	DO I=1,sNx
131	cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*cg2dNorm
132	rhsMax = MAX(ABS(cg2d_b(I,J,bi,bj)),rhsMax)
133	ENDDO
134	ENDDO
135	ENDDO
136	ENDDO
137
138	IF (cg2dNormaliseRHS) THEN
139	C- Normalise RHS :
140	#ifdef LETS_MAKE_JAM
141	C _GLOBAL_MAX_RL( rhsMax, myThid )
142	rhsMax=1.
143	#else
144	_GLOBAL_MAX_RL( rhsMax, myThid )
145	#endif
146	rhsNorm = 1. _d 0
147	IF ( rhsMax .NE. 0. ) rhsNorm = 1. _d 0 / rhsMax
148	DO bj=myByLo(myThid),myByHi(myThid)
149	DO bi=myBxLo(myThid),myBxHi(myThid)
150	DO J=1,sNy
151	DO I=1,sNx
152	cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*rhsNorm
153	cg2d_x(I,J,bi,bj) = cg2d_x(I,J,bi,bj)*rhsNorm
154	ENDDO
155	ENDDO
156	ENDDO
157	ENDDO
158	C- end Normalise RHS
159	ENDIF
160
161	C-- Update overlaps
162	_EXCH_XY_RL( cg2d_b, myThid )
163	_EXCH_XY_RL( cg2d_x, myThid )
164	CcnhDebugStarts
165	C CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.1 CG2D_B' , 1, myThid )
166	C suff = 'normalised'
167	C CALL WRITE_FLD_XY_RL ( 'cg2d_b.',suff, cg2d_b, 1, myThid)
168	CcnhDebugEnds
169
170	C-- Initial residual calculation
171	DO bj=myByLo(myThid),myByHi(myThid)
172	DO bi=myBxLo(myThid),myBxHi(myThid)
173	sumRHStile(bi,bj) = 0. _d 0
174	errTile(bi,bj) = 0. _d 0
175	#ifdef TARGET_NEC_SX
176	!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
177	#endif /* TARGET_NEC_SX */
178	DO J=1,sNy
179	DO I=1,sNx
180	c ks = ksurfC(I,J,bi,bj)
181	cg2d_s(I,J,bi,bj) = 0.
182	cg2d_r(I,J,bi,bj) = cg2d_b(I,J,bi,bj) -
183	& (aW2d(I ,J ,bi,bj)*cg2d_x(I-1,J ,bi,bj)
184	& +aW2d(I+1,J ,bi,bj)*cg2d_x(I+1,J ,bi,bj)
185	& +aS2d(I ,J ,bi,bj)*cg2d_x(I ,J-1,bi,bj)
186	& +aS2d(I ,J+1,bi,bj)*cg2d_x(I ,J+1,bi,bj)
187	& +aC2d(I ,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
188	& )
189	c & -aW2d(I ,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
190	c & -aW2d(I+1,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
191	c & -aS2d(I ,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
192	c & -aS2d(I ,J+1,bi,bj)*cg2d_x(I ,J ,bi,bj)
193	c & -freeSurfFac_rA(i,j,bi,bj)deepFac2F(ks)*recip_Bo(i,j,bi,bj)
194	c & *cg2d_x(I ,J ,bi,bj)
195	c & /deltaTMom/deltaTfreesurf*cg2dNorm
196	c & )
197	#ifdef CG2D_SINGLECPU_SUM
198	localBuf(I,J,bi,bj) = cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
199	#else
200	errTile(bi,bj) = errTile(bi,bj)
201	& + cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
202	sumRHStile(bi,bj) = sumRHStile(bi,bj) + cg2d_b(I,J,bi,bj)
203	#endif
204	ENDDO
205	ENDDO
206	ENDDO
207	ENDDO
208	#ifdef LETS_MAKE_JAM
209	CALL EXCH_XY_O1_R8_JAM( cg2d_r )
210	#else
211	CALL EXCH_XY_RL( cg2d_r, myThid )
212	#endif
213	#ifdef LETS_MAKE_JAM
214	CALL EXCH_XY_O1_R8_JAM( cg2d_s )
215	#else
216	CALL EXCH_XY_RL( cg2d_s, myThid )
217	#endif
218	#ifdef CG2D_SINGLECPU_SUM
219	CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, err, 0, 0, myThid)
220	CALL GLOBAL_SUM_SINGLECPU_RL(cg2d_b, sumRHS, OLx, OLy, myThid)
221	#else
222	CALL GLOBAL_SUM_TILE_RL( errTile, err, myThid )
223	CALL GLOBAL_SUM_TILE_RL( sumRHStile, sumRHS, myThid )
224	#endif
225	err = SQRT(err)
226	actualIts = 0
227	actualResidual = err
228
229	IF ( debugLevel .GE. debLevZero ) THEN
230	_BEGIN_MASTER( myThid )
231	WRITE(standardmessageunit,'(A,1P2E22.14)')
232	& ' cg2d: Sum(rhs),rhsMax = ', sumRHS,rhsMax
233	_END_MASTER( myThid )
234	ENDIF
235	C _BARRIER
236	c _BEGIN_MASTER( myThid )
237	c WRITE(standardmessageunit,'(A,I6,1PE30.14)')
238	c & ' CG2D iters, err = ',
239	c & actualIts, actualResidual
240	c _END_MASTER( myThid )
241	firstResidual=actualResidual
242
243	IF ( err .LT. cg2dTolerance ) GOTO 11
244
245	C >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
246	DO 10 it2d=1, numIters
247
248	CcnhDebugStarts
249	C WRITE(,) ' CG2D: Iteration ',it2d-1,' residual = ',
250	C & actualResidual
251	CcnhDebugEnds
252	C-- Solve preconditioning equation and update
253	C-- conjugate direction vector "s".
254	DO bj=myByLo(myThid),myByHi(myThid)
255	DO bi=myBxLo(myThid),myBxHi(myThid)
256	eta_qrNtile(bi,bj) = 0. _d 0
257	#ifdef TARGET_NEC_SX
258	!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
259	#endif /* TARGET_NEC_SX */
260	DO J=1,sNy
261	DO I=1,sNx
262	cg2d_q(I,J,bi,bj) =
263	& pC(I ,J ,bi,bj)*cg2d_r(I ,J ,bi,bj)
264	& +pW(I ,J ,bi,bj)*cg2d_r(I-1,J ,bi,bj)
265	& +pW(I+1,J ,bi,bj)*cg2d_r(I+1,J ,bi,bj)
266	& +pS(I ,J ,bi,bj)*cg2d_r(I ,J-1,bi,bj)
267	& +pS(I ,J+1,bi,bj)*cg2d_r(I ,J+1,bi,bj)
268	CcnhDebugStarts
269	C cg2d_q(I,J,bi,bj) = cg2d_r(I ,J ,bi,bj)
270	CcnhDebugEnds
271	#ifdef CG2D_SINGLECPU_SUM
272	localBuf(I,J,bi,bj) =
273	& cg2d_q(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
274	#else
275	eta_qrNtile(bi,bj) = eta_qrNtile(bi,bj)
276	& +cg2d_q(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
277	#endif
278	ENDDO
279	ENDDO
280	ENDDO
281	ENDDO
282
283	#ifdef CG2D_SINGLECPU_SUM
284	CALL GLOBAL_SUM_SINGLECPU_RL( localBuf,eta_qrN,0,0,myThid )
285	#else
286	CALL GLOBAL_SUM_TILE_RL( eta_qrNtile,eta_qrN,myThid )
287	#endif
288	CcnhDebugStarts
289	C WRITE(,) ' CG2D: Iteration ',it2d-1,' eta_qrN = ',eta_qrN
290	CcnhDebugEnds
291	cgBeta = eta_qrN/eta_qrNM1
292	CcnhDebugStarts
293	C WRITE(,) ' CG2D: Iteration ',it2d-1,' beta = ',cgBeta
294	CcnhDebugEnds
295	eta_qrNM1 = eta_qrN
296
297	DO bj=myByLo(myThid),myByHi(myThid)
298	DO bi=myBxLo(myThid),myBxHi(myThid)
299	DO J=1,sNy
300	DO I=1,sNx
301	cg2d_s(I,J,bi,bj) = cg2d_q(I,J,bi,bj)
302	& + cgBeta*cg2d_s(I,J,bi,bj)
303	ENDDO
304	ENDDO
305	ENDDO
306	ENDDO
307
308	C-- Do exchanges that require messages i.e. between
309	C-- processes.
310	C _EXCH_XY_RL( cg2d_s, myThid )
311	#ifdef LETS_MAKE_JAM
312	CALL EXCH_XY_O1_R8_JAM( cg2d_s )
313	#else
314	CALL EXCH_XY_RL( cg2d_s, myThid )
315	#endif
316
317	C== Evaluate laplace operator on conjugate gradient vector
318	C== q = A.s
319	DO bj=myByLo(myThid),myByHi(myThid)
320	DO bi=myBxLo(myThid),myBxHi(myThid)
321	alphaTile(bi,bj) = 0. _d 0
322	#ifdef TARGET_NEC_SX
323	!CDIR OUTERUNROLL=CG2D_OUTERLOOPITERS
324	#endif /* TARGET_NEC_SX */
325	DO J=1,sNy
326	DO I=1,sNx
327	c ks = ksurfC(I,J,bi,bj)
328	cg2d_q(I,J,bi,bj) =
329	& aW2d(I ,J ,bi,bj)*cg2d_s(I-1,J ,bi,bj)
330	& +aW2d(I+1,J ,bi,bj)*cg2d_s(I+1,J ,bi,bj)
331	& +aS2d(I ,J ,bi,bj)*cg2d_s(I ,J-1,bi,bj)
332	& +aS2d(I ,J+1,bi,bj)*cg2d_s(I ,J+1,bi,bj)
333	& +aC2d(I ,J ,bi,bj)*cg2d_s(I ,J ,bi,bj)
334	c & -aW2d(I ,J ,bi,bj)*cg2d_s(I ,J ,bi,bj)
335	c & -aW2d(I+1,J ,bi,bj)*cg2d_s(I ,J ,bi,bj)
336	c & -aS2d(I ,J ,bi,bj)*cg2d_s(I ,J ,bi,bj)
337	c & -aS2d(I ,J+1,bi,bj)*cg2d_s(I ,J ,bi,bj)
338	c & -freeSurfFac_rA(i,j,bi,bj)deepFac2F(ks)*recip_Bo(i,j,bi,bj)
339	c & *cg2d_s(I ,J ,bi,bj)
340	c & /deltaTMom/deltaTfreesurf*cg2dNorm
341	#ifdef CG2D_SINGLECPU_SUM
342	localBuf(I,J,bi,bj) = cg2d_s(I,J,bi,bj)*cg2d_q(I,J,bi,bj)
343	#else
344	alphaTile(bi,bj) = alphaTile(bi,bj)
345	& + cg2d_s(I,J,bi,bj)*cg2d_q(I,J,bi,bj)
346	#endif
347	ENDDO
348	ENDDO
349	ENDDO
350	ENDDO
351	#ifdef CG2D_SINGLECPU_SUM
352	CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, alpha, 0, 0, myThid)
353	#else
354	CALL GLOBAL_SUM_TILE_RL( alphaTile, alpha, myThid )
355	#endif
356	CcnhDebugStarts
357	C WRITE(,) ' CG2D: Iteration ',it2d-1,' SUM(s*q)= ',alpha
358	CcnhDebugEnds
359	alpha = eta_qrN/alpha
360	CcnhDebugStarts
361	C WRITE(,) ' CG2D: Iteration ',it2d-1,' alpha= ',alpha
362	CcnhDebugEnds
363
364	C== Update solution and residual vectors
365	C Now compute "interior" points.
366	DO bj=myByLo(myThid),myByHi(myThid)
367	DO bi=myBxLo(myThid),myBxHi(myThid)
368	errTile(bi,bj) = 0. _d 0
369	DO J=1,sNy
370	DO I=1,sNx
371	cg2d_x(I,J,bi,bj)=cg2d_x(I,J,bi,bj)+alpha*cg2d_s(I,J,bi,bj)
372	cg2d_r(I,J,bi,bj)=cg2d_r(I,J,bi,bj)-alpha*cg2d_q(I,J,bi,bj)
373	#ifdef CG2D_SINGLECPU_SUM
374	localBuf(I,J,bi,bj) = cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
375	#else
376	errTile(bi,bj) = errTile(bi,bj)
377	& + cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
378	#endif
379	ENDDO
380	ENDDO
381	ENDDO
382	ENDDO
383
384	#ifdef CG2D_SINGLECPU_SUM
385	CALL GLOBAL_SUM_SINGLECPU_RL(localBuf, err, 0, 0, myThid)
386	#else
387	CALL GLOBAL_SUM_TILE_RL( errTile, err, myThid )
388	#endif
389	err = SQRT(err)
390	actualIts = it2d
391	actualResidual = err
392	IF ( debugLevel.GT.debLevB ) THEN
393	c IF ( DIFFERENT_MULTIPLE(monitorFreq,myTime,deltaTClock)
394	c & ) THEN
395	_BEGIN_MASTER( myThid )
396	WRITE(msgBuf,'(A,I6,A,1PE21.14)')
397	& ' cg2d: iter=', actualIts, ' ; resid.= ', actualResidual
398	CALL PRINT_MESSAGE(msgBuf,standardMessageUnit,SQUEEZE_RIGHT,1)
399	_END_MASTER( myThid )
400	c ENDIF
401	ENDIF
402	IF ( err .LT. cg2dTolerance ) GOTO 11
403
404	#ifdef LETS_MAKE_JAM
405	CALL EXCH_XY_O1_R8_JAM( cg2d_r )
406	#else
407	CALL EXCH_XY_RL( cg2d_r, myThid )
408	#endif
409
410	10 CONTINUE
411	11 CONTINUE
412
413	IF (cg2dNormaliseRHS) THEN
414	C-- Un-normalise the answer
415	DO bj=myByLo(myThid),myByHi(myThid)
416	DO bi=myBxLo(myThid),myBxHi(myThid)
417	DO J=1,sNy
418	DO I=1,sNx
419	cg2d_x(I ,J ,bi,bj) = cg2d_x(I ,J ,bi,bj)/rhsNorm
420	ENDDO
421	ENDDO
422	ENDDO
423	ENDDO
424	ENDIF
425
426	C The following exchange was moved up to solve_for_pressure
427	C for compatibility with TAMC.
428	C _EXCH_XY_RL(cg2d_x, myThid )
429	c _BEGIN_MASTER( myThid )
430	c WRITE(*,'(A,I6,1PE30.14)') ' CG2D iters, err = ',
431	c & actualIts, actualResidual
432	c _END_MASTER( myThid )
433
434	C-- Return parameters to caller
435	lastResidual=actualResidual
436	numIters=actualIts
437
438	CcnhDebugStarts
439	C CALL PLOT_FIELD_XYRL( cg2d_x, 'CALC_MOM_RHS CG2D_X' , 1, myThid )
440	C err = 0. _d 0
441	C DO bj=myByLo(myThid),myByHi(myThid)
442	C DO bi=myBxLo(myThid),myBxHi(myThid)
443	C DO J=1,sNy
444	C DO I=1,sNx
445	C cg2d_r(I,J,bi,bj) = cg2d_b(I,J,bi,bj) -
446	C & (aW2d(I ,J ,bi,bj)*cg2d_x(I-1,J ,bi,bj)
447	C & +aW2d(I+1,J ,bi,bj)*cg2d_x(I+1,J ,bi,bj)
448	C & +aS2d(I ,J ,bi,bj)*cg2d_x(I ,J-1,bi,bj)
449	C & +aS2d(I ,J+1,bi,bj)*cg2d_x(I ,J+1,bi,bj)
450	C & -aW2d(I ,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
451	C & -aW2d(I+1,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
452	C & -aS2d(I ,J ,bi,bj)*cg2d_x(I ,J ,bi,bj)
453	C & -aS2d(I ,J+1,bi,bj)*cg2d_x(I ,J ,bi,bj))
454	C err = err +
455	C & cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
456	C ENDDO
457	C ENDDO
458	C ENDDO
459	C ENDDO
460	C _GLOBAL_SUM_RL( err , myThid )
461	C write(,) 'cg2d: Ax - b = ',SQRT(err)
462	CcnhDebugEnds
463
464	RETURN
465	END