--- MITgcm/model/src/cg2d_nsa.F	2006/06/07 01:45:42	1.1
+++ MITgcm/model/src/cg2d_nsa.F	2012/07/17 21:31:52	1.5
@@ -1,50 +1,44 @@
-C $Header: /home/ubuntu/mnt/e9_copy/MITgcm/model/src/cg2d_nsa.F,v 1.1 2006/06/07 01:45:42 heimbach Exp $
+C $Header: /home/ubuntu/mnt/e9_copy/MITgcm/model/src/cg2d_nsa.F,v 1.5 2012/07/17 21:31:52 jmc Exp $
 C $Name:  $
 
 #include "CPP_OPTIONS.h"
-#ifdef ALLOW_USE_MPI
-C HACK to avoid global_max
-# define ALLOW_CONST_RHSMAX
-#endif 
 
 CML THIS DOES NOT WORK +++++
 #undef ALLOW_LOOP_DIRECTIVE
 CBOP
 C     !ROUTINE: CG2D_NSA
 C     !INTERFACE:
-      SUBROUTINE CG2D_NSA(  
-     I                cg2d_b,
-     U                cg2d_x,
-     O                firstResidual,
-     O                lastResidual,
-     U                numIters,
+      SUBROUTINE CG2D_NSA(
+     U                cg2d_b, cg2d_x,
+     O                firstResidual, minResidualSq, lastResidual,
+     U                numIters, nIterMin,
      I                myThid )
 C     !DESCRIPTION: \bv
 C     *==========================================================*
-C     | SUBROUTINE CG2D_NSA                                           
-C     | o Two-dimensional grid problem conjugate-gradient         
-C     |   inverter (with preconditioner).                         
+C     | SUBROUTINE CG2D_NSA
+C     | o Two-dimensional grid problem conjugate-gradient
+C     |   inverter (with preconditioner).
 C     | o This version is used only in the case when the matrix
-C     |   operator is not "self-adjoint" (NSA). Any remaining 
+C     |   operator is not "self-adjoint" (NSA). Any remaining
 C     |   residuals will immediately reported to the department
 C     |   of homeland security.
 C     *==========================================================*
-C     | Con. grad is an iterative procedure for solving Ax = b.   
-C     | It requires the A be symmetric.                           
-C     | This implementation assumes A is a five-diagonal          
-C     | matrix of the form that arises in the discrete            
-C     | representation of the del^2 operator in a                 
-C     | two-dimensional space.                                    
-C     | Notes:                                                    
-C     | ======                                                    
-C     | This implementation can support shared-memory              
-C     | multi-threaded execution. In order to do this COMMON       
-C     | blocks are used for many of the arrays - even ones that    
-C     | are only used for intermedaite results. This design is     
-C     | OK if you want to all the threads to collaborate on        
-C     | solving the same problem. On the other hand if you want    
-C     | the threads to solve several different problems            
-C     | concurrently this implementation will not work.           
+C     | Con. grad is an iterative procedure for solving Ax = b.
+C     | It requires the A be symmetric.
+C     | This implementation assumes A is a five-diagonal
+C     | matrix of the form that arises in the discrete
+C     | representation of the del^2 operator in a
+C     | two-dimensional space.
+C     | Notes:
+C     | ======
+C     | This implementation can support shared-memory
+C     | multi-threaded execution. In order to do this COMMON
+C     | blocks are used for many of the arrays - even ones that
+C     | are only used for intermedaite results. This design is
+C     | OK if you want to all the threads to collaborate on
+C     | solving the same problem. On the other hand if you want
+C     | the threads to solve several different problems
+C     | concurrently this implementation will not work.
 C     *==========================================================*
 C     \ev
 
@@ -54,9 +48,7 @@
 #include "SIZE.h"
 #include "EEPARAMS.h"
 #include "PARAMS.h"
-#include "GRID.h"
 #include "CG2D.h"
-#include "SURFACE.h"
 #ifdef ALLOW_AUTODIFF_TAMC
 # include "tamc.h"
 # include "tamc_keys.h"
@@ -64,71 +56,75 @@
 
 C     !INPUT/OUTPUT PARAMETERS:
 C     === Routine arguments ===
-C     myThid    - Thread on which I am working.
-C     cg2d_b    - The source term or "right hand side"
-C     cg2d_x    - The solution
-C     firstResidual - the initial residual before any iterations
-C     lastResidual  - the actual residual reached
-C     numIters  - Entry: the maximum number of iterations allowed
-C                 Exit:  the actual number of iterations used
+C     cg2d_b    :: The source term or "right hand side" (Output: normalised RHS)
+C     cg2d_x    :: The solution (Input: first guess)
+C     firstResidual :: the initial residual before any iterations
+C     minResidualSq :: the lowest residual reached (squared)
+C     lastResidual  :: the actual residual reached
+C     numIters  :: Inp: the maximum number of iterations allowed
+C                  Out: the actual number of iterations used
+C     nIterMin  :: Inp: decide to store (if >=0) or not (if <0) lowest res. sol.
+C                  Out: iteration number corresponding to lowest residual
+C     myThid    :: Thread on which I am working.
       _RL  cg2d_b(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
       _RL  cg2d_x(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
       _RL  firstResidual
+      _RL  minResidualSq
       _RL  lastResidual
       INTEGER numIters
+      INTEGER nIterMin
       INTEGER myThid
 
 #ifdef ALLOW_CG2D_NSA
 C     !LOCAL VARIABLES:
 C     === Local variables ====
-C     actualIts      - Number of iterations taken
-C     actualResidual - residual
-C     bi          - Block index in X and Y.
-C     bj
-C     eta_qrN     - Used in computing search directions
-C     eta_qrNM1     suffix N and NM1 denote current and
-C     cgBeta        previous iterations respectively.
-C     recip_eta_qrNM1 reciprocal of eta_qrNM1
-C     alpha  
-C     alpha_aux   - to avoid the statement: alpha = 1./alpha (for TAMC/TAF)
-C     sumRHS      - Sum of right-hand-side. Sometimes this is a
-C                   useful debuggin/trouble shooting diagnostic.
-C                   For neumann problems sumRHS needs to be ~0.
-C                   or they converge at a non-zero residual.
-C     err         - Measure of residual of Ax - b, usually the norm.
-C     err_sq      - square of err (for TAMC/TAF)
-C     I, J, N     - Loop counters ( N counts CG iterations )
+C     bi, bj     :: tile index in X and Y.
+C     i, j, it2d :: Loop counters ( it2d counts CG iterations )
+C     actualIts  :: actual CG iteration number
+C     err_sq     :: Measure of the square of the residual of Ax - b.
+C     eta_qrN    :: Used in computing search directions; suffix N and NM1
+C     eta_qrNM1     denote current and previous iterations respectively.
+C     recip_eta_qrNM1 :: reciprocal of eta_qrNM1
+C     cgBeta     :: coeff used to update conjugate direction vector "s".
+C     alpha      :: coeff used to update solution & residual
+C     alphaSum   :: to avoid the statement: alpha = 1./alpha (for TAMC/TAF)
+C     sumRHS     :: Sum of right-hand-side. Sometimes this is a useful
+C                   debugging/trouble shooting diagnostic. For neumann problems
+C                   sumRHS needs to be ~0 or it converge at a non-zero residual.
+C     cg2d_min   :: used to store solution corresponding to lowest residual.
+C     msgBuf     :: Informational/error message buffer
+      INTEGER bi, bj
+      INTEGER i, j, it2d
       INTEGER actualIts
-      _RL    actualResidual
-      INTEGER bi, bj              
-      INTEGER I, J, it2d
-
-      _RL    err
-      _RL    err_sq
-      _RL    eta_qrN
-      _RL    eta_qrNM1
-      _RL    recip_eta_qrNM1
-      _RL    cgBeta
-      _RL    alpha
-      _RL    alpha_aux
-      _RL    sumRHS
-      _RL    rhsMax, rhsMaxGlobal
-      _RL    rhsNorm
       _RL    cg2dTolerance_sq
+      _RL    err_sq,  errTile(nSx,nSy)
+      _RL    eta_qrN, eta_qrNtile(nSx,nSy)
+      _RL    eta_qrNM1, recip_eta_qrNM1
+      _RL    cgBeta,  alpha
+      _RL    alphaSum,alphaTile(nSx,nSy)
+      _RL    sumRHS,  sumRHStile(nSx,nSy)
+      _RL    rhsMax,  rhsNorm
+      CHARACTER*(MAX_LEN_MBUF) msgBuf
+      LOGICAL printResidual
 CEOP
 
 #ifdef ALLOW_AUTODIFF_TAMC
       IF ( numIters .GT. numItersMax ) THEN
-         WRITE(standardMessageUnit,'(A,I10)') 
-     &        'CG2D_NSA: numIters > numItersMax = ', numItersMax
-         STOP 'NON-NORMAL in CG2D_NSA'
+        WRITE(msgBuf,'(A,I10)')
+     &    'CG2D_NSA: numIters > numItersMax =', numItersMax
+        CALL PRINT_ERROR( msgBuf, myThid )
+        STOP 'ABNORMAL END: S/R CG2D_NSA'
+      ENDIF
+      IF ( cg2dNormaliseRHS ) THEN
+        WRITE(msgBuf,'(A)') 'CG2D_NSA: cg2dNormaliseRHS is disabled'
+        CALL PRINT_ERROR( msgBuf, myThid )
+        WRITE(msgBuf,'(A)')
+     &    'set cg2dTargetResWunit (instead of cg2dTargetResidual)'
+        CALL PRINT_ERROR( msgBuf, myThid )
+        STOP 'ABNORMAL END: S/R CG2D_NSA'
       ENDIF
 #endif /* ALLOW_AUTODIFF_TAMC */
 
-CcnhDebugStarts
-C     CHARACTER*(MAX_LEN_FNAM) suff
-CcnhDebugEnds
-
 #ifdef ALLOW_AUTODIFF_TAMC
           act1 = myThid - 1
           max1 = nTx*nTy
@@ -136,145 +132,98 @@
           ikey = (act1 + 1) + act2*max1
 #endif /* ALLOW_AUTODIFF_TAMC */
 
-C--   Initialise inverter
-      eta_qrNM1 = 1. _d 0
-      recip_eta_qrNM1 = 1./eta_qrNM1
-
-CcnhDebugStarts
-C     _EXCH_XY_R8( cg2d_b, myThid )
-C     CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.0 CG2D_B' , 1, myThid )
-C     suff = 'unnormalised'
-C     CALL WRITE_FLD_XY_RL (  'cg2d_b.',suff,    cg2d_b, 1, myThid)
-C     STOP
-CcnhDebugEnds
+C--   Initialise auxiliary constant, some output variable and inverter
+      cg2dTolerance_sq = cg2dTolerance*cg2dTolerance
+      minResidualSq = -1. _d 0
+      eta_qrNM1     =  1. _d 0
+      recip_eta_qrNM1= 1. _d 0
 
 C--   Normalise RHS
-#ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE cg2d_b = comlev1_cg2d, key = ikey, byte = isbyte 
-#endif /* ALLOW_AUTODIFF_TAMC */
       rhsMax = 0. _d 0
       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
-        DO J=1,sNy
-         DO I=1,sNx
-          cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*cg2dNorm
-          rhsMax = MAX(ABS(cg2d_b(I,J,bi,bj)),rhsMax)
+        DO j=1,sNy
+         DO i=1,sNx
+          cg2d_b(i,j,bi,bj) = cg2d_b(i,j,bi,bj)*cg2dNorm
+          rhsMax = MAX(ABS(cg2d_b(i,j,bi,bj)),rhsMax)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
 
+#ifndef ALLOW_AUTODIFF_TAMC
       IF (cg2dNormaliseRHS) THEN
-C     -  Normalise RHS :
-#ifdef LETS_MAKE_JAM
-C     _GLOBAL_MAX_R8( rhsMax, myThid )
-      rhsMaxGlobal=1.
-#else
-#ifdef ALLOW_CONST_RHSMAX
-      rhsMaxGlobal=1.
-#else
-      rhsMaxGlobal=rhsMax
-      _GLOBAL_MAX_R8( rhsMaxGlobal, myThid )
-#endif /* ALLOW_CONST_RHSMAX */
-#endif
-#ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE rhsNorm = comlev1_cg2d, key = ikey, byte = isbyte 
-#endif /* ALLOW_AUTODIFF_TAMC */
-      IF ( rhsMaxGlobal .NE. 0. ) THEN
-       rhsNorm = 1. _d 0 / rhsMaxGlobal
-      ELSE
+C-    Normalise RHS :
+       _GLOBAL_MAX_RL( rhsMax, myThid )
        rhsNorm = 1. _d 0
-      ENDIF
-
-#ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE cg2d_b = comlev1_cg2d, key = ikey, byte = isbyte 
-CADJ STORE cg2d_x = comlev1_cg2d, key = ikey, byte = isbyte 
-#endif /* ALLOW_AUTODIFF_TAMC */
-      DO bj=myByLo(myThid),myByHi(myThid)
-       DO bi=myBxLo(myThid),myBxHi(myThid)
-        DO J=1,sNy
-         DO I=1,sNx
-          cg2d_b(I,J,bi,bj) = cg2d_b(I,J,bi,bj)*rhsNorm
-          cg2d_x(I,J,bi,bj) = cg2d_x(I,J,bi,bj)*rhsNorm
+       IF ( rhsMax .NE. 0. ) rhsNorm = 1. _d 0 / rhsMax
+       DO bj=myByLo(myThid),myByHi(myThid)
+        DO bi=myBxLo(myThid),myBxHi(myThid)
+         DO j=1,sNy
+          DO i=1,sNx
+           cg2d_b(i,j,bi,bj) = cg2d_b(i,j,bi,bj)*rhsNorm
+           cg2d_x(i,j,bi,bj) = cg2d_x(i,j,bi,bj)*rhsNorm
+          ENDDO
          ENDDO
         ENDDO
        ENDDO
-      ENDDO
-C- end Normalise RHS
+C-    end Normalise RHS
       ENDIF
+#endif /* ndef ALLOW_AUTODIFF_TAMC */
 
 C--   Update overlaps
-      _EXCH_XY_R8( cg2d_b, myThid )
-      _EXCH_XY_R8( cg2d_x, myThid )
-CcnhDebugStarts
-C     CALL PLOT_FIELD_XYRL( cg2d_b, 'CG2D.1 CG2D_B' , 1, myThid )
-C     suff = 'normalised'
-C     CALL WRITE_FLD_XY_RL (  'cg2d_b.',suff,    cg2d_b, 1, myThid)
-CcnhDebugEnds
+      CALL EXCH_XY_RL( cg2d_x, myThid )
 
 C--   Initial residual calculation
-      err = 0. _d 0
-      err_sq = 0. _d 0
-      sumRHS = 0. _d 0
 #ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE cg2d_b = comlev1_cg2d, key = ikey, byte = isbyte 
-CADJ STORE cg2d_x = comlev1_cg2d, key = ikey, byte = isbyte 
+CADJ STORE cg2d_b = comlev1_cg2d, key = ikey, byte = isbyte
+CADJ STORE cg2d_x = comlev1_cg2d, key = ikey, byte = isbyte
 #endif /* ALLOW_AUTODIFF_TAMC */
       DO bj=myByLo(myThid),myByHi(myThid)
        DO bi=myBxLo(myThid),myBxHi(myThid)
-        DO J=1,sNy
-         DO I=1,sNx
-          cg2d_s(I,J,bi,bj) = 0.
-          cg2d_r(I,J,bi,bj) = cg2d_b(I,J,bi,bj) -
-     &    (aW2d(I  ,J  ,bi,bj)*cg2d_x(I-1,J  ,bi,bj)
-     &    +aW2d(I+1,J  ,bi,bj)*cg2d_x(I+1,J  ,bi,bj)
-     &    +aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J-1,bi,bj)
-     &    +aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J+1,bi,bj)
-     &    -aW2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
-     &    -aW2d(I+1,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
-     &    -aS2d(I  ,J  ,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
-     &    -aS2d(I  ,J+1,bi,bj)*cg2d_x(I  ,J  ,bi,bj)
-     &    -freeSurfFac*_rA(i,j,bi,bj)*recip_Bo(i,j,bi,bj)* 
-     &     cg2d_x(I  ,J  ,bi,bj)/deltaTMom/deltaTfreesurf*cg2dNorm
-CML     &     cg2d_x(I  ,J  ,bi,bj)/deltaTMom/deltaTMom*cg2dNorm
+        errTile(bi,bj)    = 0. _d 0
+        sumRHStile(bi,bj) = 0. _d 0
+        DO j=0,sNy+1
+         DO i=0,sNx+1
+          cg2d_s(i,j,bi,bj) = 0.
+         ENDDO
+        ENDDO
+        DO j=1,sNy
+         DO i=1,sNx
+          cg2d_r(i,j,bi,bj) = cg2d_b(i,j,bi,bj) -
+     &    (aW2d(i  ,j  ,bi,bj)*cg2d_x(i-1,j  ,bi,bj)
+     &    +aW2d(i+1,j  ,bi,bj)*cg2d_x(i+1,j  ,bi,bj)
+     &    +aS2d(i  ,j  ,bi,bj)*cg2d_x(i  ,j-1,bi,bj)
+     &    +aS2d(i  ,j+1,bi,bj)*cg2d_x(i  ,j+1,bi,bj)
+     &    +aC2d(i  ,j  ,bi,bj)*cg2d_x(i  ,j  ,bi,bj)
      &    )
-          err_sq         = err_sq         + 
-     &     cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
-          sumRHS            = sumRHS            +
-     &     cg2d_b(I,J,bi,bj)
+          errTile(bi,bj)    = errTile(bi,bj)
+     &                      + cg2d_r(i,j,bi,bj)*cg2d_r(i,j,bi,bj)
+          sumRHStile(bi,bj) = sumRHStile(bi,bj) + cg2d_b(i,j,bi,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
 
-#ifdef LETS_MAKE_JAM
-      CALL EXCH_XY_O1_R8_JAM( cg2d_r )
-      CALL EXCH_XY_O1_R8_JAM( cg2d_s )
-#else
-      _EXCH_XY_R8( cg2d_r, myThid )
-      _EXCH_XY_R8( cg2d_s, myThid )
-#endif
-       _GLOBAL_SUM_R8( sumRHS, myThid )
-       _GLOBAL_SUM_R8( err_sq, myThid )
-       if ( err_sq .ne. 0. ) then
-          err = SQRT(err_sq)
-       else
-          err = 0.
-       end if
-       actualIts      = 0
-       actualResidual = err
-      
-      _BEGIN_MASTER( myThid )
-      write(standardMessageUnit,'(A,1P2E22.14)')
-     &     ' cg2d: Sum(rhs),rhsMax = ', sumRHS,rhsMaxGlobal 
-      _END_MASTER( )
-C     _BARRIER
-c     _BEGIN_MASTER( myThid )
-c      WRITE(*,'(A,I6,1PE30.14)') ' CG2D_NSA iters, err = ', 
-c    & actualIts, actualResidual
-c     _END_MASTER( )
-      firstResidual=actualResidual
-      cg2dTolerance_sq = cg2dTolerance**2
+c     CALL EXCH_S3D_RL( cg2d_r, 1, myThid )
+      CALL EXCH_XY_RL ( cg2d_r, myThid )
+      CALL GLOBAL_SUM_TILE_RL( errTile,    err_sq, myThid )
+      CALL GLOBAL_SUM_TILE_RL( sumRHStile, sumRHS, myThid )
+      actualIts = 0
+      IF ( err_sq .NE. 0. ) THEN
+        firstResidual = SQRT(err_sq)
+      ELSE
+        firstResidual = 0.
+      ENDIF
+
+      printResidual = .FALSE.
+      IF ( debugLevel .GE. debLevZero ) THEN
+        _BEGIN_MASTER( myThid )
+        printResidual = printResidualFreq.GE.1
+        WRITE(standardmessageunit,'(A,1P2E22.14)')
+     &  ' cg2d: Sum(rhs),rhsMax = ', sumRHS,rhsMax
+        _END_MASTER( myThid )
+      ENDIF
 
 C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 Cml   begin main solver loop
@@ -284,76 +233,58 @@
       DO it2d=1, numIters
 #ifdef ALLOW_LOOP_DIRECTIVE
 CML      it2d = 0
-CML      DO WHILE ( err_sq .GT. cg2dTolerance_sq .and. it2d .LT. numIters ) 
+CML      DO WHILE ( err_sq .GT. cg2dTolerance_sq .and. it2d .LT. numIters )
 CML      it2d = it2d+1
 #endif /* ALLOW_LOOP_DIRECTIVE */
 
 #ifdef ALLOW_AUTODIFF_TAMC
        icg2dkey = (ikey-1)*numItersMax + it2d
-CMLCADJ STORE err = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
-CADJ STORE err_sq = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
+CADJ STORE err_sq = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
 #endif /* ALLOW_AUTODIFF_TAMC */
-CML      IF ( err .LT. cg2dTolerance ) THEN
-      IF ( err_sq .LT. cg2dTolerance_sq ) THEN
-Cml      DO NOTHING
-      ELSE
+      IF ( err_sq .GE. cg2dTolerance_sq ) THEN
 
-CcnhDebugStarts
-C      WRITE(*,*) ' CG2D_NSA: Iteration ',it2d-1,' residual = ',
-C    &  actualResidual
-CcnhDebugEnds
 C--    Solve preconditioning equation and update
 C--    conjugate direction vector "s".
-       eta_qrN = 0. _d 0
 #ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE cg2d_r = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
-CADJ STORE cg2d_s = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
+CADJ STORE cg2d_r = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
+CADJ STORE cg2d_s = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
 #endif /* ALLOW_AUTODIFF_TAMC */
        DO bj=myByLo(myThid),myByHi(myThid)
         DO bi=myBxLo(myThid),myBxHi(myThid)
-         DO J=1,sNy
-          DO I=1,sNx
-           cg2d_z(I,J,bi,bj) = 
-     &      pC(I  ,J  ,bi,bj)*cg2d_r(I  ,J  ,bi,bj)
-     &     +pW(I  ,J  ,bi,bj)*cg2d_r(I-1,J  ,bi,bj)
-     &     +pW(I+1,J  ,bi,bj)*cg2d_r(I+1,J  ,bi,bj)
-     &     +pS(I  ,J  ,bi,bj)*cg2d_r(I  ,J-1,bi,bj)
-     &     +pS(I  ,J+1,bi,bj)*cg2d_r(I  ,J+1,bi,bj)
-CcnhDebugStarts
-C          cg2d_z(I,J,bi,bj) = cg2d_r(I  ,J  ,bi,bj)
-CcnhDebugEnds
-           eta_qrN = eta_qrN
-     &     +cg2d_z(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
+         eta_qrNtile(bi,bj) = 0. _d 0
+         DO j=1,sNy
+          DO i=1,sNx
+           cg2d_z(i,j,bi,bj) =
+     &      pC(i  ,j  ,bi,bj)*cg2d_r(i  ,j  ,bi,bj)
+     &     +pW(i  ,j  ,bi,bj)*cg2d_r(i-1,j  ,bi,bj)
+     &     +pW(i+1,j  ,bi,bj)*cg2d_r(i+1,j  ,bi,bj)
+     &     +pS(i  ,j  ,bi,bj)*cg2d_r(i  ,j-1,bi,bj)
+     &     +pS(i  ,j+1,bi,bj)*cg2d_r(i  ,j+1,bi,bj)
+           eta_qrNtile(bi,bj) = eta_qrNtile(bi,bj)
+     &     +cg2d_z(i,j,bi,bj)*cg2d_r(i,j,bi,bj)
           ENDDO
          ENDDO
         ENDDO
        ENDDO
 
-       _GLOBAL_SUM_R8(eta_qrN, myThid)
-CcnhDebugStarts
-C      WRITE(*,*) ' CG2D_NSA: Iteration ',it2d-1,' eta_qrN = ',eta_qrN
-CcnhDebugEnds
+       CALL GLOBAL_SUM_TILE_RL( eta_qrNtile,eta_qrN,myThid )
 #ifdef ALLOW_AUTODIFF_TAMC
-CMLCADJ STORE eta_qrNM1 = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
-CADJ STORE recip_eta_qrNM1 = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
+CMLCADJ STORE eta_qrNM1 = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
+CADJ STORE recip_eta_qrNM1 = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
 #endif /* ALLOW_AUTODIFF_TAMC */
 CML       cgBeta   = eta_qrN/eta_qrNM1
        cgBeta   = eta_qrN*recip_eta_qrNM1
-CcnhDebugStarts
-C      WRITE(*,*) ' CG2D_NSA: Iteration ',it2d-1,' beta = ',cgBeta
-CcnhDebugEnds
-Cml    store normalisation factor for the next interation 
-Cml    (in case there is one).
+Cml    store normalisation factor for the next interation (in case there is one).
 CML    store the inverse of the normalization factor for higher precision
 CML       eta_qrNM1 = eta_qrN
-       recip_eta_qrNM1 = 1./eta_qrN
+       recip_eta_qrNM1 = 1. _d 0/eta_qrN
 
        DO bj=myByLo(myThid),myByHi(myThid)
         DO bi=myBxLo(myThid),myBxHi(myThid)
-         DO J=1,sNy
-          DO I=1,sNx
-           cg2d_s(I,J,bi,bj) = cg2d_z(I,J,bi,bj)
-     &                       + cgBeta*cg2d_s(I,J,bi,bj)
+         DO j=1,sNy
+          DO i=1,sNx
+           cg2d_s(i,j,bi,bj) = cg2d_z(i,j,bi,bj)
+     &                       + cgBeta*cg2d_s(i,j,bi,bj)
           ENDDO
          ENDDO
         ENDDO
@@ -361,131 +292,107 @@
 
 C--    Do exchanges that require messages i.e. between
 C--    processes.
-#ifdef LETS_MAKE_JAM
-      CALL EXCH_XY_O1_R8_JAM( cg2d_s )
-#else
-       _EXCH_XY_R8( cg2d_s, myThid )
-#endif
+c      CALL EXCH_S3D_RL( cg2d_s, 1, myThid )
+       CALL EXCH_XY_RL ( cg2d_s, myThid )
 
 C==    Evaluate laplace operator on conjugate gradient vector
 C==    q = A.s
-       alpha     = 0. _d 0
-       alpha_aux = 0. _d 0
 #ifdef ALLOW_AUTODIFF_TAMC
 #ifndef ALLOW_LOOP_DIRECTIVE
-CADJ STORE cg2d_s = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
+CADJ STORE cg2d_s = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
 #endif /* not ALLOW_LOOP_DIRECTIVE */
 #endif /* ALLOW_AUTODIFF_TAMC */
        DO bj=myByLo(myThid),myByHi(myThid)
         DO bi=myBxLo(myThid),myBxHi(myThid)
-         DO J=1,sNy
-          DO I=1,sNx
-           cg2d_q(I,J,bi,bj) = 
-     &     aW2d(I  ,J  ,bi,bj)*cg2d_s(I-1,J  ,bi,bj)
-     &    +aW2d(I+1,J  ,bi,bj)*cg2d_s(I+1,J  ,bi,bj)
-     &    +aS2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J-1,bi,bj)
-     &    +aS2d(I  ,J+1,bi,bj)*cg2d_s(I  ,J+1,bi,bj)
-     &    -aW2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
-     &    -aW2d(I+1,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
-     &    -aS2d(I  ,J  ,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
-     &    -aS2d(I  ,J+1,bi,bj)*cg2d_s(I  ,J  ,bi,bj)
-     &    -freeSurfFac*_rA(i,j,bi,bj)*recip_Bo(i,j,bi,bj)* 
-     &     cg2d_s(I  ,J  ,bi,bj)/deltaTMom/deltaTfreesurf*cg2dNorm
-CML     &     cg2d_s(I  ,J  ,bi,bj)/deltaTMom/deltaTMom*cg2dNorm
-          alpha_aux = alpha_aux+cg2d_s(I,J,bi,bj)*cg2d_q(I,J,bi,bj)
+         alphaTile(bi,bj) = 0. _d 0
+         DO j=1,sNy
+          DO i=1,sNx
+           cg2d_q(i,j,bi,bj) =
+     &     aW2d(i  ,j  ,bi,bj)*cg2d_s(i-1,j  ,bi,bj)
+     &    +aW2d(i+1,j  ,bi,bj)*cg2d_s(i+1,j  ,bi,bj)
+     &    +aS2d(i  ,j  ,bi,bj)*cg2d_s(i  ,j-1,bi,bj)
+     &    +aS2d(i  ,j+1,bi,bj)*cg2d_s(i  ,j+1,bi,bj)
+     &    +aC2d(i  ,j  ,bi,bj)*cg2d_s(i  ,j  ,bi,bj)
+           alphaTile(bi,bj) = alphaTile(bi,bj)
+     &                      + cg2d_s(i,j,bi,bj)*cg2d_q(i,j,bi,bj)
           ENDDO
          ENDDO
         ENDDO
        ENDDO
-       _GLOBAL_SUM_R8(alpha_aux,myThid)
-CcnhDebugStarts
-C      WRITE(*,*) ' CG2D_NSA: Iteration ',it2d-1,' SUM(s*q)= ',alpha_aux
-CcnhDebugEnds
-       alpha = eta_qrN/alpha_aux
-CcnhDebugStarts
-C      WRITE(*,*) ' CG2D_NSA: Iteration ',it2d-1,' alpha= ',alpha
-CcnhDebugEnds
-     
-C==    Update solution and residual vectors
+       CALL GLOBAL_SUM_TILE_RL( alphaTile, alphaSum, myThid )
+       alpha = eta_qrN/alphaSum
+
+C==    Update simultaneously solution and residual vectors (and Iter number)
 C      Now compute "interior" points.
-       err    = 0. _d 0
-       err_sq = 0. _d 0
 #ifdef ALLOW_AUTODIFF_TAMC
 #ifndef ALLOW_LOOP_DIRECTIVE
-CADJ STORE cg2d_r = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte 
+CADJ STORE cg2d_r = comlev1_cg2d_iter, key = icg2dkey, byte = isbyte
 #endif /* ALLOW_LOOP_DIRECTIVE */
 #endif /* ALLOW_AUTODIFF_TAMC */
        DO bj=myByLo(myThid),myByHi(myThid)
         DO bi=myBxLo(myThid),myBxHi(myThid)
-         DO J=1,sNy
-          DO I=1,sNx
-           cg2d_x(I,J,bi,bj)=cg2d_x(I,J,bi,bj)+alpha*cg2d_s(I,J,bi,bj)
-           cg2d_r(I,J,bi,bj)=cg2d_r(I,J,bi,bj)-alpha*cg2d_q(I,J,bi,bj)
-           err_sq = err_sq+cg2d_r(I,J,bi,bj)*cg2d_r(I,J,bi,bj)
+         errTile(bi,bj) = 0. _d 0
+         DO j=1,sNy
+          DO i=1,sNx
+           cg2d_x(i,j,bi,bj)=cg2d_x(i,j,bi,bj)+alpha*cg2d_s(i,j,bi,bj)
+           cg2d_r(i,j,bi,bj)=cg2d_r(i,j,bi,bj)-alpha*cg2d_q(i,j,bi,bj)
+           errTile(bi,bj) = errTile(bi,bj)
+     &                    + cg2d_r(i,j,bi,bj)*cg2d_r(i,j,bi,bj)
           ENDDO
          ENDDO
         ENDDO
        ENDDO
+       actualIts = it2d
 
-       _GLOBAL_SUM_R8( err_sq   , myThid )
-       if ( err_sq .ne. 0. ) then
-          err = SQRT(err_sq)
-       else
-          err = 0.
-       end if
-       actualIts      = it2d
-       actualResidual = err
+       CALL GLOBAL_SUM_TILE_RL( errTile,    err_sq,    myThid )
+       IF ( printResidual ) THEN
+        IF ( MOD( it2d-1, printResidualFreq ).EQ.0 ) THEN
+         WRITE(msgBuf,'(A,I6,A,1PE21.14)')
+     &    ' cg2d: iter=', it2d, ' ; resid.= ', SQRT(err_sq)
+         CALL PRINT_MESSAGE( msgBuf, standardMessageUnit,
+     &                       SQUEEZE_RIGHT, myThid )
+        ENDIF
+       ENDIF
 
-#ifdef LETS_MAKE_JAM
-      CALL EXCH_XY_O1_R8_JAM( cg2d_r )
-#else
-      _EXCH_XY_R8( cg2d_r, myThid )
-      _EXCH_XY_R8( cg2d_x, myThid )
-#endif
+c      CALL EXCH_S3D_RL( cg2d_r, 1, myThid )
+       CALL EXCH_XY_RL ( cg2d_r, myThid )
 
-Cml   end of IF ( err .LT. cg2dTolerance ) THEN; ELSE
+Cml   end of if "err >= cg2dTolerance" block ; end main solver loop
       ENDIF
-Cml     end main solver loop
       ENDDO
 
+#ifndef ALLOW_AUTODIFF_TAMC
       IF (cg2dNormaliseRHS) THEN
-#ifdef ALLOW_AUTODIFF_TAMC
-CADJ STORE rhsNorm = comlev1_cg2d, key = ikey, byte = isbyte 
-CADJ STORE cg2d_x = comlev1_cg2d, key = ikey, byte = isbyte 
-#endif /* ALLOW_AUTODIFF_TAMC */
 C--   Un-normalise the answer
         DO bj=myByLo(myThid),myByHi(myThid)
          DO bi=myBxLo(myThid),myBxHi(myThid)
-          DO J=1,sNy
-           DO I=1,sNx
-            cg2d_x(I  ,J  ,bi,bj) = cg2d_x(I  ,J  ,bi,bj)/rhsNorm
+          DO j=1,sNy
+           DO i=1,sNx
+            cg2d_x(i,j,bi,bj) = cg2d_x(i,j,bi,bj)/rhsNorm
            ENDDO
           ENDDO
          ENDDO
         ENDDO
       ENDIF
-
-C     The following exchange was moved up to solve_for_pressure
-C     for compatibility with TAMC.
-C     _EXCH_XY_R8(cg2d_x, myThid )
-c     _BEGIN_MASTER( myThid )
-c      WRITE(*,'(A,I6,1PE30.14)') ' CG2D_NSA iters, err = ', 
-c    & actualIts, actualResidual
-c     _END_MASTER( )
+#endif /* ndef ALLOW_AUTODIFF_TAMC */
 
 C--   Return parameters to caller
-      lastResidual=actualResidual
-      numIters=actualIts
+      IF ( err_sq .NE. 0. ) THEN
+        lastResidual = SQRT(err_sq)
+      ELSE
+        lastResidual = 0.
+      ENDIF
+      numIters = actualIts
 
 #endif /* ALLOW_CG2D_NSA */
       RETURN
       END
 
-# if ((defined ALLOW_AUTODIFF_TAMC) && (defined ALLOW_LOOP_DIRECTIVE))
-C
+#if ((defined ALLOW_AUTODIFF_TAMC) && (defined ALLOW_LOOP_DIRECTIVE))
+
 C     These routines are routinely part of the TAMC/TAF library that is
 C     not included in the MITcgm, therefore they are mimicked here.
-C
+
       subroutine adstore(chardum,int1,idow,int2,int3,icount)
 
       implicit none
@@ -496,7 +403,7 @@
       character*(*) chardum
       integer int1, int2, int3, idow, icount
 
-C     the length of this vector must be greater or equal 
+C     the length of this vector must be greater or equal
 C     twice the number of timesteps
       integer nidow
 #ifdef ALLOW_TAMC_CHECKPOINTING
@@ -507,7 +414,7 @@
       integer istoreidow(nidow)
       common /istorecommon/ istoreidow
 
-      print *, 'adstore: ', chardum, int1, idow, int2, int3, icount 
+      print *, 'adstore: ', chardum, int1, idow, int2, int3, icount
 
       if ( icount .gt. nidow ) then
          print *, 'adstore: error: icount > nidow = ', nidow
@@ -529,8 +436,7 @@
       character*(*) chardum
       integer int1, int2, int3, idow, icount
 
-
-C     the length of this vector must be greater or equal 
+C     the length of this vector must be greater or equal
 C     twice the number of timesteps
       integer nidow
 #ifdef ALLOW_TAMC_CHECKPOINTING
@@ -541,7 +447,7 @@
       integer istoreidow(nidow)
       common /istorecommon/ istoreidow
 
-      print *, 'adresto: ', chardum, int1, idow, int2, int3, icount 
+      print *, 'adresto: ', chardum, int1, idow, int2, int3, icount
 
       if ( icount .gt. nidow ) then
          print *, 'adstore: error: icount > nidow = ', nidow
@@ -552,6 +458,4 @@
 
       return
       end
-# endif /* ALLOW_AUTODIFF_TAMC and ALLOW_LOOP_DIRECTIVE */
-
-
+#endif /* ALLOW_AUTODIFF_TAMC and ALLOW_LOOP_DIRECTIVE */