MITgcm_contrib/cg2d_bench/cg2d.F

C       $Id$    
      SUBROUTINE CG2D
C     /==========================================================\
C     | SUBROUTINE CG2D                                          |
C     | o Two-dimensional grid problem conjugate-gradient        |
C     |   inverter (with preconditioner).                        |
C     |==========================================================|
C     | Con. grad is an iterative procedure for solving Ax = b.  |
C     | It requires that A be symmetric.                         |
C     | This implementation assumes A is a five-diagonal         |
C     | matrix of the form that arises in the discrete           |
C     | representation of the del^2 operator in a                |
C     | two-dimensional space.                                   |
C     | Notes:                                                   |
C     | ======                                                   |
C     | This implementation can support shared-memory            |
C     | multi-threaded execution. In order to do this COMMON     |
C     | blocks are used for many of the arrays - even ones that  |
C     | are only used for intermediate results. This design is   |
C     | OK if you want all the threads to collaborate on         |
C     | solving the same problem. On the other hand if you want  |
C     | the threads to solve several different problems          |
C     | concurrently this implementation will not work.          |
C     \==========================================================/
C
C     Data flow (all arrays and parameters below are not declared
C     in this routine; presumably they come from the included
C     headers - confirm against SIZE.h / PARAMS.h / CG2D.h):
C      read    : cg2d_b (right-hand side), aW2d, aS2d (operator
C                coefficients), pW, pS, pC (preconditioner
C                coefficients), cg2dMaxIters, cg2dTargetResidual,
C                sNx, sNy (loop extents).
C      updated : cg2d_x (used as the initial guess on entry and
C                overwritten with the iterated solution).
C      written : cg2d_r (residual), cg2d_s (search direction),
C                cg2d_q (work array), cg2d_Ax (A applied to the
C                final cg2d_x).
C     External routines called:
C      EXCH_XY_R8( array ) - exchange; presumably refreshes the
C                edge points (I-1, I+1, J-1, J+1) that the
C                five-point stencils read - confirm.
C      GSUM_R8( result, local ) - global sum; the result is taken
C                from the first argument after each call.
C     Progress is reported with list-directed WRITEs to unit 6.
C     The routine STOPs if the initial residual is exactly zero.
      IMPLICIT NONE

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "PARAMS.h"
#include "CG2D.h"
#if defined(USE_PAPI_FLOPS) || defined(USE_PAPI_FLIPS)
#if defined(PAPI_PER_ITERATION) || defined(PAPI_PER_TIMESTEP)
#include "PAPI.h"
#endif
#endif

C     === Thread identifier ===
C     myThid - Thread on which I am working.
C              NOTE: CG2D has no dummy arguments, so this is only
C              a local variable and it is not referenced below.
      INTEGER myThid

C     === Local variables ====
C     actualIts      - Number of iterations taken
C     actualResidual - Residual norm after actualIts iterations
C     etaN        - Used in computing search directions
C     etaNM1        suffix N and NM1 denote current and
C     beta          previous iterations respectively.
C     alpha       - Step length along the search direction
C     sumRHS      - Sum of right-hand-side. Sometimes this is a
C                   useful debugging/trouble shooting diagnostic.
C                   For neumann problems sumRHS needs to be ~0.
C                   or they converge at a non-zero residual.
C                   It is computed but not used further here.
C     err         - Measure of residual of Ax - b, usually the norm.
C     temp        - Receives the result of each GSUM_R8 call
C     I, J, N     - Loop counters ( N counts CG iterations )
C     iRep        - Counter for the optional repeated calls that
C                   are enabled by TEN_EXTRA_EXCHS and
C                   HUNDRED_EXTRA_SUMS
      INTEGER actualIts
      INTEGER I, J, N
#if defined(TEN_EXTRA_EXCHS) || defined(HUNDRED_EXTRA_SUMS)
      INTEGER iRep
#endif
#ifdef USE_MIXED_PRECISION
      REAL*8    actualResidual
      REAL*8    err
      REAL*8    etaN
      REAL*8    etaNM1
      REAL*8    beta
      REAL*8    alpha
      REAL*8    sumRHS
      REAL*8    temp
#else
      Real    actualResidual
      Real    err
      Real    etaN
      Real    etaNM1
      Real    beta
      Real    alpha
      Real    sumRHS
      Real    temp
#endif

C--   Initialise inverter
C     etaNM1 starts at one so that the first beta is well defined.
C     Because cg2d_s is zeroed below, the first search direction
C     is simply the preconditioned residual whatever beta is.
      etaNM1              = 1. _d 0

C--   Initial residual calculation: r = b - A.x for the initial
C--   guess held in cg2d_x, plus local sums of r*r and of b.
      err    = 0. _d 0
      sumRHS = 0. _d 0
      DO J=1,sNy
       DO I=1,sNx
        cg2d_s(I,J) = 0. _d 0
         cg2d_r(I,J) = cg2d_b(I,J) -
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
          err    = err    + cg2d_r(I,J)*cg2d_r(I,J)
          sumRHS = sumRHS + cg2d_b(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8( cg2d_r    )
      CALL EXCH_XY_R8( cg2d_s    )
      CALL GSUM_R8( temp, err    )
      err    = temp
      CALL GSUM_R8( temp, sumRHS )
      sumRHS = temp

      actualIts      = 0
      actualResidual = SQRT(err)
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
C     A residual of exactly zero is treated as an abnormal case.
      IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'

C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
      DO 10 N=1, cg2dMaxIters
C--    Solve preconditioning equation and update
C--    conjugate direction vector "s".
C--    q holds the preconditioned residual; etaN accumulates q.r
       etaN = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &    pW(I  ,J  )*cg2d_r(I-1,J  )+pW(I+1,J  )*cg2d_r(I+1,J  )
     &   +pS(I  ,J  )*cg2d_r(I  ,J-1)+pS(I  ,J+1)*cg2d_r(I  ,J+1)
     &   +pC(I  ,J  )*cg2d_r(I  ,J  )
         etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, etaN )
       etaN   = temp
       beta   = etaN/etaNM1
       etaNM1 = etaN

C--    New search direction: s = q + beta*s
       DO J=1,sNy
        DO I=1,sNx
         cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
        ENDDO
       ENDDO

C--    Do exchanges that require messages i.e. between
C--    processes.
       CALL EXCH_XY_R8( cg2d_s )

#ifdef TEN_EXTRA_EXCHS
C--    Ten extra exchanges of the same array. They repeat the
C--    call above and do not alter the algorithm.
       DO iRep=1,10
        CALL EXCH_XY_R8( cg2d_s )
       ENDDO
#endif

C==    Evaluate laplace operator on conjugate gradient vector
C==    q = A.s   ( alpha first accumulates the local s.q )
       alpha = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &   aW2d(I  ,J  )*cg2d_s(I-1,J  )+aW2d(I+1,J  )*cg2d_s(I+1,J  )
     &  +aS2d(I  ,J  )*cg2d_s(I  ,J-1)+aS2d(I  ,J+1)*cg2d_s(I  ,J+1)
     &  -aW2d(I  ,J  )*cg2d_s(I  ,J  )-aW2d(I+1,J  )*cg2d_s(I  ,J  )
     &  -aS2d(I  ,J  )*cg2d_s(I  ,J  )-aS2d(I  ,J+1)*cg2d_s(I  ,J  )
        alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
        ENDDO
       ENDDO
       CALL GSUM_R8( temp, alpha )

#ifdef HUNDRED_EXTRA_SUMS
C--    Hundred extra global sums. Each call repeats the one above
C--    with the same arguments.
       DO iRep=1,100
        CALL GSUM_R8( temp, alpha )
       ENDDO
#endif

C--    Step length: alpha = (q.r) / (s.A.s)
       alpha = temp
       alpha = etaN/alpha
     
C==    Update solution and residual vectors
C      Now compute "interior" points.
       err = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
         cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
         err = err+cg2d_r(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, err )

       err = temp
       err = SQRT(err)
       actualIts      = N
       actualResidual = err
#ifdef RESIDUAL_PER_ITERATION
       WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
#endif
C--    Converged: leave the loop. The exchange of cg2d_r is only
C--    needed when another iteration follows.
       IF ( err .LT. cg2dTargetResidual ) GOTO 11
       CALL EXCH_XY_R8(cg2d_r )
#ifdef PAPI_PER_ITERATION
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
   10 CONTINUE
   11 CONTINUE
      CALL EXCH_XY_R8(cg2d_x )
#ifdef PAPI_PER_TIMESTEP
C     Report actualIts rather than the DO index N. When the loop
C     runs to completion without converging, N has already been
C     stepped to cg2dMaxIters+1, which is one more than the number
C     of iterations actually performed.
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', actualIts
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', actualIts
#endif
#endif
#endif
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual

C     Calc Ax to check result. cg2d_r is overwritten with b - A.x
C     evaluated from the final cg2d_x; actualResidual is not
C     recomputed from it.
      DO J=1,sNy
       DO I=1,sNx
         cg2d_Ax(I,J) =
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
         cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8(cg2d_Ax )
      CALL EXCH_XY_R8(cg2d_r  )

      END
1	ce107	1.2	C $Id$
2	ce107	1.1	SUBROUTINE CG2D
3			C /==========================================================\
4			C \| SUBROUTINE CG2D \|
5			C \| o Two-dimensional grid problem conjugate-gradient \|
6			C \| inverter (with preconditioner). \|
7			C \|==========================================================\|
8			C \| Con. grad is an iterative procedure for solving Ax = b. \|
9			C \| It requires the A be symmetric. \|
10			C \| This implementation assumes A is a five-diagonal \|
11			C \| matrix of the form that arises in the discrete \|
12			C \| representation of the del^2 operator in a \|
13			C \| two-dimensional space. \|
14			C \| Notes: \|
15			C \| ====== \|
16			C \| This implementation can support shared-memory \|
17			C \| multi-threaded execution. In order to do this COMMON \|
18			C \| blocks are used for many of the arrays - even ones that \|
19			C \| are only used for intermedaite results. This design is \|
20			C \| OK if you want to all the threads to collaborate on \|
21			C \| solving the same problem. On the other hand if you want \|
22			C \| the threads to solve several different problems \|
23			C \| concurrently this implementation will not work. \|
24			C \==========================================================/
25			IMPLICIT NONE
26
27			C === Global data ===
28			#include "SIZE.h"
29			#include "EEPARAMS.h"
30			#include "PARAMS.h"
31			#include "CG2D.h"
32	ce107	1.2	#if defined(USE_PAPI_FLOPS) \|\| defined(USE_PAPI_FLIPS)
33			#if defined(PAPI_PER_ITERATION) \|\| defined(PAPI_PER_TIMESTEP)
34			#include "PAPI.h"
35			#endif
36			#endif
37	ce107	1.1
38			C === Routine arguments ===
39			C myThid - Thread on which I am working.
40			INTEGER myThid
41
42			C === Local variables ====
43			C actualIts - Number of iterations taken
44			C actualResidual - residual
45			C bi - Block index in X and Y.
46			C bj
47			C etaN - Used in computing search directions
48			C etaNM1 suffix N and NM1 denote current and
49			C beta previous iterations respectively.
50			C alpha
51			C sumRHS - Sum of right-hand-side. Sometimes this is a
52			C useful debuggin/trouble shooting diagnostic.
53			C For neumann problems sumRHS needs to be ~0.
54			C or they converge at a non-zero residual.
55			C err - Measure of residual of Ax - b, usually the norm.
56			C I, J, N - Loop counters ( N counts CG iterations )
57			INTEGER actualIts
58			INTEGER bi, bj
59			INTEGER I, J, N
60	ce107	1.2	#ifdef USE_MIXED_PRECISION
61			REAL*8 actualResidual
62			REAL*8 err
63			REAL*8 errSum
64			REAL*8 etaN
65			REAL*8 etaNM1
66			REAL*8 etaNSum
67			REAL*8 beta
68			REAL*8 alpha
69			REAL*8 alphaSum
70			REAL*8 sumRHS
71			REAL*8 temp
72			#else
73			Real actualResidual
74			Real err
75			Real errSum
76			Real etaN
77			Real etaNM1
78			Real etaNSum
79			Real beta
80			Real alpha
81			Real alphaSum
82			Real sumRHS
83			Real temp
84			#endif
85	ce107	1.1
86			C-- Initialise inverter
87	ce107	1.2	etaNM1 = 1. _d 0
88	ce107	1.1
89			C-- Initial residual calculation
90			err = 0. _d 0
91			sumRHS = 0. _d 0
92			DO J=1,sNy
93			DO I=1,sNx
94	ce107	1.2	cg2d_s(I,J) = 0. _d 0
95	ce107	1.1	cg2d_r(I,J) = cg2d_b(I,J) -
96			& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
97			& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
98			& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
99			& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
100			& )
101			err = err + cg2d_r(I,J)*cg2d_r(I,J)
102			sumRHS = sumRHS + cg2d_b(I,J)
103			ENDDO
104			ENDDO
105			CALL EXCH_XY_R8( cg2d_r )
106			CALL EXCH_XY_R8( cg2d_s )
107			CALL GSUM_R8( temp, err )
108			err = temp
109			CALL GSUM_R8( temp, sumRHS )
110			sumRHS = temp
111
112			actualIts = 0
113			actualResidual = SQRT(err)
114			WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
115	ce107	1.2	IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'
116	ce107	1.1
117			C >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
118			DO 10 N=1, cg2dMaxIters
119			C-- Solve preconditioning equation and update
120			C-- conjugate direction vector "s".
121			etaN = 0. _d 0
122			DO J=1,sNy
123			DO I=1,sNx
124			cg2d_q(I,J) =
125			& pW(I ,J )cg2d_r(I-1,J )+pW(I+1,J )cg2d_r(I+1,J )
126			& +pS(I ,J )cg2d_r(I ,J-1)+pS(I ,J+1)cg2d_r(I ,J+1)
127			& +pC(I ,J )*cg2d_r(I ,J )
128			etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
129			ENDDO
130			ENDDO
131
132			CALL GSUM_R8( temp, etaN )
133			etaN = temp
134			beta = etaN/etaNM1
135			etaNM1 = etaN
136
137			DO J=1,sNy
138			DO I=1,sNx
139			cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
140			ENDDO
141			ENDDO
142
143			C-- Do exchanges that require messages i.e. between
144			C-- processes.
145			CALL EXCH_XY_R8( cg2d_s )
146
147			C-- Ten extra exchanges
148			#ifdef TEN_EXTRA_EXCHS
149			CALL EXCH_XY_R8( cg2d_s )
150			CALL EXCH_XY_R8( cg2d_s )
151			CALL EXCH_XY_R8( cg2d_s )
152			CALL EXCH_XY_R8( cg2d_s )
153			CALL EXCH_XY_R8( cg2d_s )
154			CALL EXCH_XY_R8( cg2d_s )
155			CALL EXCH_XY_R8( cg2d_s )
156			CALL EXCH_XY_R8( cg2d_s )
157			CALL EXCH_XY_R8( cg2d_s )
158			CALL EXCH_XY_R8( cg2d_s )
159			#endif
160
161			C== Evaluate laplace operator on conjugate gradient vector
162			C== q = A.s
163			alpha = 0. _d 0
164			DO J=1,sNy
165			DO I=1,sNx
166			cg2d_q(I,J) =
167			& aW2d(I ,J )cg2d_s(I-1,J )+aW2d(I+1,J )cg2d_s(I+1,J )
168			& +aS2d(I ,J )cg2d_s(I ,J-1)+aS2d(I ,J+1)cg2d_s(I ,J+1)
169			& -aW2d(I ,J )cg2d_s(I ,J )-aW2d(I+1,J )cg2d_s(I ,J )
170			& -aS2d(I ,J )cg2d_s(I ,J )-aS2d(I ,J+1)cg2d_s(I ,J )
171			alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
172			ENDDO
173			ENDDO
174			CALL GSUM_R8( temp, alpha )
175
176			#ifdef HUNDRED_EXTRA_SUMS
177			C-- Hundred extra global sums
178			CALL GSUM_R8( temp, alpha )
179			CALL GSUM_R8( temp, alpha )
180			CALL GSUM_R8( temp, alpha )
181			CALL GSUM_R8( temp, alpha )
182			CALL GSUM_R8( temp, alpha )
183			CALL GSUM_R8( temp, alpha )
184			CALL GSUM_R8( temp, alpha )
185			CALL GSUM_R8( temp, alpha )
186			CALL GSUM_R8( temp, alpha )
187			CALL GSUM_R8( temp, alpha )
188			CALL GSUM_R8( temp, alpha )
189			CALL GSUM_R8( temp, alpha )
190			CALL GSUM_R8( temp, alpha )
191			CALL GSUM_R8( temp, alpha )
192			CALL GSUM_R8( temp, alpha )
193			CALL GSUM_R8( temp, alpha )
194			CALL GSUM_R8( temp, alpha )
195			CALL GSUM_R8( temp, alpha )
196			CALL GSUM_R8( temp, alpha )
197			CALL GSUM_R8( temp, alpha )
198			CALL GSUM_R8( temp, alpha )
199			CALL GSUM_R8( temp, alpha )
200			CALL GSUM_R8( temp, alpha )
201			CALL GSUM_R8( temp, alpha )
202			CALL GSUM_R8( temp, alpha )
203			CALL GSUM_R8( temp, alpha )
204			CALL GSUM_R8( temp, alpha )
205			CALL GSUM_R8( temp, alpha )
206			CALL GSUM_R8( temp, alpha )
207			CALL GSUM_R8( temp, alpha )
208			CALL GSUM_R8( temp, alpha )
209			CALL GSUM_R8( temp, alpha )
210			CALL GSUM_R8( temp, alpha )
211			CALL GSUM_R8( temp, alpha )
212			CALL GSUM_R8( temp, alpha )
213			CALL GSUM_R8( temp, alpha )
214			CALL GSUM_R8( temp, alpha )
215			CALL GSUM_R8( temp, alpha )
216			CALL GSUM_R8( temp, alpha )
217			CALL GSUM_R8( temp, alpha )
218			CALL GSUM_R8( temp, alpha )
219			CALL GSUM_R8( temp, alpha )
220			CALL GSUM_R8( temp, alpha )
221			CALL GSUM_R8( temp, alpha )
222			CALL GSUM_R8( temp, alpha )
223			CALL GSUM_R8( temp, alpha )
224			CALL GSUM_R8( temp, alpha )
225			CALL GSUM_R8( temp, alpha )
226			CALL GSUM_R8( temp, alpha )
227			CALL GSUM_R8( temp, alpha )
228			CALL GSUM_R8( temp, alpha )
229			CALL GSUM_R8( temp, alpha )
230			CALL GSUM_R8( temp, alpha )
231			CALL GSUM_R8( temp, alpha )
232			CALL GSUM_R8( temp, alpha )
233			CALL GSUM_R8( temp, alpha )
234			CALL GSUM_R8( temp, alpha )
235			CALL GSUM_R8( temp, alpha )
236			CALL GSUM_R8( temp, alpha )
237			CALL GSUM_R8( temp, alpha )
238			CALL GSUM_R8( temp, alpha )
239			CALL GSUM_R8( temp, alpha )
240			CALL GSUM_R8( temp, alpha )
241			CALL GSUM_R8( temp, alpha )
242			CALL GSUM_R8( temp, alpha )
243			CALL GSUM_R8( temp, alpha )
244			CALL GSUM_R8( temp, alpha )
245			CALL GSUM_R8( temp, alpha )
246			CALL GSUM_R8( temp, alpha )
247			CALL GSUM_R8( temp, alpha )
248			CALL GSUM_R8( temp, alpha )
249			CALL GSUM_R8( temp, alpha )
250			CALL GSUM_R8( temp, alpha )
251			CALL GSUM_R8( temp, alpha )
252			CALL GSUM_R8( temp, alpha )
253			CALL GSUM_R8( temp, alpha )
254			CALL GSUM_R8( temp, alpha )
255			CALL GSUM_R8( temp, alpha )
256			CALL GSUM_R8( temp, alpha )
257			CALL GSUM_R8( temp, alpha )
258			CALL GSUM_R8( temp, alpha )
259			CALL GSUM_R8( temp, alpha )
260			CALL GSUM_R8( temp, alpha )
261			CALL GSUM_R8( temp, alpha )
262			CALL GSUM_R8( temp, alpha )
263			CALL GSUM_R8( temp, alpha )
264			CALL GSUM_R8( temp, alpha )
265			CALL GSUM_R8( temp, alpha )
266			CALL GSUM_R8( temp, alpha )
267			CALL GSUM_R8( temp, alpha )
268			CALL GSUM_R8( temp, alpha )
269			CALL GSUM_R8( temp, alpha )
270			CALL GSUM_R8( temp, alpha )
271			CALL GSUM_R8( temp, alpha )
272			CALL GSUM_R8( temp, alpha )
273			CALL GSUM_R8( temp, alpha )
274			CALL GSUM_R8( temp, alpha )
275			CALL GSUM_R8( temp, alpha )
276			CALL GSUM_R8( temp, alpha )
277			CALL GSUM_R8( temp, alpha )
278			#endif
279
280			alpha = temp
281			alpha = etaN/alpha
282
283			C== Update solution and residual vectors
284			C Now compute "interior" points.
285			err = 0. _d 0
286			DO J=1,sNy
287			DO I=1,sNx
288			cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
289			cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
290			err = err+cg2d_r(I,J)*cg2d_r(I,J)
291			ENDDO
292			ENDDO
293
294			CALL GSUM_R8( temp, err )
295
296			err = temp
297			err = SQRT(err)
298			actualIts = N
299			actualResidual = err
300	ce107	1.2	#ifdef RESIDUAL_PER_ITERATION
301			WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
302			#endif
303	ce107	1.1	IF ( err .LT. cg2dTargetResidual ) GOTO 11
304			CALL EXCH_XY_R8(cg2d_r )
305	ce107	1.2	#ifdef PAPI_PER_ITERATION
306			#ifdef USE_PAPI_FLOPS
307			call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
308			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
309			$ mflops, ' user ', mflops*proc_time/real_time,
310			$ ' wallclock Mflop/s during iteration ', N
311			#else
312			#ifdef USE_PAPI_FLIPS
313			call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
314			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
315			$ mflops, ' user ', mflops*proc_time/real_time,
316			$ ' wallclock Mflip/s during iteration ', N
317			#endif
318			#endif
319			#endif
320	ce107	1.1	10 CONTINUE
321			11 CONTINUE
322			CALL EXCH_XY_R8(cg2d_x )
323	ce107	1.2	#ifdef PAPI_PER_TIMESTEP
324			#ifdef USE_PAPI_FLOPS
325			call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
326			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
327			$ mflops, ' user ', mflops*proc_time/real_time,
328			$ ' wallclock Mflop/s during iteration ', N
329			#else
330			#ifdef USE_PAPI_FLIPS
331			call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
332			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
333			$ mflops, ' user ', mflops*proc_time/real_time,
334			$ ' wallclock Mflip/s during iteration ', N
335			#endif
336			#endif
337			#endif
338	ce107	1.1	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
339
340			C Calc Ax to check result
341			DO J=1,sNy
342			DO I=1,sNx
343			cg2d_Ax(I,J) =
344			& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
345			& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
346			& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
347			& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
348			& )
349			cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
350			ENDDO
351			ENDDO
352			CALL EXCH_XY_R8(cg2d_Ax )
353			CALL EXCH_XY_R8(cg2d_r )
354
355			END