MITgcm_contrib/cg2d_bench/cg2d.F

C       $Id$    
      SUBROUTINE CG2D
C     /==========================================================\
C     | SUBROUTINE CG2D                                          |
C     | o Two-dimensional grid problem conjugate-gradient        |
C     |   inverter (with preconditioner).                        |
C     |==========================================================|
C     | Con. grad is an iterative procedure for solving Ax = b.  |
C     | It requires the A be symmetric.                          |
C     | This implementation assumes A is a five-diagonal         |
C     | matrix of the form that arises in the discrete           |
C     | representation of the del^2 operator in a                |
C     | two-dimensional space.                                   |
C     | Notes:                                                   |
C     | ======                                                   |
C     | This implementation can support shared-memory            | 
C     | multi-threaded execution. In order to do this COMMON     | 
C     | blocks are used for many of the arrays - even ones that  | 
C     | are only used for intermedaite results. This design is   | 
C     | OK if you want to all the threads to collaborate on      | 
C     | solving the same problem. On the other hand if you want  | 
C     | the threads to solve several different problems          | 
C     | concurrently this implementation will not work.          |
C     \==========================================================/
      IMPLICIT NONE

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "PARAMS.h"
#include "CG2D.h"
#if defined(USE_PAPI_FLOPS) || defined(USE_PAPI_FLIPS)
#if defined(PAPI_PER_ITERATION) || defined(PAPI_PER_TIMESTEP)
#include "PAPI.h"
#endif
#endif

C     === Routine arguments ===
C     myThid - Thread on which I am working.
      INTEGER myThid

C     === Local variables ====
C     actualIts      - Number of iterations taken
C     actualResidual - residual
C     bi          - Block index in X and Y.
C     bj
C     etaN        - Used in computing search directions
C     etaNM1        suffix N and NM1 denote current and
C     beta          previous iterations respectively.
C     alpha  
C     sumRHS      - Sum of right-hand-side. Sometimes this is a
C                   useful debuggin/trouble shooting diagnostic.
C                   For neumann problems sumRHS needs to be ~0.
C                   or they converge at a non-zero residual.
C     err         - Measure of residual of Ax - b, usually the norm.
C     I, J, N     - Loop counters ( N counts CG iterations )
      INTEGER actualIts
      INTEGER bi, bj              
      INTEGER I, J, N
#ifdef USE_MIXED_PRECISION
      REAL*8    actualResidual
      REAL*8    err
      REAL*8    errSum
      REAL*8    etaN
      REAL*8    etaNM1
      REAL*8    etaNSum
      REAL*8    beta
      REAL*8    alpha
      REAL*8    alphaSum
      REAL*8    sumRHS
      REAL*8    temp
#else
      Real    actualResidual
      Real    err
      Real    errSum
      Real    etaN
      Real    etaNM1
      Real    etaNSum
      Real    beta
      Real    alpha
      Real    alphaSum
      Real    sumRHS
      Real    temp
#endif

C--   Initialise inverter
      etaNM1              = 1. _d 0

C--   Initial residual calculation
      err    = 0. _d 0
      sumRHS = 0. _d 0
      DO J=1,sNy
       DO I=1,sNx
        cg2d_s(I,J) = 0. _d 0
         cg2d_r(I,J) = cg2d_b(I,J) -
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
          err    = err    + cg2d_r(I,J)*cg2d_r(I,J)
          sumRHS = sumRHS + cg2d_b(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8( cg2d_r    )
      CALL EXCH_XY_R8( cg2d_s    )
      CALL GSUM_R8( temp, err    )
      err    = temp
      CALL GSUM_R8( temp, sumRHS )
      sumRHS = temp

      actualIts      = 0
      actualResidual = SQRT(err)
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
      IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'

C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
      DO 10 N=1, cg2dMaxIters
C--    Solve preconditioning equation and update
C--    conjugate direction vector "s".
       etaN = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &    pW(I  ,J  )*cg2d_r(I-1,J  )+pW(I+1,J  )*cg2d_r(I+1,J  )
     &   +pS(I  ,J  )*cg2d_r(I  ,J-1)+pS(I  ,J+1)*cg2d_r(I  ,J+1)
     &   +pC(I  ,J  )*cg2d_r(I  ,J  )
         etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, etaN )
       etaN   = temp
       beta   = etaN/etaNM1
       etaNM1 = etaN

       DO J=1,sNy
        DO I=1,sNx
         cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
        ENDDO
       ENDDO

C--    Do exchanges that require messages i.e. between
C--    processes.
       CALL EXCH_XY_R8( cg2d_s )

C--    Ten extra exchanges
#ifdef TEN_EXTRA_EXCHS
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
       CALL EXCH_XY_R8( cg2d_s )
#endif

C==    Evaluate laplace operator on conjugate gradient vector
C==    q = A.s
       alpha = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &   aW2d(I  ,J  )*cg2d_s(I-1,J  )+aW2d(I+1,J  )*cg2d_s(I+1,J  )
     &  +aS2d(I  ,J  )*cg2d_s(I  ,J-1)+aS2d(I  ,J+1)*cg2d_s(I  ,J+1)
     &  -aW2d(I  ,J  )*cg2d_s(I  ,J  )-aW2d(I+1,J  )*cg2d_s(I  ,J  )
     &  -aS2d(I  ,J  )*cg2d_s(I  ,J  )-aS2d(I  ,J+1)*cg2d_s(I  ,J  )
        alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
        ENDDO
       ENDDO
       CALL GSUM_R8( temp, alpha )

#ifdef HUNDRED_EXTRA_SUMS
C--    Hundred extra global sums
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
       CALL GSUM_R8( temp, alpha )
#endif

       alpha = temp
       alpha = etaN/alpha
     
C==    Update solution and residual vectors
C      Now compute "interior" points.
       err = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
         cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
         err = err+cg2d_r(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, err )

       err = temp
       err = SQRT(err)
       actualIts      = N
       actualResidual = err
#ifdef RESIDUAL_PER_ITERATION
       WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
#endif
       IF ( err .LT. cg2dTargetResidual ) GOTO 11
       CALL EXCH_XY_R8(cg2d_r )
#ifdef PAPI_PER_ITERATION
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
   10 CONTINUE
   11 CONTINUE
      CALL EXCH_XY_R8(cg2d_x )
#ifdef PAPI_PER_TIMESTEP
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N 
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual

C     Calc Ax to check result
      DO J=1,sNy
       DO I=1,sNx
         cg2d_Ax(I,J) =
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
         cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8(cg2d_Ax )
      CALL EXCH_XY_R8(cg2d_r  )

      END
1	C $Id$
2	SUBROUTINE CG2D
3	C /==========================================================\
4	C \| SUBROUTINE CG2D \|
5	C \| o Two-dimensional grid problem conjugate-gradient \|
6	C \| inverter (with preconditioner). \|
7	C \|==========================================================\|
8	C \| Con. grad is an iterative procedure for solving Ax = b. \|
9	C \| It requires the A be symmetric. \|
10	C \| This implementation assumes A is a five-diagonal \|
11	C \| matrix of the form that arises in the discrete \|
12	C \| representation of the del^2 operator in a \|
13	C \| two-dimensional space. \|
14	C \| Notes: \|
15	C \| ====== \|
16	C \| This implementation can support shared-memory \|
17	C \| multi-threaded execution. In order to do this COMMON \|
18	C \| blocks are used for many of the arrays - even ones that \|
19	C \| are only used for intermedaite results. This design is \|
20	C \| OK if you want to all the threads to collaborate on \|
21	C \| solving the same problem. On the other hand if you want \|
22	C \| the threads to solve several different problems \|
23	C \| concurrently this implementation will not work. \|
24	C \==========================================================/
25	IMPLICIT NONE
26
27	C === Global data ===
28	#include "SIZE.h"
29	#include "EEPARAMS.h"
30	#include "PARAMS.h"
31	#include "CG2D.h"
32	#if defined(USE_PAPI_FLOPS) \|\| defined(USE_PAPI_FLIPS)
33	#if defined(PAPI_PER_ITERATION) \|\| defined(PAPI_PER_TIMESTEP)
34	#include "PAPI.h"
35	#endif
36	#endif
37
38	C === Routine arguments ===
39	C myThid - Thread on which I am working.
40	INTEGER myThid
41
42	C === Local variables ====
43	C actualIts - Number of iterations taken
44	C actualResidual - residual
45	C bi - Block index in X and Y.
46	C bj
47	C etaN - Used in computing search directions
48	C etaNM1 suffix N and NM1 denote current and
49	C beta previous iterations respectively.
50	C alpha
51	C sumRHS - Sum of right-hand-side. Sometimes this is a
52	C useful debuggin/trouble shooting diagnostic.
53	C For neumann problems sumRHS needs to be ~0.
54	C or they converge at a non-zero residual.
55	C err - Measure of residual of Ax - b, usually the norm.
56	C I, J, N - Loop counters ( N counts CG iterations )
57	INTEGER actualIts
58	INTEGER bi, bj
59	INTEGER I, J, N
60	#ifdef USE_MIXED_PRECISION
61	REAL*8 actualResidual
62	REAL*8 err
63	REAL*8 errSum
64	REAL*8 etaN
65	REAL*8 etaNM1
66	REAL*8 etaNSum
67	REAL*8 beta
68	REAL*8 alpha
69	REAL*8 alphaSum
70	REAL*8 sumRHS
71	REAL*8 temp
72	#else
73	Real actualResidual
74	Real err
75	Real errSum
76	Real etaN
77	Real etaNM1
78	Real etaNSum
79	Real beta
80	Real alpha
81	Real alphaSum
82	Real sumRHS
83	Real temp
84	#endif
85
86	C-- Initialise inverter
87	etaNM1 = 1. _d 0
88
89	C-- Initial residual calculation
90	err = 0. _d 0
91	sumRHS = 0. _d 0
92	DO J=1,sNy
93	DO I=1,sNx
94	cg2d_s(I,J) = 0. _d 0
95	cg2d_r(I,J) = cg2d_b(I,J) -
96	& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
97	& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
98	& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
99	& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
100	& )
101	err = err + cg2d_r(I,J)*cg2d_r(I,J)
102	sumRHS = sumRHS + cg2d_b(I,J)
103	ENDDO
104	ENDDO
105	CALL EXCH_XY_R8( cg2d_r )
106	CALL EXCH_XY_R8( cg2d_s )
107	CALL GSUM_R8( temp, err )
108	err = temp
109	CALL GSUM_R8( temp, sumRHS )
110	sumRHS = temp
111
112	actualIts = 0
113	actualResidual = SQRT(err)
114	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
115	IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'
116
117	C >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
118	DO 10 N=1, cg2dMaxIters
119	C-- Solve preconditioning equation and update
120	C-- conjugate direction vector "s".
121	etaN = 0. _d 0
122	DO J=1,sNy
123	DO I=1,sNx
124	cg2d_q(I,J) =
125	& pW(I ,J )cg2d_r(I-1,J )+pW(I+1,J )cg2d_r(I+1,J )
126	& +pS(I ,J )cg2d_r(I ,J-1)+pS(I ,J+1)cg2d_r(I ,J+1)
127	& +pC(I ,J )*cg2d_r(I ,J )
128	etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
129	ENDDO
130	ENDDO
131
132	CALL GSUM_R8( temp, etaN )
133	etaN = temp
134	beta = etaN/etaNM1
135	etaNM1 = etaN
136
137	DO J=1,sNy
138	DO I=1,sNx
139	cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
140	ENDDO
141	ENDDO
142
143	C-- Do exchanges that require messages i.e. between
144	C-- processes.
145	CALL EXCH_XY_R8( cg2d_s )
146
147	C-- Ten extra exchanges
148	#ifdef TEN_EXTRA_EXCHS
149	CALL EXCH_XY_R8( cg2d_s )
150	CALL EXCH_XY_R8( cg2d_s )
151	CALL EXCH_XY_R8( cg2d_s )
152	CALL EXCH_XY_R8( cg2d_s )
153	CALL EXCH_XY_R8( cg2d_s )
154	CALL EXCH_XY_R8( cg2d_s )
155	CALL EXCH_XY_R8( cg2d_s )
156	CALL EXCH_XY_R8( cg2d_s )
157	CALL EXCH_XY_R8( cg2d_s )
158	CALL EXCH_XY_R8( cg2d_s )
159	#endif
160
161	C== Evaluate laplace operator on conjugate gradient vector
162	C== q = A.s
163	alpha = 0. _d 0
164	DO J=1,sNy
165	DO I=1,sNx
166	cg2d_q(I,J) =
167	& aW2d(I ,J )cg2d_s(I-1,J )+aW2d(I+1,J )cg2d_s(I+1,J )
168	& +aS2d(I ,J )cg2d_s(I ,J-1)+aS2d(I ,J+1)cg2d_s(I ,J+1)
169	& -aW2d(I ,J )cg2d_s(I ,J )-aW2d(I+1,J )cg2d_s(I ,J )
170	& -aS2d(I ,J )cg2d_s(I ,J )-aS2d(I ,J+1)cg2d_s(I ,J )
171	alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
172	ENDDO
173	ENDDO
174	CALL GSUM_R8( temp, alpha )
175
176	#ifdef HUNDRED_EXTRA_SUMS
177	C-- Hundred extra global sums
178	CALL GSUM_R8( temp, alpha )
179	CALL GSUM_R8( temp, alpha )
180	CALL GSUM_R8( temp, alpha )
181	CALL GSUM_R8( temp, alpha )
182	CALL GSUM_R8( temp, alpha )
183	CALL GSUM_R8( temp, alpha )
184	CALL GSUM_R8( temp, alpha )
185	CALL GSUM_R8( temp, alpha )
186	CALL GSUM_R8( temp, alpha )
187	CALL GSUM_R8( temp, alpha )
188	CALL GSUM_R8( temp, alpha )
189	CALL GSUM_R8( temp, alpha )
190	CALL GSUM_R8( temp, alpha )
191	CALL GSUM_R8( temp, alpha )
192	CALL GSUM_R8( temp, alpha )
193	CALL GSUM_R8( temp, alpha )
194	CALL GSUM_R8( temp, alpha )
195	CALL GSUM_R8( temp, alpha )
196	CALL GSUM_R8( temp, alpha )
197	CALL GSUM_R8( temp, alpha )
198	CALL GSUM_R8( temp, alpha )
199	CALL GSUM_R8( temp, alpha )
200	CALL GSUM_R8( temp, alpha )
201	CALL GSUM_R8( temp, alpha )
202	CALL GSUM_R8( temp, alpha )
203	CALL GSUM_R8( temp, alpha )
204	CALL GSUM_R8( temp, alpha )
205	CALL GSUM_R8( temp, alpha )
206	CALL GSUM_R8( temp, alpha )
207	CALL GSUM_R8( temp, alpha )
208	CALL GSUM_R8( temp, alpha )
209	CALL GSUM_R8( temp, alpha )
210	CALL GSUM_R8( temp, alpha )
211	CALL GSUM_R8( temp, alpha )
212	CALL GSUM_R8( temp, alpha )
213	CALL GSUM_R8( temp, alpha )
214	CALL GSUM_R8( temp, alpha )
215	CALL GSUM_R8( temp, alpha )
216	CALL GSUM_R8( temp, alpha )
217	CALL GSUM_R8( temp, alpha )
218	CALL GSUM_R8( temp, alpha )
219	CALL GSUM_R8( temp, alpha )
220	CALL GSUM_R8( temp, alpha )
221	CALL GSUM_R8( temp, alpha )
222	CALL GSUM_R8( temp, alpha )
223	CALL GSUM_R8( temp, alpha )
224	CALL GSUM_R8( temp, alpha )
225	CALL GSUM_R8( temp, alpha )
226	CALL GSUM_R8( temp, alpha )
227	CALL GSUM_R8( temp, alpha )
228	CALL GSUM_R8( temp, alpha )
229	CALL GSUM_R8( temp, alpha )
230	CALL GSUM_R8( temp, alpha )
231	CALL GSUM_R8( temp, alpha )
232	CALL GSUM_R8( temp, alpha )
233	CALL GSUM_R8( temp, alpha )
234	CALL GSUM_R8( temp, alpha )
235	CALL GSUM_R8( temp, alpha )
236	CALL GSUM_R8( temp, alpha )
237	CALL GSUM_R8( temp, alpha )
238	CALL GSUM_R8( temp, alpha )
239	CALL GSUM_R8( temp, alpha )
240	CALL GSUM_R8( temp, alpha )
241	CALL GSUM_R8( temp, alpha )
242	CALL GSUM_R8( temp, alpha )
243	CALL GSUM_R8( temp, alpha )
244	CALL GSUM_R8( temp, alpha )
245	CALL GSUM_R8( temp, alpha )
246	CALL GSUM_R8( temp, alpha )
247	CALL GSUM_R8( temp, alpha )
248	CALL GSUM_R8( temp, alpha )
249	CALL GSUM_R8( temp, alpha )
250	CALL GSUM_R8( temp, alpha )
251	CALL GSUM_R8( temp, alpha )
252	CALL GSUM_R8( temp, alpha )
253	CALL GSUM_R8( temp, alpha )
254	CALL GSUM_R8( temp, alpha )
255	CALL GSUM_R8( temp, alpha )
256	CALL GSUM_R8( temp, alpha )
257	CALL GSUM_R8( temp, alpha )
258	CALL GSUM_R8( temp, alpha )
259	CALL GSUM_R8( temp, alpha )
260	CALL GSUM_R8( temp, alpha )
261	CALL GSUM_R8( temp, alpha )
262	CALL GSUM_R8( temp, alpha )
263	CALL GSUM_R8( temp, alpha )
264	CALL GSUM_R8( temp, alpha )
265	CALL GSUM_R8( temp, alpha )
266	CALL GSUM_R8( temp, alpha )
267	CALL GSUM_R8( temp, alpha )
268	CALL GSUM_R8( temp, alpha )
269	CALL GSUM_R8( temp, alpha )
270	CALL GSUM_R8( temp, alpha )
271	CALL GSUM_R8( temp, alpha )
272	CALL GSUM_R8( temp, alpha )
273	CALL GSUM_R8( temp, alpha )
274	CALL GSUM_R8( temp, alpha )
275	CALL GSUM_R8( temp, alpha )
276	CALL GSUM_R8( temp, alpha )
277	CALL GSUM_R8( temp, alpha )
278	#endif
279
280	alpha = temp
281	alpha = etaN/alpha
282
283	C== Update solution and residual vectors
284	C Now compute "interior" points.
285	err = 0. _d 0
286	DO J=1,sNy
287	DO I=1,sNx
288	cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
289	cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
290	err = err+cg2d_r(I,J)*cg2d_r(I,J)
291	ENDDO
292	ENDDO
293
294	CALL GSUM_R8( temp, err )
295
296	err = temp
297	err = SQRT(err)
298	actualIts = N
299	actualResidual = err
300	#ifdef RESIDUAL_PER_ITERATION
301	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
302	#endif
303	IF ( err .LT. cg2dTargetResidual ) GOTO 11
304	CALL EXCH_XY_R8(cg2d_r )
305	#ifdef PAPI_PER_ITERATION
306	#ifdef USE_PAPI_FLOPS
307	call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
308	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
309	$ mflops, ' user ', mflops*proc_time/real_time,
310	$ ' wallclock Mflop/s during iteration ', N
311	#else
312	#ifdef USE_PAPI_FLIPS
313	call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
314	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
315	$ mflops, ' user ', mflops*proc_time/real_time,
316	$ ' wallclock Mflip/s during iteration ', N
317	#endif
318	#endif
319	#endif
320	10 CONTINUE
321	11 CONTINUE
322	CALL EXCH_XY_R8(cg2d_x )
323	#ifdef PAPI_PER_TIMESTEP
324	#ifdef USE_PAPI_FLOPS
325	call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
326	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
327	$ mflops, ' user ', mflops*proc_time/real_time,
328	$ ' wallclock Mflop/s during iteration ', N
329	#else
330	#ifdef USE_PAPI_FLIPS
331	call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
332	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
333	$ mflops, ' user ', mflops*proc_time/real_time,
334	$ ' wallclock Mflip/s during iteration ', N
335	#endif
336	#endif
337	#endif
338	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
339
340	C Calc Ax to check result
341	DO J=1,sNy
342	DO I=1,sNx
343	cg2d_Ax(I,J) =
344	& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
345	& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
346	& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
347	& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
348	& )
349	cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
350	ENDDO
351	ENDDO
352	CALL EXCH_XY_R8(cg2d_Ax )
353	CALL EXCH_XY_R8(cg2d_r )
354
355	END