MITgcm_contrib/cg2d_bench/cg2d.F

C       $Id: cg2d.F,v 1.2 2006/05/12 22:21:21 ce107 Exp $       
      SUBROUTINE CG2D
C     /==========================================================\
C     | SUBROUTINE CG2D                                          |
C     | o Two-dimensional grid problem conjugate-gradient        |
C     |   inverter (with preconditioner).                        |
C     |==========================================================|
C     | Con. grad is an iterative procedure for solving Ax = b.  |
C     | It requires the A be symmetric.                          |
C     | This implementation assumes A is a five-diagonal         |
C     | matrix of the form that arises in the discrete           |
C     | representation of the del^2 operator in a                |
C     | two-dimensional space.                                   |
C     | Notes:                                                   |
C     | ======                                                   |
C     | This implementation can support shared-memory            | 
C     | multi-threaded execution. In order to do this COMMON     | 
C     | blocks are used for many of the arrays - even ones that  | 
C     | are only used for intermedaite results. This design is   | 
C     | OK if you want to all the threads to collaborate on      | 
C     | solving the same problem. On the other hand if you want  | 
C     | the threads to solve several different problems          | 
C     | concurrently this implementation will not work.          |
C     \==========================================================/
      IMPLICIT NONE

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "PARAMS.h"
#include "CG2D.h"
#if defined(USE_PAPI_FLOPS) || defined(USE_PAPI_FLIPS)
#if defined(PAPI_PER_ITERATION) || defined(PAPI_PER_TIMESTEP)
#include "PAPI.h"
#endif
#endif

C     === Routine arguments ===
C     myThid - Thread on which I am working.
      INTEGER myThid

C     === Local variables ====
C     actualIts      - Number of iterations taken
C     actualResidual - residual
C     bi          - Block index in X and Y.
C     bj
C     etaN        - Used in computing search directions
C     etaNM1        suffix N and NM1 denote current and
C     beta          previous iterations respectively.
C     alpha  
C     sumRHS      - Sum of right-hand-side. Sometimes this is a
C                   useful debuggin/trouble shooting diagnostic.
C                   For neumann problems sumRHS needs to be ~0.
C                   or they converge at a non-zero residual.
C     err         - Measure of residual of Ax - b, usually the norm.
C     I, J, N     - Loop counters ( N counts CG iterations )
      INTEGER actualIts
      INTEGER bi, bj              
      INTEGER I, J, N
#ifdef USE_MIXED_PRECISION
      REAL*8    actualResidual
      REAL*8    err
      REAL*8    errSum
      REAL*8    etaN
      REAL*8    etaNM1
      REAL*8    etaNSum
      REAL*8    beta
      REAL*8    alpha
      REAL*8    alphaSum
      REAL*8    sumRHS
      REAL*8    temp
#else
      Real    actualResidual
      Real    err
      Real    errSum
      Real    etaN
      Real    etaNM1
      Real    etaNSum
      Real    beta
      Real    alpha
      Real    alphaSum
      Real    sumRHS
      Real    temp
#endif

C--   Initialise inverter
      etaNM1              = 1. _d 0

C--   Initial residual calculation
      err    = 0. _d 0
      sumRHS = 0. _d 0
      DO J=1,sNy
       DO I=1,sNx
        cg2d_s(I,J) = 0. _d 0
         cg2d_r(I,J) = cg2d_b(I,J) -
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
          err    = err    + cg2d_r(I,J)*cg2d_r(I,J)
          sumRHS = sumRHS + cg2d_b(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8( cg2d_r    )
      CALL EXCH_XY_R8( cg2d_s    )
      CALL GSUM_R8( temp, err    )
      err    = temp
      CALL GSUM_R8( temp, sumRHS )
      sumRHS = temp

      actualIts      = 0
      actualResidual = SQRT(err)
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
      IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'

C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
      DO 10 N=1, cg2dMaxIters
C--    Solve preconditioning equation and update
C--    conjugate direction vector "s".
       etaN = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &    pW(I  ,J  )*cg2d_r(I-1,J  )+pW(I+1,J  )*cg2d_r(I+1,J  )
     &   +pS(I  ,J  )*cg2d_r(I  ,J-1)+pS(I  ,J+1)*cg2d_r(I  ,J+1)
     &   +pC(I  ,J  )*cg2d_r(I  ,J  )
         etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, etaN )
       etaN   = temp
       beta   = etaN/etaNM1
       etaNM1 = etaN

       DO J=1,sNy
        DO I=1,sNx
         cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
        ENDDO
       ENDDO

C--    Do exchanges that require messages i.e. between
C--    processes.
       CALL EXCH_XY_R8( cg2d_s )

C--    Ten extra exchanges
#ifdef TEN_EXTRA_EXCHS
       DO J=1,10
          CALL EXCH_XY_R8( cg2d_s )
       ENDDO
#endif

C==    Evaluate laplace operator on conjugate gradient vector
C==    q = A.s
       alpha = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &   aW2d(I  ,J  )*cg2d_s(I-1,J  )+aW2d(I+1,J  )*cg2d_s(I+1,J  )
     &  +aS2d(I  ,J  )*cg2d_s(I  ,J-1)+aS2d(I  ,J+1)*cg2d_s(I  ,J+1)
     &  -aW2d(I  ,J  )*cg2d_s(I  ,J  )-aW2d(I+1,J  )*cg2d_s(I  ,J  )
     &  -aS2d(I  ,J  )*cg2d_s(I  ,J  )-aS2d(I  ,J+1)*cg2d_s(I  ,J  )
        alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
        ENDDO
       ENDDO
       CALL GSUM_R8( temp, alpha )

#ifdef HUNDRED_EXTRA_SUMS
C--    Hundred extra global sums
       DO J=1,100
          CALL GSUM_R8( temp, alpha )
       ENDDO
#endif

       alpha = temp
       alpha = etaN/alpha
     
C==    Update solution and residual vectors
C      Now compute "interior" points.
       err = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
         cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
         err = err+cg2d_r(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, err )

       err = temp
       err = SQRT(err)
       actualIts      = N
       actualResidual = err
#ifdef RESIDUAL_PER_ITERATION
       WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
#endif
       IF ( err .LT. cg2dTargetResidual ) GOTO 11
       CALL EXCH_XY_R8(cg2d_r )
#ifdef PAPI_PER_ITERATION
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
   10 CONTINUE
   11 CONTINUE
      CALL EXCH_XY_R8(cg2d_x )
#ifdef PAPI_PER_TIMESTEP
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N 
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual

C     Calc Ax to check result
      DO J=1,sNy
       DO I=1,sNx
         cg2d_Ax(I,J) =
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
         cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8(cg2d_Ax )
      CALL EXCH_XY_R8(cg2d_r  )

      END
1	ce107	1.3	C $Id: cg2d.F,v 1.2 2006/05/12 22:21:21 ce107 Exp $
2	ce107	1.1	SUBROUTINE CG2D
3			C /==========================================================\
4			C \| SUBROUTINE CG2D \|
5			C \| o Two-dimensional grid problem conjugate-gradient \|
6			C \| inverter (with preconditioner). \|
7			C \|==========================================================\|
8			C \| Con. grad is an iterative procedure for solving Ax = b. \|
9			C \| It requires the A be symmetric. \|
10			C \| This implementation assumes A is a five-diagonal \|
11			C \| matrix of the form that arises in the discrete \|
12			C \| representation of the del^2 operator in a \|
13			C \| two-dimensional space. \|
14			C \| Notes: \|
15			C \| ====== \|
16			C \| This implementation can support shared-memory \|
17			C \| multi-threaded execution. In order to do this COMMON \|
18			C \| blocks are used for many of the arrays - even ones that \|
19			C \| are only used for intermedaite results. This design is \|
20			C \| OK if you want to all the threads to collaborate on \|
21			C \| solving the same problem. On the other hand if you want \|
22			C \| the threads to solve several different problems \|
23			C \| concurrently this implementation will not work. \|
24			C \==========================================================/
25			IMPLICIT NONE
26
27			C === Global data ===
28			#include "SIZE.h"
29			#include "EEPARAMS.h"
30			#include "PARAMS.h"
31			#include "CG2D.h"
32	ce107	1.2	#if defined(USE_PAPI_FLOPS) \|\| defined(USE_PAPI_FLIPS)
33			#if defined(PAPI_PER_ITERATION) \|\| defined(PAPI_PER_TIMESTEP)
34			#include "PAPI.h"
35			#endif
36			#endif
37	ce107	1.1
38			C === Routine arguments ===
39			C myThid - Thread on which I am working.
40			INTEGER myThid
41
42			C === Local variables ====
43			C actualIts - Number of iterations taken
44			C actualResidual - residual
45			C bi - Block index in X and Y.
46			C bj
47			C etaN - Used in computing search directions
48			C etaNM1 suffix N and NM1 denote current and
49			C beta previous iterations respectively.
50			C alpha
51			C sumRHS - Sum of right-hand-side. Sometimes this is a
52			C useful debuggin/trouble shooting diagnostic.
53			C For neumann problems sumRHS needs to be ~0.
54			C or they converge at a non-zero residual.
55			C err - Measure of residual of Ax - b, usually the norm.
56			C I, J, N - Loop counters ( N counts CG iterations )
57			INTEGER actualIts
58			INTEGER bi, bj
59			INTEGER I, J, N
60	ce107	1.2	#ifdef USE_MIXED_PRECISION
61			REAL*8 actualResidual
62			REAL*8 err
63			REAL*8 errSum
64			REAL*8 etaN
65			REAL*8 etaNM1
66			REAL*8 etaNSum
67			REAL*8 beta
68			REAL*8 alpha
69			REAL*8 alphaSum
70			REAL*8 sumRHS
71			REAL*8 temp
72			#else
73			Real actualResidual
74			Real err
75			Real errSum
76			Real etaN
77			Real etaNM1
78			Real etaNSum
79			Real beta
80			Real alpha
81			Real alphaSum
82			Real sumRHS
83			Real temp
84			#endif
85	ce107	1.1
86			C-- Initialise inverter
87	ce107	1.2	etaNM1 = 1. _d 0
88	ce107	1.1
89			C-- Initial residual calculation
90			err = 0. _d 0
91			sumRHS = 0. _d 0
92			DO J=1,sNy
93			DO I=1,sNx
94	ce107	1.2	cg2d_s(I,J) = 0. _d 0
95	ce107	1.1	cg2d_r(I,J) = cg2d_b(I,J) -
96			& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
97			& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
98			& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
99			& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
100			& )
101			err = err + cg2d_r(I,J)*cg2d_r(I,J)
102			sumRHS = sumRHS + cg2d_b(I,J)
103			ENDDO
104			ENDDO
105			CALL EXCH_XY_R8( cg2d_r )
106			CALL EXCH_XY_R8( cg2d_s )
107			CALL GSUM_R8( temp, err )
108			err = temp
109			CALL GSUM_R8( temp, sumRHS )
110			sumRHS = temp
111
112			actualIts = 0
113			actualResidual = SQRT(err)
114			WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
115	ce107	1.2	IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'
116	ce107	1.1
117			C >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
118			DO 10 N=1, cg2dMaxIters
119			C-- Solve preconditioning equation and update
120			C-- conjugate direction vector "s".
121			etaN = 0. _d 0
122			DO J=1,sNy
123			DO I=1,sNx
124			cg2d_q(I,J) =
125			& pW(I ,J )cg2d_r(I-1,J )+pW(I+1,J )cg2d_r(I+1,J )
126			& +pS(I ,J )cg2d_r(I ,J-1)+pS(I ,J+1)cg2d_r(I ,J+1)
127			& +pC(I ,J )*cg2d_r(I ,J )
128			etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
129			ENDDO
130			ENDDO
131
132			CALL GSUM_R8( temp, etaN )
133			etaN = temp
134			beta = etaN/etaNM1
135			etaNM1 = etaN
136
137			DO J=1,sNy
138			DO I=1,sNx
139			cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
140			ENDDO
141			ENDDO
142
143			C-- Do exchanges that require messages i.e. between
144			C-- processes.
145			CALL EXCH_XY_R8( cg2d_s )
146
147			C-- Ten extra exchanges
148			#ifdef TEN_EXTRA_EXCHS
149	ce107	1.3	DO J=1,10
150			CALL EXCH_XY_R8( cg2d_s )
151			ENDDO
152	ce107	1.1	#endif
153
154			C== Evaluate laplace operator on conjugate gradient vector
155			C== q = A.s
156			alpha = 0. _d 0
157			DO J=1,sNy
158			DO I=1,sNx
159			cg2d_q(I,J) =
160			& aW2d(I ,J )cg2d_s(I-1,J )+aW2d(I+1,J )cg2d_s(I+1,J )
161			& +aS2d(I ,J )cg2d_s(I ,J-1)+aS2d(I ,J+1)cg2d_s(I ,J+1)
162			& -aW2d(I ,J )cg2d_s(I ,J )-aW2d(I+1,J )cg2d_s(I ,J )
163			& -aS2d(I ,J )cg2d_s(I ,J )-aS2d(I ,J+1)cg2d_s(I ,J )
164			alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
165			ENDDO
166			ENDDO
167			CALL GSUM_R8( temp, alpha )
168
169			#ifdef HUNDRED_EXTRA_SUMS
170			C-- Hundred extra global sums
171	ce107	1.3	DO J=1,100
172			CALL GSUM_R8( temp, alpha )
173			ENDDO
174	ce107	1.1	#endif
175
176			alpha = temp
177			alpha = etaN/alpha
178
179			C== Update solution and residual vectors
180			C Now compute "interior" points.
181			err = 0. _d 0
182			DO J=1,sNy
183			DO I=1,sNx
184			cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
185			cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
186			err = err+cg2d_r(I,J)*cg2d_r(I,J)
187			ENDDO
188			ENDDO
189
190			CALL GSUM_R8( temp, err )
191
192			err = temp
193			err = SQRT(err)
194			actualIts = N
195			actualResidual = err
196	ce107	1.2	#ifdef RESIDUAL_PER_ITERATION
197			WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
198			#endif
199	ce107	1.1	IF ( err .LT. cg2dTargetResidual ) GOTO 11
200			CALL EXCH_XY_R8(cg2d_r )
201	ce107	1.2	#ifdef PAPI_PER_ITERATION
202			#ifdef USE_PAPI_FLOPS
203			call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
204			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
205			$ mflops, ' user ', mflops*proc_time/real_time,
206			$ ' wallclock Mflop/s during iteration ', N
207			#else
208			#ifdef USE_PAPI_FLIPS
209			call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
210			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
211			$ mflops, ' user ', mflops*proc_time/real_time,
212			$ ' wallclock Mflip/s during iteration ', N
213			#endif
214			#endif
215			#endif
216	ce107	1.1	10 CONTINUE
217			11 CONTINUE
218			CALL EXCH_XY_R8(cg2d_x )
219	ce107	1.2	#ifdef PAPI_PER_TIMESTEP
220			#ifdef USE_PAPI_FLOPS
221			call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
222			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
223			$ mflops, ' user ', mflops*proc_time/real_time,
224			$ ' wallclock Mflop/s during iteration ', N
225			#else
226			#ifdef USE_PAPI_FLIPS
227			call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
228			WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
229			$ mflops, ' user ', mflops*proc_time/real_time,
230			$ ' wallclock Mflip/s during iteration ', N
231			#endif
232			#endif
233			#endif
234	ce107	1.1	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
235
236			C Calc Ax to check result
237			DO J=1,sNy
238			DO I=1,sNx
239			cg2d_Ax(I,J) =
240			& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
241			& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
242			& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
243			& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
244			& )
245			cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
246			ENDDO
247			ENDDO
248			CALL EXCH_XY_R8(cg2d_Ax )
249			CALL EXCH_XY_R8(cg2d_r )
250
251			END