MITgcm_contrib/cg2d_bench/cg2d.F

C       $Id: cg2d.F,v 1.2 2006/05/12 22:21:21 ce107 Exp $       
      SUBROUTINE CG2D
C     /==========================================================\
C     | SUBROUTINE CG2D                                          |
C     | o Two-dimensional grid problem conjugate-gradient        |
C     |   inverter (with preconditioner).                        |
C     |==========================================================|
C     | Con. grad is an iterative procedure for solving Ax = b.  |
C     | It requires the A be symmetric.                          |
C     | This implementation assumes A is a five-diagonal         |
C     | matrix of the form that arises in the discrete           |
C     | representation of the del^2 operator in a                |
C     | two-dimensional space.                                   |
C     | Notes:                                                   |
C     | ======                                                   |
C     | This implementation can support shared-memory            | 
C     | multi-threaded execution. In order to do this COMMON     | 
C     | blocks are used for many of the arrays - even ones that  | 
C     | are only used for intermedaite results. This design is   | 
C     | OK if you want to all the threads to collaborate on      | 
C     | solving the same problem. On the other hand if you want  | 
C     | the threads to solve several different problems          | 
C     | concurrently this implementation will not work.          |
C     \==========================================================/
      IMPLICIT NONE

C     === Global data ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "PARAMS.h"
#include "CG2D.h"
#if defined(USE_PAPI_FLOPS) || defined(USE_PAPI_FLIPS)
#if defined(PAPI_PER_ITERATION) || defined(PAPI_PER_TIMESTEP)
#include "PAPI.h"
#endif
#endif

C     === Routine arguments ===
C     myThid - Thread on which I am working.
      INTEGER myThid

C     === Local variables ====
C     actualIts      - Number of iterations taken
C     actualResidual - residual
C     bi          - Block index in X and Y.
C     bj
C     etaN        - Used in computing search directions
C     etaNM1        suffix N and NM1 denote current and
C     beta          previous iterations respectively.
C     alpha  
C     sumRHS      - Sum of right-hand-side. Sometimes this is a
C                   useful debuggin/trouble shooting diagnostic.
C                   For neumann problems sumRHS needs to be ~0.
C                   or they converge at a non-zero residual.
C     err         - Measure of residual of Ax - b, usually the norm.
C     I, J, N     - Loop counters ( N counts CG iterations )
      INTEGER actualIts
      INTEGER bi, bj              
      INTEGER I, J, N
#ifdef USE_MIXED_PRECISION
      REAL*8    actualResidual
      REAL*8    err
      REAL*8    errSum
      REAL*8    etaN
      REAL*8    etaNM1
      REAL*8    etaNSum
      REAL*8    beta
      REAL*8    alpha
      REAL*8    alphaSum
      REAL*8    sumRHS
      REAL*8    temp
#else
      Real    actualResidual
      Real    err
      Real    errSum
      Real    etaN
      Real    etaNM1
      Real    etaNSum
      Real    beta
      Real    alpha
      Real    alphaSum
      Real    sumRHS
      Real    temp
#endif

C--   Initialise inverter
      etaNM1              = 1. _d 0

C--   Initial residual calculation
      err    = 0. _d 0
      sumRHS = 0. _d 0
      DO J=1,sNy
       DO I=1,sNx
        cg2d_s(I,J) = 0. _d 0
         cg2d_r(I,J) = cg2d_b(I,J) -
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
          err    = err    + cg2d_r(I,J)*cg2d_r(I,J)
          sumRHS = sumRHS + cg2d_b(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8( cg2d_r    )
      CALL EXCH_XY_R8( cg2d_s    )
      CALL GSUM_R8( temp, err    )
      err    = temp
      CALL GSUM_R8( temp, sumRHS )
      sumRHS = temp

      actualIts      = 0
      actualResidual = SQRT(err)
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
      IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'

C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
      DO 10 N=1, cg2dMaxIters
C--    Solve preconditioning equation and update
C--    conjugate direction vector "s".
       etaN = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &    pW(I  ,J  )*cg2d_r(I-1,J  )+pW(I+1,J  )*cg2d_r(I+1,J  )
     &   +pS(I  ,J  )*cg2d_r(I  ,J-1)+pS(I  ,J+1)*cg2d_r(I  ,J+1)
     &   +pC(I  ,J  )*cg2d_r(I  ,J  )
         etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, etaN )
       etaN   = temp
       beta   = etaN/etaNM1
       etaNM1 = etaN

       DO J=1,sNy
        DO I=1,sNx
         cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
        ENDDO
       ENDDO

C--    Do exchanges that require messages i.e. between
C--    processes.
       CALL EXCH_XY_R8( cg2d_s )

C--    Ten extra exchanges
#ifdef TEN_EXTRA_EXCHS
       DO J=1,10
          CALL EXCH_XY_R8( cg2d_s )
       ENDDO
#endif

C==    Evaluate laplace operator on conjugate gradient vector
C==    q = A.s
       alpha = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_q(I,J) = 
     &   aW2d(I  ,J  )*cg2d_s(I-1,J  )+aW2d(I+1,J  )*cg2d_s(I+1,J  )
     &  +aS2d(I  ,J  )*cg2d_s(I  ,J-1)+aS2d(I  ,J+1)*cg2d_s(I  ,J+1)
     &  -aW2d(I  ,J  )*cg2d_s(I  ,J  )-aW2d(I+1,J  )*cg2d_s(I  ,J  )
     &  -aS2d(I  ,J  )*cg2d_s(I  ,J  )-aS2d(I  ,J+1)*cg2d_s(I  ,J  )
        alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
        ENDDO
       ENDDO
       CALL GSUM_R8( temp, alpha )

#ifdef HUNDRED_EXTRA_SUMS
C--    Hundred extra global sums
       DO J=1,100
          CALL GSUM_R8( temp, alpha )
       ENDDO
#endif

       alpha = temp
       alpha = etaN/alpha
     
C==    Update solution and residual vectors
C      Now compute "interior" points.
       err = 0. _d 0
       DO J=1,sNy
        DO I=1,sNx
         cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
         cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
         err = err+cg2d_r(I,J)*cg2d_r(I,J)
        ENDDO
       ENDDO

       CALL GSUM_R8( temp, err )

       err = temp
       err = SQRT(err)
       actualIts      = N
       actualResidual = err
#ifdef RESIDUAL_PER_ITERATION
       WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
#endif
       IF ( err .LT. cg2dTargetResidual ) GOTO 11
       CALL EXCH_XY_R8(cg2d_r )
#ifdef PAPI_PER_ITERATION
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
   10 CONTINUE
   11 CONTINUE
      CALL EXCH_XY_R8(cg2d_x )
#ifdef PAPI_PER_TIMESTEP
#ifdef USE_PAPI_FLOPS
       call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflop/s during iteration ', N 
#else
#ifdef USE_PAPI_FLIPS
       call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
       WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
     $      mflops, ' user ', mflops*proc_time/real_time,
     $      ' wallclock Mflip/s during iteration ', N 
#endif
#endif
#endif
      WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual

C     Calc Ax to check result
      DO J=1,sNy
       DO I=1,sNx
         cg2d_Ax(I,J) =
     &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  )
     &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1)
     &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  )
     &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  )
     &   )
         cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
       ENDDO
      ENDDO
      CALL EXCH_XY_R8(cg2d_Ax )
      CALL EXCH_XY_R8(cg2d_r  )

      END
1	C $Id: cg2d.F,v 1.2 2006/05/12 22:21:21 ce107 Exp $
2	SUBROUTINE CG2D
3	C /==========================================================\
4	C \| SUBROUTINE CG2D \|
5	C \| o Two-dimensional grid problem conjugate-gradient \|
6	C \| inverter (with preconditioner). \|
7	C \|==========================================================\|
8	C \| Con. grad is an iterative procedure for solving Ax = b. \|
9	C \| It requires the A be symmetric. \|
10	C \| This implementation assumes A is a five-diagonal \|
11	C \| matrix of the form that arises in the discrete \|
12	C \| representation of the del^2 operator in a \|
13	C \| two-dimensional space. \|
14	C \| Notes: \|
15	C \| ====== \|
16	C \| This implementation can support shared-memory \|
17	C \| multi-threaded execution. In order to do this COMMON \|
18	C \| blocks are used for many of the arrays - even ones that \|
19	C \| are only used for intermedaite results. This design is \|
20	C \| OK if you want to all the threads to collaborate on \|
21	C \| solving the same problem. On the other hand if you want \|
22	C \| the threads to solve several different problems \|
23	C \| concurrently this implementation will not work. \|
24	C \==========================================================/
25	IMPLICIT NONE
26
27	C === Global data ===
28	#include "SIZE.h"
29	#include "EEPARAMS.h"
30	#include "PARAMS.h"
31	#include "CG2D.h"
32	#if defined(USE_PAPI_FLOPS) \|\| defined(USE_PAPI_FLIPS)
33	#if defined(PAPI_PER_ITERATION) \|\| defined(PAPI_PER_TIMESTEP)
34	#include "PAPI.h"
35	#endif
36	#endif
37
38	C === Routine arguments ===
39	C myThid - Thread on which I am working.
40	INTEGER myThid
41
42	C === Local variables ====
43	C actualIts - Number of iterations taken
44	C actualResidual - residual
45	C bi - Block index in X and Y.
46	C bj
47	C etaN - Used in computing search directions
48	C etaNM1 suffix N and NM1 denote current and
49	C beta previous iterations respectively.
50	C alpha
51	C sumRHS - Sum of right-hand-side. Sometimes this is a
52	C useful debuggin/trouble shooting diagnostic.
53	C For neumann problems sumRHS needs to be ~0.
54	C or they converge at a non-zero residual.
55	C err - Measure of residual of Ax - b, usually the norm.
56	C I, J, N - Loop counters ( N counts CG iterations )
57	INTEGER actualIts
58	INTEGER bi, bj
59	INTEGER I, J, N
60	#ifdef USE_MIXED_PRECISION
61	REAL*8 actualResidual
62	REAL*8 err
63	REAL*8 errSum
64	REAL*8 etaN
65	REAL*8 etaNM1
66	REAL*8 etaNSum
67	REAL*8 beta
68	REAL*8 alpha
69	REAL*8 alphaSum
70	REAL*8 sumRHS
71	REAL*8 temp
72	#else
73	Real actualResidual
74	Real err
75	Real errSum
76	Real etaN
77	Real etaNM1
78	Real etaNSum
79	Real beta
80	Real alpha
81	Real alphaSum
82	Real sumRHS
83	Real temp
84	#endif
85
86	C-- Initialise inverter
87	etaNM1 = 1. _d 0
88
89	C-- Initial residual calculation
90	err = 0. _d 0
91	sumRHS = 0. _d 0
92	DO J=1,sNy
93	DO I=1,sNx
94	cg2d_s(I,J) = 0. _d 0
95	cg2d_r(I,J) = cg2d_b(I,J) -
96	& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
97	& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
98	& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
99	& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
100	& )
101	err = err + cg2d_r(I,J)*cg2d_r(I,J)
102	sumRHS = sumRHS + cg2d_b(I,J)
103	ENDDO
104	ENDDO
105	CALL EXCH_XY_R8( cg2d_r )
106	CALL EXCH_XY_R8( cg2d_s )
107	CALL GSUM_R8( temp, err )
108	err = temp
109	CALL GSUM_R8( temp, sumRHS )
110	sumRHS = temp
111
112	actualIts = 0
113	actualResidual = SQRT(err)
114	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
115	IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0'
116
117	C >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
118	DO 10 N=1, cg2dMaxIters
119	C-- Solve preconditioning equation and update
120	C-- conjugate direction vector "s".
121	etaN = 0. _d 0
122	DO J=1,sNy
123	DO I=1,sNx
124	cg2d_q(I,J) =
125	& pW(I ,J )cg2d_r(I-1,J )+pW(I+1,J )cg2d_r(I+1,J )
126	& +pS(I ,J )cg2d_r(I ,J-1)+pS(I ,J+1)cg2d_r(I ,J+1)
127	& +pC(I ,J )*cg2d_r(I ,J )
128	etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J)
129	ENDDO
130	ENDDO
131
132	CALL GSUM_R8( temp, etaN )
133	etaN = temp
134	beta = etaN/etaNM1
135	etaNM1 = etaN
136
137	DO J=1,sNy
138	DO I=1,sNx
139	cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J)
140	ENDDO
141	ENDDO
142
143	C-- Do exchanges that require messages i.e. between
144	C-- processes.
145	CALL EXCH_XY_R8( cg2d_s )
146
147	C-- Ten extra exchanges
148	#ifdef TEN_EXTRA_EXCHS
149	DO J=1,10
150	CALL EXCH_XY_R8( cg2d_s )
151	ENDDO
152	#endif
153
154	C== Evaluate laplace operator on conjugate gradient vector
155	C== q = A.s
156	alpha = 0. _d 0
157	DO J=1,sNy
158	DO I=1,sNx
159	cg2d_q(I,J) =
160	& aW2d(I ,J )cg2d_s(I-1,J )+aW2d(I+1,J )cg2d_s(I+1,J )
161	& +aS2d(I ,J )cg2d_s(I ,J-1)+aS2d(I ,J+1)cg2d_s(I ,J+1)
162	& -aW2d(I ,J )cg2d_s(I ,J )-aW2d(I+1,J )cg2d_s(I ,J )
163	& -aS2d(I ,J )cg2d_s(I ,J )-aS2d(I ,J+1)cg2d_s(I ,J )
164	alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J)
165	ENDDO
166	ENDDO
167	CALL GSUM_R8( temp, alpha )
168
169	#ifdef HUNDRED_EXTRA_SUMS
170	C-- Hundred extra global sums
171	DO J=1,100
172	CALL GSUM_R8( temp, alpha )
173	ENDDO
174	#endif
175
176	alpha = temp
177	alpha = etaN/alpha
178
179	C== Update solution and residual vectors
180	C Now compute "interior" points.
181	err = 0. _d 0
182	DO J=1,sNy
183	DO I=1,sNx
184	cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J)
185	cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J)
186	err = err+cg2d_r(I,J)*cg2d_r(I,J)
187	ENDDO
188	ENDDO
189
190	CALL GSUM_R8( temp, err )
191
192	err = temp
193	err = SQRT(err)
194	actualIts = N
195	actualResidual = err
196	#ifdef RESIDUAL_PER_ITERATION
197	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
198	#endif
199	IF ( err .LT. cg2dTargetResidual ) GOTO 11
200	CALL EXCH_XY_R8(cg2d_r )
201	#ifdef PAPI_PER_ITERATION
202	#ifdef USE_PAPI_FLOPS
203	call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
204	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
205	$ mflops, ' user ', mflops*proc_time/real_time,
206	$ ' wallclock Mflop/s during iteration ', N
207	#else
208	#ifdef USE_PAPI_FLIPS
209	call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
210	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
211	$ mflops, ' user ', mflops*proc_time/real_time,
212	$ ' wallclock Mflip/s during iteration ', N
213	#endif
214	#endif
215	#endif
216	10 CONTINUE
217	11 CONTINUE
218	CALL EXCH_XY_R8(cg2d_x )
219	#ifdef PAPI_PER_TIMESTEP
220	#ifdef USE_PAPI_FLOPS
221	call PAPIF_flops(real_time, proc_time, flpops, mflops, check)
222	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
223	$ mflops, ' user ', mflops*proc_time/real_time,
224	$ ' wallclock Mflop/s during iteration ', N
225	#else
226	#ifdef USE_PAPI_FLIPS
227	call PAPIF_flips(real_time, proc_time, flpops, mflops, check)
228	WRITE(6,'(F10.3,A7,F10.3,A37,I8)')
229	$ mflops, ' user ', mflops*proc_time/real_time,
230	$ ' wallclock Mflip/s during iteration ', N
231	#endif
232	#endif
233	#endif
234	WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual
235
236	C Calc Ax to check result
237	DO J=1,sNy
238	DO I=1,sNx
239	cg2d_Ax(I,J) =
240	& ( aW2d(I ,J )cg2d_x(I-1,J )+aW2d(I+1,J )cg2d_x(I+1,J )
241	& +aS2d(I ,J )cg2d_x(I ,J-1)+aS2d(I ,J+1)cg2d_x(I ,J+1)
242	& -aW2d(I ,J )cg2d_x(I ,J )-aW2d(I+1,J )cg2d_x(I ,J )
243	& -aS2d(I ,J )cg2d_x(I ,J )-aS2d(I ,J+1)cg2d_x(I ,J )
244	& )
245	cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J)
246	ENDDO
247	ENDDO
248	CALL EXCH_XY_R8(cg2d_Ax )
249	CALL EXCH_XY_R8(cg2d_r )
250
251	END