| 1 | C       $Id: cg2d.F,v 1.2 2006/05/12 22:21:21 ce107 Exp $ | 
| 2 | SUBROUTINE CG2D | 
| 3 | C     /==========================================================\ | 
| 4 | C     | SUBROUTINE CG2D                                          | | 
| 5 | C     | o Two-dimensional grid problem conjugate-gradient        | | 
| 6 | C     |   inverter (with preconditioner).                        | | 
| 7 | C     |==========================================================| | 
| 8 | C     | Con. grad is an iterative procedure for solving Ax = b.  | | 
| 9 | C     | It requires the A be symmetric.                          | | 
| 10 | C     | This implementation assumes A is a five-diagonal         | | 
| 11 | C     | matrix of the form that arises in the discrete           | | 
| 12 | C     | representation of the del^2 operator in a                | | 
| 13 | C     | two-dimensional space.                                   | | 
| 14 | C     | Notes:                                                   | | 
| 15 | C     | ======                                                   | | 
| 16 | C     | This implementation can support shared-memory            | | 
| 17 | C     | multi-threaded execution. In order to do this COMMON     | | 
| 18 | C     | blocks are used for many of the arrays - even ones that  | | 
| 19 | C     | are only used for intermedaite results. This design is   | | 
| 20 | C     | OK if you want to all the threads to collaborate on      | | 
| 21 | C     | solving the same problem. On the other hand if you want  | | 
| 22 | C     | the threads to solve several different problems          | | 
| 23 | C     | concurrently this implementation will not work.          | | 
| 24 | C     \==========================================================/ | 
| 25 | IMPLICIT NONE | 
| 26 |  | 
| 27 | C     === Global data === | 
| 28 | #include "SIZE.h" | 
| 29 | #include "EEPARAMS.h" | 
| 30 | #include "PARAMS.h" | 
| 31 | #include "CG2D.h" | 
| 32 | #if defined(USE_PAPI_FLOPS) || defined(USE_PAPI_FLIPS) | 
| 33 | #if defined(PAPI_PER_ITERATION) || defined(PAPI_PER_TIMESTEP) | 
| 34 | #include "PAPI.h" | 
| 35 | #endif | 
| 36 | #endif | 
| 37 |  | 
| 38 | C     === Routine arguments === | 
| 39 | C     myThid - Thread on which I am working. | 
| 40 | INTEGER myThid | 
| 41 |  | 
| 42 | C     === Local variables ==== | 
| 43 | C     actualIts      - Number of iterations taken | 
| 44 | C     actualResidual - residual | 
| 45 | C     bi          - Block index in X and Y. | 
| 46 | C     bj | 
| 47 | C     etaN        - Used in computing search directions | 
| 48 | C     etaNM1        suffix N and NM1 denote current and | 
| 49 | C     beta          previous iterations respectively. | 
| 50 | C     alpha | 
| 51 | C     sumRHS      - Sum of right-hand-side. Sometimes this is a | 
| 52 | C                   useful debuggin/trouble shooting diagnostic. | 
| 53 | C                   For neumann problems sumRHS needs to be ~0. | 
| 54 | C                   or they converge at a non-zero residual. | 
| 55 | C     err         - Measure of residual of Ax - b, usually the norm. | 
| 56 | C     I, J, N     - Loop counters ( N counts CG iterations ) | 
| 57 | INTEGER actualIts | 
| 58 | INTEGER bi, bj | 
| 59 | INTEGER I, J, N | 
| 60 | #ifdef USE_MIXED_PRECISION | 
| 61 | REAL*8    actualResidual | 
| 62 | REAL*8    err | 
| 63 | REAL*8    errSum | 
| 64 | REAL*8    etaN | 
| 65 | REAL*8    etaNM1 | 
| 66 | REAL*8    etaNSum | 
| 67 | REAL*8    beta | 
| 68 | REAL*8    alpha | 
| 69 | REAL*8    alphaSum | 
| 70 | REAL*8    sumRHS | 
| 71 | REAL*8    temp | 
| 72 | #else | 
| 73 | Real    actualResidual | 
| 74 | Real    err | 
| 75 | Real    errSum | 
| 76 | Real    etaN | 
| 77 | Real    etaNM1 | 
| 78 | Real    etaNSum | 
| 79 | Real    beta | 
| 80 | Real    alpha | 
| 81 | Real    alphaSum | 
| 82 | Real    sumRHS | 
| 83 | Real    temp | 
| 84 | #endif | 
| 85 |  | 
| 86 | C--   Initialise inverter | 
| 87 | etaNM1              = 1. _d 0 | 
| 88 |  | 
| 89 | C--   Initial residual calculation | 
| 90 | err    = 0. _d 0 | 
| 91 | sumRHS = 0. _d 0 | 
| 92 | DO J=1,sNy | 
| 93 | DO I=1,sNx | 
| 94 | cg2d_s(I,J) = 0. _d 0 | 
| 95 | cg2d_r(I,J) = cg2d_b(I,J) - | 
| 96 | &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  ) | 
| 97 | &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1) | 
| 98 | &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  ) | 
| 99 | &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  ) | 
| 100 | &   ) | 
| 101 | err    = err    + cg2d_r(I,J)*cg2d_r(I,J) | 
| 102 | sumRHS = sumRHS + cg2d_b(I,J) | 
| 103 | ENDDO | 
| 104 | ENDDO | 
| 105 | CALL EXCH_XY_R8( cg2d_r    ) | 
| 106 | CALL EXCH_XY_R8( cg2d_s    ) | 
| 107 | CALL GSUM_R8( temp, err    ) | 
| 108 | err    = temp | 
| 109 | CALL GSUM_R8( temp, sumRHS ) | 
| 110 | sumRHS = temp | 
| 111 |  | 
| 112 | actualIts      = 0 | 
| 113 | actualResidual = SQRT(err) | 
| 114 | WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual | 
| 115 | IF ( actualResidual .EQ. 0. _d 0) STOP 'ABNORMAL END: RESIDUAL 0' | 
| 116 |  | 
| 117 | C     >>>>>>>>>>>>>>> BEGIN SOLVER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< | 
| 118 | DO 10 N=1, cg2dMaxIters | 
| 119 | C--    Solve preconditioning equation and update | 
| 120 | C--    conjugate direction vector "s". | 
| 121 | etaN = 0. _d 0 | 
| 122 | DO J=1,sNy | 
| 123 | DO I=1,sNx | 
| 124 | cg2d_q(I,J) = | 
| 125 | &    pW(I  ,J  )*cg2d_r(I-1,J  )+pW(I+1,J  )*cg2d_r(I+1,J  ) | 
| 126 | &   +pS(I  ,J  )*cg2d_r(I  ,J-1)+pS(I  ,J+1)*cg2d_r(I  ,J+1) | 
| 127 | &   +pC(I  ,J  )*cg2d_r(I  ,J  ) | 
| 128 | etaN = etaN+cg2d_q(I,J)*cg2d_r(I,J) | 
| 129 | ENDDO | 
| 130 | ENDDO | 
| 131 |  | 
| 132 | CALL GSUM_R8( temp, etaN ) | 
| 133 | etaN   = temp | 
| 134 | beta   = etaN/etaNM1 | 
| 135 | etaNM1 = etaN | 
| 136 |  | 
| 137 | DO J=1,sNy | 
| 138 | DO I=1,sNx | 
| 139 | cg2d_s(I,J) = cg2d_q(I,J) + beta*cg2d_s(I,J) | 
| 140 | ENDDO | 
| 141 | ENDDO | 
| 142 |  | 
| 143 | C--    Do exchanges that require messages i.e. between | 
| 144 | C--    processes. | 
| 145 | CALL EXCH_XY_R8( cg2d_s ) | 
| 146 |  | 
| 147 | C--    Ten extra exchanges | 
| 148 | #ifdef TEN_EXTRA_EXCHS | 
| 149 | DO J=1,10 | 
| 150 | CALL EXCH_XY_R8( cg2d_s ) | 
| 151 | ENDDO | 
| 152 | #endif | 
| 153 |  | 
| 154 | C==    Evaluate laplace operator on conjugate gradient vector | 
| 155 | C==    q = A.s | 
| 156 | alpha = 0. _d 0 | 
| 157 | DO J=1,sNy | 
| 158 | DO I=1,sNx | 
| 159 | cg2d_q(I,J) = | 
| 160 | &   aW2d(I  ,J  )*cg2d_s(I-1,J  )+aW2d(I+1,J  )*cg2d_s(I+1,J  ) | 
| 161 | &  +aS2d(I  ,J  )*cg2d_s(I  ,J-1)+aS2d(I  ,J+1)*cg2d_s(I  ,J+1) | 
| 162 | &  -aW2d(I  ,J  )*cg2d_s(I  ,J  )-aW2d(I+1,J  )*cg2d_s(I  ,J  ) | 
| 163 | &  -aS2d(I  ,J  )*cg2d_s(I  ,J  )-aS2d(I  ,J+1)*cg2d_s(I  ,J  ) | 
| 164 | alpha = alpha+cg2d_s(I,J)*cg2d_q(I,J) | 
| 165 | ENDDO | 
| 166 | ENDDO | 
| 167 | CALL GSUM_R8( temp, alpha ) | 
| 168 |  | 
| 169 | #ifdef HUNDRED_EXTRA_SUMS | 
| 170 | C--    Hundred extra global sums | 
| 171 | DO J=1,100 | 
| 172 | CALL GSUM_R8( temp, alpha ) | 
| 173 | ENDDO | 
| 174 | #endif | 
| 175 |  | 
| 176 | alpha = temp | 
| 177 | alpha = etaN/alpha | 
| 178 |  | 
| 179 | C==    Update solution and residual vectors | 
| 180 | C      Now compute "interior" points. | 
| 181 | err = 0. _d 0 | 
| 182 | DO J=1,sNy | 
| 183 | DO I=1,sNx | 
| 184 | cg2d_x(I,J)=cg2d_x(I,J)+alpha*cg2d_s(I,J) | 
| 185 | cg2d_r(I,J)=cg2d_r(I,J)-alpha*cg2d_q(I,J) | 
| 186 | err = err+cg2d_r(I,J)*cg2d_r(I,J) | 
| 187 | ENDDO | 
| 188 | ENDDO | 
| 189 |  | 
| 190 | CALL GSUM_R8( temp, err ) | 
| 191 |  | 
| 192 | err = temp | 
| 193 | err = SQRT(err) | 
| 194 | actualIts      = N | 
| 195 | actualResidual = err | 
| 196 | #ifdef RESIDUAL_PER_ITERATION | 
| 197 | WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual | 
| 198 | #endif | 
| 199 | IF ( err .LT. cg2dTargetResidual ) GOTO 11 | 
| 200 | CALL EXCH_XY_R8(cg2d_r ) | 
| 201 | #ifdef PAPI_PER_ITERATION | 
| 202 | #ifdef USE_PAPI_FLOPS | 
| 203 | call PAPIF_flops(real_time, proc_time, flpops, mflops, check) | 
| 204 | WRITE(6,'(F10.3,A7,F10.3,A37,I8)') | 
| 205 | $      mflops, ' user ', mflops*proc_time/real_time, | 
| 206 | $      ' wallclock Mflop/s during iteration ', N | 
| 207 | #else | 
| 208 | #ifdef USE_PAPI_FLIPS | 
| 209 | call PAPIF_flips(real_time, proc_time, flpops, mflops, check) | 
| 210 | WRITE(6,'(F10.3,A7,F10.3,A37,I8)') | 
| 211 | $      mflops, ' user ', mflops*proc_time/real_time, | 
| 212 | $      ' wallclock Mflip/s during iteration ', N | 
| 213 | #endif | 
| 214 | #endif | 
| 215 | #endif | 
| 216 | 10 CONTINUE | 
| 217 | 11 CONTINUE | 
| 218 | CALL EXCH_XY_R8(cg2d_x ) | 
| 219 | #ifdef PAPI_PER_TIMESTEP | 
| 220 | #ifdef USE_PAPI_FLOPS | 
| 221 | call PAPIF_flops(real_time, proc_time, flpops, mflops, check) | 
| 222 | WRITE(6,'(F10.3,A7,F10.3,A37,I8)') | 
| 223 | $      mflops, ' user ', mflops*proc_time/real_time, | 
| 224 | $      ' wallclock Mflop/s during iteration ', N | 
| 225 | #else | 
| 226 | #ifdef USE_PAPI_FLIPS | 
| 227 | call PAPIF_flips(real_time, proc_time, flpops, mflops, check) | 
| 228 | WRITE(6,'(F10.3,A7,F10.3,A37,I8)') | 
| 229 | $      mflops, ' user ', mflops*proc_time/real_time, | 
| 230 | $      ' wallclock Mflip/s during iteration ', N | 
| 231 | #endif | 
| 232 | #endif | 
| 233 | #endif | 
| 234 | WRITE(6,*) ' CG2D iters, err = ', actualIts, actualResidual | 
| 235 |  | 
| 236 | C     Calc Ax to check result | 
| 237 | DO J=1,sNy | 
| 238 | DO I=1,sNx | 
| 239 | cg2d_Ax(I,J) = | 
| 240 | &   ( aW2d(I  ,J  )*cg2d_x(I-1,J  )+aW2d(I+1,J  )*cg2d_x(I+1,J  ) | 
| 241 | &    +aS2d(I  ,J  )*cg2d_x(I  ,J-1)+aS2d(I  ,J+1)*cg2d_x(I  ,J+1) | 
| 242 | &    -aW2d(I  ,J  )*cg2d_x(I  ,J  )-aW2d(I+1,J  )*cg2d_x(I  ,J  ) | 
| 243 | &    -aS2d(I  ,J  )*cg2d_x(I  ,J  )-aS2d(I  ,J+1)*cg2d_x(I  ,J  ) | 
| 244 | &   ) | 
| 245 | cg2d_r(I,J) = cg2d_b(I,J)-cg2d_Ax(I,J) | 
| 246 | ENDDO | 
| 247 | ENDDO | 
| 248 | CALL EXCH_XY_R8(cg2d_Ax ) | 
| 249 | CALL EXCH_XY_R8(cg2d_r  ) | 
| 250 |  | 
| 251 | END |