C $Header: /home/ubuntu/mnt/e9_copy/MITgcm_contrib/torge/itd/code/seaice_fgmres.F,v 1.2 2012/11/02 16:52:08 torge Exp $
C $Name:  $

#include "SEAICE_OPTIONS.h"

C--   File seaice_fgmres.F: seaice fgmres dynamical (linear) solver S/R:
C--   Contents
C--   o SEAICE_FGMRES_DRIVER
C--   o SEAICE_MAP2VEC
C--   o SEAICE_FGMRES
C--   o SCALPROD

CBOP
C     !ROUTINE: SEAICE_FGMRES_DRIVER
C     !INTERFACE:

      SUBROUTINE SEAICE_FGMRES_DRIVER(
     I     uIceRes, vIceRes,
     U     duIce, dvIce,
     U     iCode,
     I     FGMRESeps,
     I     newtonIter, krylovIter, myTime, myIter, myThid )

C     !DESCRIPTION: \bv
C     *==========================================================*
C     | SUBROUTINE SEAICE_FGMRES_DRIVER
C     | o driver routine for fgmres
C     | o does the conversion between 2D fields and 1D vector
C     |   back and forth
C     *==========================================================*
C     | written by Martin Losch, Oct 2012
C     *==========================================================*
C     \ev

C     !USES:
      IMPLICIT NONE

C     === Global variables ===
#include "SIZE.h"
#include "EEPARAMS.h"
#include "SEAICE_SIZE.h"
#include "SEAICE_PARAMS.h"

C     !INPUT/OUTPUT PARAMETERS:
C     === Routine arguments ===
C     myTime :: Simulation time
C     myIter :: Simulation timestep number
C     myThid :: my Thread Id. number
C     newtonIter :: current iterate of Newton iteration
C     krylovIter :: current iterate of Newton iteration
C     iCode  :: FGMRES parameter to determine next step
      _RL     myTime
      INTEGER myIter
      INTEGER myThid
      INTEGER newtonIter
      INTEGER krylovIter
      INTEGER iCode
C     FGMRESeps :: tolerance for FGMRES
      _RL FGMRESeps
C     du/vIce   :: solution vector
      _RL duIce(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL dvIce(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
C     u/vIceRes :: residual F(u)
      _RL uIceRes(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL vIceRes(1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)

#if ( defined (SEAICE_CGRID) && \
      defined (SEAICE_ALLOW_JFNK) && \
      defined (SEAICE_ALLOW_DYNAMICS) )
C     Local variables:
C     k :: loop indices
      INTEGER k
C     FGMRES parameters
C     n  :: size of the input vector(s)
C     im :: size of Krylov space
C     ifgmres :: interation counter
C     iout :: control output of fgmres
      INTEGER n
      PARAMETER ( n  = 2*sNx*sNy*nSx*nSy )
      INTEGER im
      PARAMETER ( im = 50 )
      INTEGER ifgmres, iout
C     work arrays
      _RL rhs(n), sol(n)
      _RL vv(n,im+1), w(n,im)
      _RL wk1(n), wk2(n)
C     need to store some of the fgmres parameters and fields so that
C     they are not forgotten between Krylov iterations
      COMMON /FGMRES_I/ ifgmres
      COMMON /FGMRES_RL/ sol, rhs, vv, w
CEOP

C     iout=1 give a little bit of output
      iout=1

C     For now, let only the master thread do all the work
C     - copy from 2D arrays to 1D-vector
C     - perform fgmres step (including global sums)
C     - copy from 1D-vector to 2D arrays
C     not sure if this works properly

      _BEGIN_MASTER ( myThid )
      IF ( iCode .EQ. 0 ) THEN
C     first guess is zero because it is a correction
C     wk2 needs to be reset for iCode = 0, because it may contain
C     remains of the previous Krylov iteration
       DO k=1,n
        sol(k) = 0. _d 0
        wk2(k) = 0. _d 0
       ENDDO
      ELSEIF ( iCode .EQ. 3 ) THEN
       CALL SEAICE_MAP2VEC(n,uIceRes,vIceRes,rhs,.TRUE.,myThid)
C     change sign of rhs because we are solving J*u = -F
C     wk2 needs to be initialised for iCode = 3, because it may contain
C     garbage
       DO k=1,n
        rhs(k) = -rhs(k)
        wk2(k) = 0. _d 0
       ENDDO
      ELSE
C     map preconditioner results or Jacobian times vector,
C     stored in du/vIce to wk2
       CALL SEAICE_MAP2VEC(n,duIce,dvIce,wk2,.TRUE.,myThid)
      ENDIF
C
      CALL SEAICE_FGMRES (n,im,rhs,sol,ifgmres,vv,w,wk1,wk2,
     &     FGMRESeps,SEAICEkrylovIterMax,
     &     iout,icode,krylovIter,myThid)
C
      IF ( iCode .EQ. 0 ) THEN
C     map sol(ution) vector to du/vIce
       CALL SEAICE_MAP2VEC(n,duIce,dvIce,sol,.FALSE.,myThid)
      ELSE
C     map work vector to du/vIce to either compute a preconditioner
C     solution (wk1=rhs) or a Jacobian times wk1
       CALL SEAICE_MAP2VEC(n,duIce,dvIce,wk1,.FALSE.,myThid)
      ENDIF
      _END_MASTER ( myThid )

C     Fill overlaps in updated fields
      CALL EXCH_UV_XY_RL( duIce, dvIce,.TRUE.,myThid)

      RETURN
      END

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: SEAICE_MAP2VEC
C     !INTERFACE:

      SUBROUTINE SEAICE_MAP2VEC(
     I     n,
     O     xfld2d, yfld2d,
     U     vector,
     I     map2vec, myThid )

C     !DESCRIPTION: \bv
C     *==========================================================*
C     | SUBROUTINE SEAICE_MAP2VEC
C     | o maps 2 2D-fields to vector and back
C     *==========================================================*
C     | written by Martin Losch, Oct 2012
C     *==========================================================*
C     \ev

C     !USES:
      IMPLICIT NONE

C     === Global variables ===
#include "SIZE.h"
#include "EEPARAMS.h"
C     === Routine arguments ===
      INTEGER n
      LOGICAL map2vec
      INTEGER myThid
      _RL xfld2d (1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL yfld2d (1-OLx:sNx+OLx,1-OLy:sNy+OLy,nSx,nSy)
      _RL vector (n)
C     === local variables ===
      INTEGER I, J, bi, bj
      INTEGER ii, jj, ib, jb, m
CEOP

      m = n/2
      IF ( map2vec ) THEN
       DO bj=myByLo(myThid),myByHi(myThid)
        jb = nSx*sNy*sNx*(bj-1)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         ib = jb + sNy*sNx*(bi-1)
         DO J=1,sNy
          jj = ib + sNx*(J-1)
          DO I=1,sNx
           ii = jj + I
           vector(ii)   = xfld2d(I,J,bi,bj)
           vector(ii+m) = yfld2d(I,J,bi,bj)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ELSE
       DO bj=myByLo(myThid),myByHi(myThid)
        jb = nSx*sNy*sNx*(bj-1)
        DO bi=myBxLo(myThid),myBxHi(myThid)
         ib = jb + sNy*sNx*(bi-1)
         DO J=1,sNy
          jj = ib + sNx*(J-1)
          DO I=1,sNx
           ii = jj + I
           xfld2d(I,J,bi,bj) = vector(ii)
           yfld2d(I,J,bi,bj) = vector(ii+m)
          ENDDO
         ENDDO
        ENDDO
       ENDDO
      ENDIF

      RETURN
      END

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: SEAICE_FGMRES
C     !INTERFACE:

      SUBROUTINE SEAICE_FGMRES (n,im,rhs,sol,i,vv,w,wk1, wk2,
     &     eps,maxits,iout,icode,its,myThid)

C-----------------------------------------------------------------------
C mlosch Oct 2012: modified the routine further to be compliant with
C MITgcm standards:
C f90 -> F
C !-comment -> C-comment
C double precision -> _RL
C implicit none
C
C jfl Dec 1st 2006. We modified the routine so that it is double precison.
C Here are the modifications:
C 1) implicit real (a-h,o-z) becomes implicit real*8 (a-h,o-z)
C 2) real bocomes real*8
C 3) subroutine scopy.f has been changed for dcopy.f
C 4) subroutine saxpy.f has been changed for daxpy.f
C 5) function sdot.f has been changed for ddot.f
C 6) 1e-08 becomes 1d-08
C
C Be careful with the dcopy, daxpy and ddot code...there is a slight
C difference with the single precision versions (scopy, saxpy and sdot).
C In the single precision versions, the array are declared sightly differently.
C It is written for single precision:
C
C modified 12/3/93, array(1) declarations changed to array(*)
C-----------------------------------------------------------------------

      implicit none
CML   implicit double precision (a-h,o-z) !jfl modification
      integer myThid
      integer n, im, maxits, iout, icode
      _RL rhs(*), sol(*), vv(n,im+1), w(n,im)
      _RL wk1(n), wk2(n), eps
C-----------------------------------------------------------------------
C flexible GMRES routine. This is a version of GMRES which allows a
C a variable preconditioner. Implemented with a reverse communication
C protocole for flexibility -
C DISTRIBUTED VERSION (USES DISTDOT FOR DDOT)
C explicit (exact) residual norms for restarts
C written by Y. Saad, modified by A. Malevsky, version February 1, 1995
C-----------------------------------------------------------------------
C This Is A Reverse Communication Implementation.
C-------------------------------------------------
C USAGE: (see also comments for icode below). FGMRES
C should be put in a loop and the loop should be active for as
C long as icode is not equal to 0. On return fgmres will
C    1) either be requesting the new preconditioned vector applied
C       to wk1 in case icode.eq.1 (result should be put in wk2)
C    2) or be requesting the product of A applied to the vector wk1
C       in case icode.eq.2 (result should be put in wk2)
C    3) or be terminated in case icode .eq. 0.
C on entry always set icode = 0. So icode should be set back to zero
C upon convergence.
C-----------------------------------------------------------------------
C Here is a typical way of running fgmres:
C
C      icode = 0
C 1    continue
C      call fgmres (n,im,rhs,sol,i,vv,w,wk1, wk2,eps,maxits,iout,icode)
C
C      if (icode .eq. 1) then
C         call  precon(n, wk1, wk2)    <--- user variable preconditioning
C         goto 1
C      else if (icode .ge. 2) then
C         call  matvec (n,wk1, wk2)    <--- user matrix vector product.
C         goto 1
C      else
C         ----- done ----
C         .........
C-----------------------------------------------------------------------
C list of parameters
C-------------------
C
C n     == integer. the dimension of the problem
C im    == size of Krylov subspace:  should not exceed 50 in this
C          version (can be reset in code. looking at comment below)
C rhs   == vector of length n containing the right hand side
C sol   == initial guess on input, approximate solution on output
C vv    == work space of size n x (im+1)
C w     == work space of length n x im
C wk1,
C wk2,  == two work vectors of length n each used for the reverse
C          communication protocole. When on return (icode .ne. 1)
C          the user should call fgmres again with wk2 = precon * wk1
C          and icode untouched. When icode.eq.1 then it means that
C          convergence has taken place.
C
C eps   == tolerance for stopping criterion. process is stopped
C          as soon as ( ||.|| is the euclidean norm):
C          || current residual||/||initial residual|| <= eps
C
C maxits== maximum number of iterations allowed
C
C iout  == output unit number number for printing intermediate results
C          if (iout .le. 0) no statistics are printed.
C
C icode = integer. indicator for the reverse communication protocole.
C         ON ENTRY : icode should be set to icode = 0.
C         ON RETURN:
C       * icode .eq. 1 value means that fgmres has not finished
C         and that it is requesting a preconditioned vector before
C         continuing. The user must compute M**(-1) wk1, where M is
C         the preconditioing  matrix (may vary at each call) and wk1 is
C         the vector as provided by fgmres upun return, and put the
C         result in wk2. Then fgmres must be called again without
C         changing any other argument.
C       * icode .eq. 2 value means that fgmres has not finished
C         and that it is requesting a matrix vector product before
C         continuing. The user must compute  A * wk1, where A is the
C         coefficient  matrix and wk1 is the vector provided by
C         upon return. The result of the operation is to be put in
C         the vector wk2. Then fgmres must be called again without
C         changing any other argument.
C       * icode .eq. 0 means that fgmres has finished and sol contains
C         the approximate solution.
C         comment: typically fgmres must be implemented in a loop
C         with fgmres being called as long icode is returned with
C         a value .ne. 0.
C-----------------------------------------------------------------------
C     local variables -- !jfl modif
      integer imax
      parameter ( imax = 50 )
      _RL hh(4*imax+1,4*imax),c(4*imax),s(4*imax)
      _RL rs(4*imax+1),t,ro
C-------------------------------------------------------------
C     arnoldi size should not exceed 50 in this version..
C-------------------------------------------------------------
      integer i, its, i1, ii, j, jj, k, k1!, n1
      _RL r0, gam, epsmac, eps1

CEOP
      save
      data epsmac/1.d-16/
C
C     computed goto
C
      if ( im .gt. imax ) stop 'size of krylov space > 50'
      goto (100,200,300,11) icode +1
 100  continue
CML      n1 = n + 1
      its = 0
C-------------------------------------------------------------
C     **  outer loop starts here..
C--------------compute initial residual vector --------------
C 10   continue
CML      call dcopy (n, sol, 1, wk1, 1) !jfl modification
      do k=1,n
       wk1(k)=sol(k)
      enddo
      icode = 3
      RETURN
 11   continue
      do j=1,n
         vv(j,1) = rhs(j) - wk2(j)
      enddo
CML 20   ro = ddot(n, vv, 1, vv,1) !jfl modification
 20   call scalprod(n, vv, vv, ro, myThid)
      ro = sqrt(ro)
      if (ro .eq. 0.0d0) goto 999
      t = 1.0d0/ ro
      do j=1, n
         vv(j,1) = vv(j,1)*t
      enddo
      if (its .eq. 0) eps1=eps
      if (its .eq. 0) r0 = ro
      if (iout .gt. 0) write(*, 199) its, ro!&
C           print *,'chau',its, ro !write(iout, 199) its, ro
C
C     initialize 1-st term  of rhs of hessenberg system..
C
      rs(1) = ro
      i = 0
 4    i=i+1
      its = its + 1
      i1 = i + 1
      do k=1, n
         wk1(k) = vv(k,i)
      enddo
C
C     return
C
      icode = 1

      RETURN
 200  continue
      do k=1, n
         w(k,i) = wk2(k)
      enddo
C
C     call matvec operation
C
      icode = 2
CML      call dcopy(n, wk2, 1, wk1, 1) !jfl modification
      do k=1,n
       wk1(k)=wk2(k)
      enddo
C
C     return
C
      RETURN
 300  continue
C
C     first call to ope corresponds to intialization goto back to 11.
C
C      if (icode .eq. 3) goto 11
CML      call  dcopy (n, wk2, 1, vv(1,i1), 1) !jfl modification
      do k=1,n
       vv(k,i1)=wk2(k)
      enddo
C
C     modified gram - schmidt...
C
      do j=1, i
CML         t = ddot(n, vv(1,j), 1, vv(1,i1), 1) !jfl modification
         call scalprod(n, vv(1,j), vv(1,i1), t, myThid)
         hh(j,i) = t
CML         call daxpy(n, -t, vv(1,j), 1, vv(1,i1), 1) !jfl modification
CML      enddo
CML      do j=1, i
CML         t = hh(j,i)
         do k=1,n
          vv(k,i1) = vv(k,i1) - t*vv(k,j)
         enddo
      enddo
CML      t = sqrt(ddot(n, vv(1,i1), 1, vv(1,i1), 1)) !jfl modification
      call scalprod(n, vv(1,i1), vv(1,i1), t, myThid)
      t = sqrt(t)
      hh(i1,i) = t
      if (t .eq. 0.0d0) goto 58
      t = 1.0d0 / t
      do k=1,n
         vv(k,i1) = vv(k,i1)*t
      enddo
C
C     done with modified gram schimd and arnoldi step.
C     now  update factorization of hh
C
 58   if (i .eq. 1) goto 121
C
C     perfrom previous transformations  on i-th column of h
C
      do k=2,i
         k1 = k-1
         t = hh(k1,i)
         hh(k1,i) = c(k1)*t + s(k1)*hh(k,i)
         hh(k,i) = -s(k1)*t + c(k1)*hh(k,i)
      enddo
 121  gam = sqrt(hh(i,i)**2 + hh(i1,i)**2)
      if (gam .eq. 0.0d0) gam = epsmac
C-----------#determine next plane rotation  #-------------------
      c(i) = hh(i,i)/gam
      s(i) = hh(i1,i)/gam
      rs(i1) = -s(i)*rs(i)
      rs(i) =  c(i)*rs(i)
C
C     determine res. norm. and test for convergence-
C
      hh(i,i) = c(i)*hh(i,i) + s(i)*hh(i1,i)
      ro = abs(rs(i1))
      if (iout .gt. 0) write(*, 199) its, ro
      if (i .lt. im .and. (ro .gt. eps1))  goto 4
C
C     now compute solution. first solve upper triangular system.
C
      rs(i) = rs(i)/hh(i,i)
      do ii=2,i
         k=i-ii+1
         k1 = k+1
         t=rs(k)
         do j=k1,i
            t = t-hh(k,j)*rs(j)
         enddo
         rs(k) = t/hh(k,k)
      enddo
C
C     done with back substitution..
C     now form linear combination to get solution
C
      do j=1, i
       t = rs(j)
C         call daxpy(n, t, w(1,j), 1, sol,1) !jfl modification
       do k=1,n
        sol(k) = sol(k) + t*w(k,j)
       enddo
      enddo
C
C     test for return
C
      print *, 'ml-fgmres: its, maxits: ', its, maxits, ro, '<', eps1
      if (ro .le. eps1 .or. its .ge. maxits) goto 999
C
C     else compute residual vector and continue..
C
C       goto 10

      do j=1,i
        jj = i1-j+1
        rs(jj-1) = -s(jj-1)*rs(jj)
        rs(jj) = c(jj-1)*rs(jj)
      enddo
      do j=1,i1
        t = rs(j)
        if (j .eq. 1)  t = t-1.0d0
CML        call daxpy (n, t, vv(1,j), 1,  vv, 1)
        do k=1,n
         vv(k,1) = vv(k,1) + t*vv(k,j)
        enddo
      enddo
C
C     restart outer loop.
C
      goto 20
 999  icode = 0

 199  format('   -- fgmres its =', i4, ' res. norm =', d26.16)
C
      RETURN
C-----end-of-fgmres-----------------------------------------------------
C-----------------------------------------------------------------------
      END

C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
CBOP
C     !ROUTINE: SCALPROD
C     !INTERFACE:

      subroutine scalprod(n,dx,dy,t,myThid)

C     forms the dot product of two vectors.
C     uses unrolled loops for increments equal to one.
C     jack dongarra, linpack, 3/11/78.
C     ML: code stolen from BLAS and adapted for parallel applications

      implicit none
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
      integer n
      _RL dx(n),dy(n)
      real*8 t
      integer myThid

      real*8 dtemp
      integer i,m,mp1
#ifdef ALLOW_USE_MPI
      INTEGER mpiRC
#endif   /* ALLOW_USE_MPI */
CEOP
C
      m = mod(n,5)
      dtemp = 0. _d 0
      t     = 0. _d 0
      if( m .eq. 0 ) go to 40
      do i = 1,m
       dtemp = dtemp + dx(i)*dy(i)
      enddo
      if( n .lt. 5 ) go to 60
   40 mp1 = m + 1
      do i = mp1,n,5
       dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) +
     &      dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) +
     &      dx(i + 4)*dy(i + 4)
      enddo
   60 continue
C     sum over all processors
#ifdef ALLOW_USE_MPI
      t = dtemp
      IF ( usingMPI ) THEN
       CALL MPI_Allreduce(t,dtemp,1,MPI_DOUBLE_PRECISION,MPI_SUM,
     &                   MPI_COMM_MODEL,mpiRC)
      ENDIF
#endif /* ALLOW_USE_MPI */
      t = dtemp

CML      return
CML      end
CML
CML      subroutine daxpy(n,da,dx,incx,dy,incy)
CMLC
CMLC     constant times a vector plus a vector.
CMLC     uses unrolled loops for increments equal to one.
CMLC     jack dongarra, linpack, 3/11/78.
CMLC
CML      _RL dx(n),dy(n),da
CML      integer i,incx,incy,ix,iy,m,mp1,n
CMLC
CML      if(n.le.0)return
CML      if (da .eq. 0.0d0) return
CML      if(incx.eq.1.and.incy.eq.1)go to 20
CMLC
CMLC        code for unequal increments or equal increments
CMLC          not equal to 1
CMLC
CML      ix = 1
CML      iy = 1
CML      if(incx.lt.0)ix = (-n+1)*incx + 1
CML      if(incy.lt.0)iy = (-n+1)*incy + 1
CML      do 10 i = 1,n
CML        dy(iy) = dy(iy) + da*dx(ix)
CML        ix = ix + incx
CML        iy = iy + incy
CML   10 continue
CML      return
CMLC
CMLC        code for both increments equal to 1
CMLC
CMLC
CMLC        clean-up loop
CMLC
CML   20 m = mod(n,4)
CML      if( m .eq. 0 ) go to 40
CML      do 30 i = 1,m
CML        dy(i) = dy(i) + da*dx(i)
CML   30 continue
CML      if( n .lt. 4 ) return
CML   40 mp1 = m + 1
CML      do 50 i = mp1,n,4
CML        dy(i) = dy(i) + da*dx(i)
CML        dy(i + 1) = dy(i + 1) + da*dx(i + 1)
CML        dy(i + 2) = dy(i + 2) + da*dx(i + 2)
CML        dy(i + 3) = dy(i + 3) + da*dx(i + 3)
CML   50 continue
CML      return
CML      end
CML
CML      subroutine  dcopy(n,dx,incx,dy,incy)
CMLC
CMLC     copies a vector, x, to a vector, y.
CMLC     uses unrolled loops for increments equal to one.
CMLC     jack dongarra, linpack, 3/11/78.
CMLC
CML      _RL dx(n),dy(n)
CML      integer i,incx,incy,ix,iy,m,mp1,n
CMLC
CML      if(n.le.0)return
CML      if(incx.eq.1.and.incy.eq.1)go to 20
CMLC
CMLC        code for unequal increments or equal increments
CMLC          not equal to 1
CMLC
CML      ix = 1
CML      iy = 1
CML      if(incx.lt.0)ix = (-n+1)*incx + 1
CML      if(incy.lt.0)iy = (-n+1)*incy + 1
CML      do 10 i = 1,n
CML        dy(iy) = dx(ix)
CML        ix = ix + incx
CML        iy = iy + incy
CML   10 continue
CML      return
CMLC
CMLC        code for both increments equal to 1
CMLC
CMLC
CMLC        clean-up loop
CMLC
CML   20 m = mod(n,7)
CML      if( m .eq. 0 ) go to 40
CML      do 30 i = 1,m
CML        dy(i) = dx(i)
CML   30 continue
CML      if( n .lt. 7 ) return
CML   40 mp1 = m + 1
CML      do 50 i = mp1,n,7
CML        dy(i) = dx(i)
CML        dy(i + 1) = dx(i + 1)
CML        dy(i + 2) = dx(i + 2)
CML        dy(i + 3) = dx(i + 3)
CML        dy(i + 4) = dx(i + 4)
CML        dy(i + 5) = dx(i + 5)
CML        dy(i + 6) = dx(i + 6)
CML   50 continue

#endif /* SEAICE_ALLOW_DYNAMICS and SEAICE_CGRID and SEAICE_ALLOW_JFNK */

      RETURN
      END