C $Header: $
C $Name: $

C--  File global_sum_tile.F: Routines that perform global sum
C                            on a tile array
C--   Contents
C     o global_sum_tile_rl
C     o global_sum_tile_rs <- not yet coded

#include "CPP_EEOPTIONS.h"

CBOP
C     !ROUTINE: GLOBAL_SUM_TILE_RL

C     !INTERFACE:
      SUBROUTINE GLOBAL_SUM_TILE_RL(
     I                       phiTile,
     O                       sumPhi,
     I                       myThid )
      IMPLICIT NONE
C     !DESCRIPTION:
C     *==========================================================*
C     | SUBROUTINE GLOBAL\_SUM\_TILE\_RL
C     | o Handle sum for _RL data.
C     *==========================================================*
C     | Apply sum on an array of one value per tile
C     | and operate over all tiles & all the processes.
C     *==========================================================*
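C     A minimal usage sketch (illustrative only; variable names below
C     are assumed, not part of this file):
C       _RL tileVal(nSx,nSy), theSum
C       ... fill tileVal(bi,bj) with one value per tile ...
C       CALL GLOBAL_SUM_TILE_RL( tileVal, theSum, myThid )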

C     !USES:
C     == Global data ==
#include "SIZE.h"
#include "EEPARAMS.h"
#include "EESUPPORT.h"
#include "GLOBAL_SUM.h"

C     !INPUT/OUTPUT PARAMETERS:
C     == Routine arguments ==
C     phiTile :: Input array with one value per tile
C     sumPhi  :: Result of sum.
C     myThid  :: My thread id.
      _RL     phiTile(nSx,nSy)
      _RL     sumPhi
      INTEGER myThid

C     !LOCAL VARIABLES:
C     == Local variables ==
C     bi,bj   :: Loop counters
C     mpiRC   :: MPI return code
C-    type declaration of: sumMyPr, sumAllP, localBuf and shareBufGSR8 :
C     all 4 need to have the same length as MPI_DOUBLE_PRECISION
      INTEGER bi,bj
#ifdef ALLOW_USE_MPI
#ifdef GLOBAL_SUM_SEND_RECV
      INTEGER biG, bjG, npe
      INTEGER lbuff, idest, itag, ready_to_receive
      INTEGER istatus(MPI_STATUS_SIZE), ierr
      Real*8  localBuf (nSx,nSy)
      Real*8  globalBuf(nSx*nPx,nSy*nPy)
#else
      INTEGER mpiRC
      Real*8  sumMyPr
#endif
#else /* ALLOW_USE_MPI */
      Real*8  sumMyPr
#endif /* ALLOW_USE_MPI */
      Real*8  sumAllP
CEOP
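C     Two communication strategies are used below:
C     - with GLOBAL_SUM_SEND_RECV (and MPI), process 0 explicitly
C       gathers one value per tile from every process, sums them,
C       and sends the result back;
C     - otherwise each process sums its own tiles and, when MPI is
C       used, the partial sums are combined with MPI_Allreduce.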

C-    note(jmc): do not see why we need this Barrier here + the one at the end;
C     however a) both are present in the standard version of global_sum;
C             b) with OpenMP on hugo, it does not work if both are commented out
      CALL BAR2( myThid )

C--   write local sum into shared-buffer array
      DO bj = myByLo(myThid), myByHi(myThid)
       DO bi = myBxLo(myThid), myBxHi(myThid)
         shareBufGSR8(bi,bj) = phiTile(bi,bj)
       ENDDO
      ENDDO
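C     (each thread copies only the tiles it owns, myBxLo:myBxHi and
C      myByLo:myByHi, so after the next barrier the master thread sees
C      the complete per-tile array of this process)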

C--   Master thread cannot start until everyone is ready:
      CALL BAR2( myThid )
      _BEGIN_MASTER( myThid )

#if (defined (GLOBAL_SUM_SEND_RECV) && defined (ALLOW_USE_MPI) )

      lbuff = nSx*nSy
      idest = 0
      itag  = 0
      ready_to_receive = 0
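C     (handshake: unless DISABLE_MPI_READY_TO_RECEIVE is set, process 0
C      polls each process with a small "ready_to_receive" message before
C      that process sends its tile values, so sends arrive one at a time)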

      IF ( mpiMyId.NE.0 ) THEN

C--   All processes except 0 wait to be polled, then send their local array
#ifndef DISABLE_MPI_READY_TO_RECEIVE
        CALL MPI_RECV (ready_to_receive, 1, MPI_INTEGER,
     &                 idest, itag, MPI_COMM_MODEL, istatus, ierr)
#endif
        CALL MPI_SEND (shareBufGSR8, lbuff, MPI_DOUBLE_PRECISION,
     &                 idest, itag, MPI_COMM_MODEL, ierr)

C--   All processes except 0 receive the result from process 0
        CALL MPI_RECV (sumAllP, 1, MPI_DOUBLE_PRECISION,
     &                 idest, itag, MPI_COMM_MODEL, istatus, ierr)

      ELSE

C--   Process 0 fills in its local data
        npe = 0
        DO bj=1,nSy
         DO bi=1,nSx
           biG = (mpi_myXGlobalLo(npe+1)-1)/sNx+bi
           bjG = (mpi_myYGlobalLo(npe+1)-1)/sNy+bj
           globalBuf(biG,bjG) = shareBufGSR8(bi,bj)
         ENDDO
        ENDDO
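C     (biG,bjG map the local tile indices to the global tile grid using
C      the domain offsets mpi_myXGlobalLo/mpi_myYGlobalLo of process npe)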

C--   Process 0 polls and receives data from each process in turn
        DO npe = 1, numberOfProcs-1
#ifndef DISABLE_MPI_READY_TO_RECEIVE
          CALL MPI_SEND (ready_to_receive, 1, MPI_INTEGER,
     &                   npe, itag, MPI_COMM_MODEL, ierr)
#endif
          CALL MPI_RECV (localBuf, lbuff, MPI_DOUBLE_PRECISION,
     &                   npe, itag, MPI_COMM_MODEL, istatus, ierr)

C--   Process 0 gathers the local arrays into a global array.
          DO bj=1,nSy
           DO bi=1,nSx
             biG = (mpi_myXGlobalLo(npe+1)-1)/sNx+bi
             bjG = (mpi_myYGlobalLo(npe+1)-1)/sNy+bj
             globalBuf(biG,bjG) = localBuf(bi,bj)
           ENDDO
          ENDDO
        ENDDO
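C     (globalBuf now holds one value per tile over the whole domain)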

C--   Sum over all tiles:
        sumAllP = 0.
        DO bjG = 1,nSy*nPy
         DO biG = 1,nSx*nPx
           sumAllP = sumAllP + globalBuf(biG,bjG)
         ENDDO
        ENDDO

C--   Process 0 sends the result to all other processes
        lbuff = 1
        DO npe = 1, numberOfProcs-1
          CALL MPI_SEND (sumAllP, 1, MPI_DOUBLE_PRECISION,
     &                   npe, itag, MPI_COMM_MODEL, ierr)
        ENDDO
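C     (this loop of point-to-point sends plays the role of a broadcast
C      of the result to the other processes)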

      ENDIF

#else /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */

C--   Sum over all tiles (of the same process) first
      sumMyPr = 0.
      DO bj = 1,nSy
       DO bi = 1,nSx
         sumMyPr = sumMyPr + shareBufGSR8(bi,bj)
       ENDDO
      ENDDO

C     in case MPI is not used:
      sumAllP = sumMyPr

#ifdef ALLOW_USE_MPI
#ifndef ALWAYS_USE_MPI
      IF ( usingMPI ) THEN
#endif
        CALL MPI_Allreduce(sumMyPr,sumAllP,1,MPI_DOUBLE_PRECISION,
     &                     MPI_SUM,MPI_COMM_MODEL,mpiRC)
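C     (MPI_Allreduce sums the per-process partial sums and returns the
C      global total to every process, so no separate broadcast is needed)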
#ifndef ALWAYS_USE_MPI
      ENDIF
#endif
#endif /* ALLOW_USE_MPI */

#endif /* not (GLOBAL_SUM_SEND_RECV & ALLOW_USE_MPI) */

C--   Write solution to shared buffer (all threads can see it)
      shareBufGSR8(1,1) = sumAllP

      _END_MASTER( myThid )
C--   Everyone waits for the master thread to be ready
      CALL BAR2( myThid )

C--   set result for every thread
      sumPhi = shareBufGSR8(1,1)

C-    note(jmc): do not see why we need this Barrier here (see comment at the top)
      CALL BAR2( myThid )

      RETURN
      END