/[MITgcm]/MITgcm/eesupp/src/check_threads.F
ViewVC logotype

Annotation of /MITgcm/eesupp/src/check_threads.F

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.16 - (hide annotations) (download)
Fri Mar 30 14:18:07 2012 UTC (12 years, 2 months ago) by jmc
Branch: MAIN
CVS Tags: checkpoint65z, checkpoint65x, checkpoint65y, checkpoint65r, checkpoint65s, checkpoint65p, checkpoint65q, checkpoint65v, checkpoint65w, checkpoint65t, checkpoint65u, checkpoint65j, checkpoint65k, checkpoint65h, checkpoint65i, checkpoint65n, checkpoint65l, checkpoint65m, checkpoint65b, checkpoint65c, checkpoint65a, checkpoint65f, checkpoint65g, checkpoint65d, checkpoint65e, checkpoint64, checkpoint65, checkpoint66g, checkpoint66f, checkpoint66e, checkpoint66d, checkpoint66c, checkpoint66b, checkpoint66a, checkpoint66o, checkpoint66n, checkpoint66m, checkpoint66l, checkpoint66k, checkpoint66j, checkpoint66i, checkpoint66h, checkpoint63p, checkpoint63q, checkpoint63r, checkpoint63s, checkpoint63l, checkpoint63m, checkpoint63n, checkpoint63o, checkpoint65o, checkpoint64y, checkpoint64x, checkpoint64z, checkpoint64q, checkpoint64p, checkpoint64s, checkpoint64r, checkpoint64u, checkpoint64t, checkpoint64w, checkpoint64v, checkpoint64i, checkpoint64h, checkpoint64k, checkpoint64j, checkpoint64m, checkpoint64l, checkpoint64o, checkpoint64n, checkpoint64a, checkpoint64c, checkpoint64b, checkpoint64e, checkpoint64d, checkpoint64g, checkpoint64f, HEAD
Changes since 1.15: +59 -12 lines
- specific code if using OpenMP threading; improve checking of # of threads
  across processors (e.g., different env OMP_NUM_THREADS setting);
  do not stop but set eeBootError=T.

1 jmc 1.16 C $Header: /u/gcmpack/MITgcm/eesupp/src/check_threads.F,v 1.15 2011/09/01 14:55:24 jmc Exp $
2 adcroft 1.9 C $Name: $
3 cnh 1.1
4     #include "CPP_EEOPTIONS.h"
5    
6 cnh 1.10 CBOP
7     C !ROUTINE: CHECK_THREADS
8    
9     C !INTERFACE:
10 cnh 1.1 SUBROUTINE CHECK_THREADS( myThid )
11    
12 cnh 1.10 C !DESCRIPTION:
13     C *==========================================================
14 jmc 1.15 C | SUBROUTINE CHECK\_THREADS
15     C | o Check that all the threads we need did indeed start.
16     C *==========================================================
17     C | This routine is called during the initialisation phase
18     C | to check whether all the threads have started.
19     C | It is invoked by every thread and if any thread finds an
20     C | error it should set its error flag.
21     C | Notes:
22     C | Different mechanisms may be required on different
23     C | platforms to actually perform the check. For example as
24     C | coded here each thread checks for a semaphore set by the
25     C | other threads to see if they are running.
26     C | It is also possible for a system to schedule threads
27     C | sequentially, unless some system call is made to yield
28     C | the process. This routine would detect this situation too
29     C | and allow a programmer to modify this routine and the
30     C | barrier code to allow threads to be scheduled more
31     C | appropriately.
32     C *==========================================================
33 cnh 1.10
34     C !USES:
35 jmc 1.15 IMPLICIT NONE
36 cnh 1.10 C == Global variables ==
37 cnh 1.1 #include "SIZE.h"
38     #include "EEPARAMS.h"
39     #include "EESUPPORT.h"
40 cnh 1.10
41     C !INPUT PARAMETERS:
42     C == Routine arguments ==
43     C myThid :: My thread number
44 cnh 1.1 INTEGER myThid
45    
46 jmc 1.15 C !FUNCTIONS:
47     #ifdef USE_OMP_THREADING
48     INTEGER OMP_GET_NUM_THREADS
49     EXTERNAL OMP_GET_NUM_THREADS
50     #endif
51    
52 cnh 1.10 C !LOCAL VARIABLES:
53     C == Local variables ==
54 jmc 1.15 C I :: Loop counter
55 cnh 1.10 C numberThreadRunning :: Count of number of threads this thread
56     C thinks are running.
57     C nChecks :: Number of times checked for all threads. After so
58     C many checks give up and report an error.
59 jmc 1.15 C msgBuf :: Informational/error message buffer
60 cnh 1.1 INTEGER nChecks
61 jmc 1.15 CHARACTER*(MAX_LEN_MBUF) msgBuf
62 jmc 1.16 #ifdef USE_OMP_THREADING
63     #ifdef ALLOW_USE_MPI
64     INTEGER myErr, mpiRC
65     #endif
66     #else /* USE_OMP_THREADING */
67     INTEGER I, numberThreadsRunning
68     #endif /* USE_OMP_THREADING */
69 cnh 1.10 CEOP
70    
71 jmc 1.15 C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
72    
73     #ifdef USE_OMP_THREADING
74     C-- Check early-on that number of threads match
75 jmc 1.16
76 jmc 1.15 IF ( OMP_GET_NUM_THREADS() .NE. nThreads ) THEN
77 jmc 1.16 C- This process has problems in multi-threads setting (detected by
78     C all pseudo-threads); note: cannot use any Barrier in this context
79    
80 jmc 1.15 WRITE(msgBuf,'(2A,I6)') 'CHECK_THREADS:',
81     & ' from "eedata", nThreads=', nThreads
82     CALL PRINT_ERROR( msgBuf, myThid )
83     WRITE(msgBuf,'(2A,I6)') ' not equal to ',
84     & 'Env.Var. OMP_NUM_THREADS=', OMP_GET_NUM_THREADS()
85     CALL PRINT_ERROR( msgBuf, myThid )
86 jmc 1.16 thError(myThid) = .TRUE.
87     eeBootError = .TRUE.
88     IF ( myThid.EQ.1 ) THEN
89     C- one pseudo-thread (thId=1) export the error to other MPI processes
90     nChecks = 1
91     #ifdef ALLOW_USE_MPI
92     IF ( usingMPI ) THEN
93     myErr = nChecks
94     CALL MPI_Allreduce( myErr,nChecks,1,MPI_INTEGER,
95     & MPI_SUM,MPI_COMM_MODEL,mpiRC )
96     ENDIF
97     #endif /* ALLOW_USE_MPI */
98     ENDIF
99    
100     ELSE
101     C- this process has a working multi-threads setting
102    
103     threadIsRunning(myThid) = .TRUE.
104     IF ( myThid.EQ.1 ) THEN
105     C- master collects error from other MPI processes
106     nChecks = 0
107     #ifdef ALLOW_USE_MPI
108     IF ( usingMPI ) THEN
109     myErr = nChecks
110     CALL MPI_Allreduce( myErr,nChecks,1,MPI_INTEGER,
111     & MPI_SUM,MPI_COMM_MODEL,mpiRC )
112     ENDIF
113     #endif /* ALLOW_USE_MPI */
114     IF ( nChecks.NE.0 ) THEN
115     WRITE(msgBuf,'(A,I5,A)') 'CHECK_THREADS:', nChecks,
116     & ' error(s) from other Processes'
117     CALL PRINT_ERROR( msgBuf, myThid )
118     eeBootError = .TRUE.
119     ENDIF
120     ENDIF
121     C- ensure all threads leave with updated eeBootError (shared) value
122     C$OMP BARRIER
123    
124 jmc 1.15 ENDIF
125 jmc 1.16
126     #else /* ndef USE_OMP_THREADING */
127 jmc 1.15
128     threadIsRunning(myThid) = .TRUE.
129 cnh 1.1 nChecks = 0
130 jmc 1.15
131 cnh 1.1 10 CONTINUE
132     numberThreadsRunning = 0
133     nChecks = nChecks + 1
134     DO I = 1, nThreads
135     IF ( threadIsRunning(I) )
136     & numberThreadsRunning = numberThreadsRunning+1
137     ENDDO
138     IF ( nChecks .GT. 10 ) THEN
139     thError(myThid) = .TRUE.
140     eeBootError = .TRUE.
141 jmc 1.15 WRITE(msgBuf,'(A,I5,A,I5,A)')
142 jmc 1.16 & 'CHECK_THREADS: Only ',numberThreadsRunning,
143     & ' thread(s), ',nThreads,' are needed for this config!'
144 jmc 1.15 CALL PRINT_ERROR( msgBuf, myThid )
145 cnh 1.1 C-- Not enough threads are running so halt the program.
146 cnh 1.3 C I did not want this here but it is the only place I have found that
147 cnh 1.1 C KAP deadlocks if it there are fewer threads than iterations in a BLOCKED(1)
148 jmc 1.15 C loop. The deadlock appears to be in the routine mppjoin which never
149 cnh 1.1 C returns. I tried putting the STOP in main or breaking out of the loop in main
150     C but this causes KAP to insert a call to mppjoin - which then deadlocks!
151     IF ( myThid .EQ. 1 ) THEN
152     STOP 'ABNORMAL END: S/R CHECK_THREADS'
153     ENDIF
154     GOTO 11
155     ENDIF
156     IF ( numberThreadsRunning .NE. nThreads ) THEN
157 heimbach 1.7 #ifdef HAVE_SYSTEM
158 cnh 1.1 CALL SYSTEM('sleep 1')
159 adcroft 1.5 #endif
160 cnh 1.1 GOTO 10
161     ENDIF
162     11 CONTINUE
163 adcroft 1.9
164 jmc 1.16 #endif /* ndef USE_OMP_THREADING */
165    
166 jmc 1.14 C-- check barrier synchronization: 1rst (initial) call.
167 jmc 1.16 IF ( .NOT. eeBootError ) THEN
168     CALL BAR_CHECK( 1, myThid )
169     ENDIF
170 jmc 1.14
171 cnh 1.1 RETURN
172     END

  ViewVC Help
Powered by ViewVC 1.1.22