1 |
C $Header: /u/gcmpack/MITgcm/eesupp/src/check_threads.F,v 1.15 2011/09/01 14:55:24 jmc Exp $ |
2 |
C $Name: $ |
3 |
|
4 |
#include "CPP_EEOPTIONS.h" |
5 |
|
6 |
CBOP |
7 |
C !ROUTINE: CHECK_THREADS |
8 |
|
9 |
C !INTERFACE: |
10 |
SUBROUTINE CHECK_THREADS( myThid ) |
11 |
|
12 |
C !DESCRIPTION: |
13 |
C *========================================================== |
14 |
C | SUBROUTINE CHECK\_THREADS |
15 |
C | o Check that all the threads we need did indeed start. |
16 |
C *========================================================== |
17 |
C | This routine is called during the initialisation phase |
18 |
C | to check whether all the threads have started. |
19 |
C | It is invoked by every thread and if any thread finds an |
20 |
C | error it should set its error flag. |
21 |
C | Notes: |
22 |
C | Different mechanisms may be required on different |
23 |
C | platforms to actually perform the check. For example as |
24 |
C | coded here each thread checks for a semaphore set by the |
25 |
C | other threads to see if they are running. |
26 |
C | It is also possible for a system to schedule threads |
27 |
C | sequentially, unless some system call is made to yield |
28 |
C | the process. This routine would detect this situation too |
29 |
C | and allow a programmer to modify this routine and the |
30 |
C | barrier code to allow threads to be scheduled more |
31 |
C | appropriately. |
32 |
C *========================================================== |
33 |
|
34 |
C !USES: |
35 |
IMPLICIT NONE |
36 |
C == Global variables == |
37 |
#include "SIZE.h" |
38 |
#include "EEPARAMS.h" |
39 |
#include "EESUPPORT.h" |
40 |
|
41 |
C !INPUT PARAMETERS: |
42 |
C == Routine arguments == |
43 |
C myThid :: My thread number |
44 |
INTEGER myThid |
45 |
|
46 |
C !FUNCTIONS: |
47 |
#ifdef USE_OMP_THREADING |
48 |
INTEGER OMP_GET_NUM_THREADS |
49 |
EXTERNAL OMP_GET_NUM_THREADS |
50 |
#endif |
51 |
|
52 |
C !LOCAL VARIABLES: |
53 |
C == Local variables == |
54 |
C I :: Loop counter |
55 |
C numberThreadRunning :: Count of number of threads this thread |
56 |
C thinks are running. |
57 |
C nChecks :: Number of times checked for all threads. After so |
58 |
C many checks give up and report an error. |
59 |
C msgBuf :: Informational/error message buffer |
60 |
INTEGER nChecks |
61 |
CHARACTER*(MAX_LEN_MBUF) msgBuf |
62 |
#ifdef USE_OMP_THREADING |
63 |
#ifdef ALLOW_USE_MPI |
64 |
INTEGER myErr, mpiRC |
65 |
#endif |
66 |
#else /* USE_OMP_THREADING */ |
67 |
INTEGER I, numberThreadsRunning |
68 |
#endif /* USE_OMP_THREADING */ |
69 |
CEOP |
70 |
|
71 |
C---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----| |
72 |
|
73 |
#ifdef USE_OMP_THREADING |
74 |
C-- Check early-on that number of threads match |
75 |
|
76 |
IF ( OMP_GET_NUM_THREADS() .NE. nThreads ) THEN |
77 |
C- This process has problems in multi-threads setting (detected by |
78 |
C all pseudo-threads); note: cannot use any Barrier in this context |
79 |
|
80 |
WRITE(msgBuf,'(2A,I6)') 'CHECK_THREADS:', |
81 |
& ' from "eedata", nThreads=', nThreads |
82 |
CALL PRINT_ERROR( msgBuf, myThid ) |
83 |
WRITE(msgBuf,'(2A,I6)') ' not equal to ', |
84 |
& 'Env.Var. OMP_NUM_THREADS=', OMP_GET_NUM_THREADS() |
85 |
CALL PRINT_ERROR( msgBuf, myThid ) |
86 |
thError(myThid) = .TRUE. |
87 |
eeBootError = .TRUE. |
88 |
IF ( myThid.EQ.1 ) THEN |
89 |
C- one pseudo-thread (thId=1) export the error to other MPI processes |
90 |
nChecks = 1 |
91 |
#ifdef ALLOW_USE_MPI |
92 |
IF ( usingMPI ) THEN |
93 |
myErr = nChecks |
94 |
CALL MPI_Allreduce( myErr,nChecks,1,MPI_INTEGER, |
95 |
& MPI_SUM,MPI_COMM_MODEL,mpiRC ) |
96 |
ENDIF |
97 |
#endif /* ALLOW_USE_MPI */ |
98 |
ENDIF |
99 |
|
100 |
ELSE |
101 |
C- this process has a working multi-threads setting |
102 |
|
103 |
threadIsRunning(myThid) = .TRUE. |
104 |
IF ( myThid.EQ.1 ) THEN |
105 |
C- master collects error from other MPI processes |
106 |
nChecks = 0 |
107 |
#ifdef ALLOW_USE_MPI |
108 |
IF ( usingMPI ) THEN |
109 |
myErr = nChecks |
110 |
CALL MPI_Allreduce( myErr,nChecks,1,MPI_INTEGER, |
111 |
& MPI_SUM,MPI_COMM_MODEL,mpiRC ) |
112 |
ENDIF |
113 |
#endif /* ALLOW_USE_MPI */ |
114 |
IF ( nChecks.NE.0 ) THEN |
115 |
WRITE(msgBuf,'(A,I5,A)') 'CHECK_THREADS:', nChecks, |
116 |
& ' error(s) from other Processes' |
117 |
CALL PRINT_ERROR( msgBuf, myThid ) |
118 |
eeBootError = .TRUE. |
119 |
ENDIF |
120 |
ENDIF |
121 |
C- ensure all threads leave with updated eeBootError (shared) value |
122 |
C$OMP BARRIER |
123 |
|
124 |
ENDIF |
125 |
|
126 |
#else /* ndef USE_OMP_THREADING */ |
127 |
|
128 |
threadIsRunning(myThid) = .TRUE. |
129 |
nChecks = 0 |
130 |
|
131 |
10 CONTINUE |
132 |
numberThreadsRunning = 0 |
133 |
nChecks = nChecks + 1 |
134 |
DO I = 1, nThreads |
135 |
IF ( threadIsRunning(I) ) |
136 |
& numberThreadsRunning = numberThreadsRunning+1 |
137 |
ENDDO |
138 |
IF ( nChecks .GT. 10 ) THEN |
139 |
thError(myThid) = .TRUE. |
140 |
eeBootError = .TRUE. |
141 |
WRITE(msgBuf,'(A,I5,A,I5,A)') |
142 |
& 'CHECK_THREADS: Only ',numberThreadsRunning, |
143 |
& ' thread(s), ',nThreads,' are needed for this config!' |
144 |
CALL PRINT_ERROR( msgBuf, myThid ) |
145 |
C-- Not enough threads are running so halt the program. |
146 |
C I did not want this here but it is the only place I have found that |
147 |
C KAP deadlocks if it there are fewer threads than iterations in a BLOCKED(1) |
148 |
C loop. The deadlock appears to be in the routine mppjoin which never |
149 |
C returns. I tried putting the STOP in main or breaking out of the loop in main |
150 |
C but this causes KAP to insert a call to mppjoin - which then deadlocks! |
151 |
IF ( myThid .EQ. 1 ) THEN |
152 |
STOP 'ABNORMAL END: S/R CHECK_THREADS' |
153 |
ENDIF |
154 |
GOTO 11 |
155 |
ENDIF |
156 |
IF ( numberThreadsRunning .NE. nThreads ) THEN |
157 |
#ifdef HAVE_SYSTEM |
158 |
CALL SYSTEM('sleep 1') |
159 |
#endif |
160 |
GOTO 10 |
161 |
ENDIF |
162 |
11 CONTINUE |
163 |
|
164 |
#endif /* ndef USE_OMP_THREADING */ |
165 |
|
166 |
C-- check barrier synchronization: 1rst (initial) call. |
167 |
IF ( .NOT. eeBootError ) THEN |
168 |
CALL BAR_CHECK( 1, myThid ) |
169 |
ENDIF |
170 |
|
171 |
RETURN |
172 |
END |