1 |
|
2 |
// Code to do the m-on-n fan-in, recompositing, and output, of data |
3 |
// tiles for MITgcm.
4 |
// |
5 |
// There are aspects of this code that would be simpler in C++, but |
6 |
// we deliberately wrote the code in ansi-C to make linking it with |
7 |
// the Fortran main code easier (should be able to just include the |
8 |
// .o on the link line). |
9 |
|
10 |
|
11 |
#include "PACKAGES_CONFIG.h" |
12 |
|
13 |
#include <stdio.h> |
14 |
#include <unistd.h> |
15 |
#include <stdlib.h> |
16 |
#include <string.h> |
17 |
#include <sys/types.h> |
18 |
#include <sys/stat.h> |
19 |
#include <fcntl.h> |
20 |
|
21 |
#include <mpi.h> |
22 |
#include <alloca.h> |
23 |
|
24 |
|
25 |
#include <stdint.h> |
26 |
#include <errno.h> |
27 |
|
28 |
#define DEBUG 1 |
29 |
|
30 |
#if (DEBUG >= 2) |
31 |
#define FPRINTF fprintf |
32 |
#else |
33 |
#include <stdarg.h> |
34 |
void FPRINTF(FILE *fp,...){return;} |
35 |
#endif |
36 |
|
37 |
|
38 |
// Define our own version of "assert", where we sleep rather than abort. |
39 |
// This makes it easier to attach the debugger. |
40 |
#if (DEBUG >= 1) |
41 |
#define ASSERT(_expr) \ |
42 |
if (!(_expr)) { \ |
43 |
fprintf(stderr, "ASSERT failed for pid %d : `%s': %s: %d: %s\n", \ |
44 |
getpid(), #_expr, __FILE__, __LINE__, __func__); \ |
45 |
sleep(9999); \ |
46 |
} |
47 |
#else |
48 |
#include <assert.h>   /* needed since ASSERT falls back to the standard assert here */
#define ASSERT assert
49 |
#endif |
50 |
|
51 |
|
52 |
|
53 |
// If numRanksPerNode is set to be > 0, we just use that setting. |
54 |
// If it is <= 0, we dynamically determine the number of cores on |
55 |
// a node, and use that. (N.B.: we assume *all* nodes being used in |
56 |
// the run have the *same* number of cores.) |
57 |
int numRanksPerNode = 0; |
58 |
|
59 |
// Just an error check; can be zero (if you're confident it's all correct). |
60 |
#define numCheckBits 2 |
61 |
// This bitMask definition works for 2's complement |
62 |
#define bitMask ((1 << numCheckBits) - 1) |
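// How the check bits are used (inferred from the decode in
// tryToReceiveDataTile below): the compute side is assumed to tag each
// tile message as
//     tag = (tileID << numCheckBits) | (epochID & bitMask)
// e.g. with numCheckBits == 2, tile 5 sent in epoch 7 arrives with tag
// (5 << 2) | (7 & 3) == 23; the receiver recovers tileID = tag >> 2 == 5
// and cross-checks the low bits against the current epoch ID.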
63 |
|
64 |
|
65 |
/////////////////////////////////////////////////////////////////////// |
66 |
// Info about the data fields |
67 |
|
68 |
// double for now. Might also be float someday |
69 |
#define datumType double |
70 |
#define datumSize (sizeof(datumType)) |
71 |
|
72 |
|
73 |
// Info about the data fields. We assume that all the fields have the |
74 |
// same X and Y characteristics, but may have either 1 or NUM_Z levels. |
75 |
// |
76 |
// the following 3 values are required here. Could be sent over or read in |
77 |
// with some rearrangement in init routines |
78 |
// |
79 |
#define NUM_X 4320 |
80 |
#define NUM_Y 56160L // get rid of this someday |
81 |
#define NUM_Z 90 |
82 |
#define MULTDIM 7 |
83 |
#define twoDFieldSizeInBytes (NUM_X * NUM_Y * 1 * datumSize) |
84 |
#define threeDFieldSizeInBytes (twoDFieldSizeInBytes * NUM_Z) |
85 |
#define multDFieldSizeInBytes (twoDFieldSizeInBytes * MULTDIM) |
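// Rough sizes implied by the values above: one 2-D field is
// 4320 * 56160 * 8 bytes ~= 1.8 GiB, and a full-depth 3-D field
// (NUM_Z == 90) is ~163 GiB -- hence the m-on-n fan-in and the
// z-slab-at-a-time writes.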
86 |
|
87 |
// Info about the data tiles. We assume that all the tiles are the same |
88 |
// size (no odd-sized last piece), they all have the same X and Y |
89 |
// characteristics (including ghosting), and are full depth in Z |
90 |
// (either 1 or NUM_Z as appropriate). |
91 |
// |
92 |
// all these data now sent over from compute ranks |
93 |
// |
94 |
int TILE_X = -1; |
95 |
int TILE_Y = -1; |
96 |
int XGHOSTS = -1; |
97 |
int YGHOSTS = -1; |
98 |
|
99 |
// Size of one Z level of a tile (NOT one Z level of a whole field) |
100 |
int tileOneZLevelItemCount = -1; |
101 |
int tileOneZLevelSizeInBytes = -1; |
102 |
|
103 |
|
104 |
typedef struct dataFieldDepth { |
105 |
char dataFieldID; |
106 |
int numZ; |
107 |
} dataFieldDepth_t; |
108 |
|
109 |
dataFieldDepth_t fieldDepths[] = { |
110 |
{ 'u', NUM_Z }, |
111 |
{ 'v', NUM_Z }, |
112 |
{ 'w', NUM_Z }, |
113 |
{ 't', NUM_Z }, |
114 |
{ 's', NUM_Z }, |
115 |
{ 'x', NUM_Z }, |
116 |
{ 'y', NUM_Z }, |
117 |
{ 'n', 1 }, |
118 |
{ 'd', 1 }, |
119 |
{ 'h', 1 }, |
120 |
{ 'a', MULTDIM }, // seaice, 7 == MULTDIM in SEAICE_SIZE.h |
121 |
{ 'b', 1 }, |
122 |
{ 'c', 1 }, |
123 |
{ 'd', 1 }, |
124 |
{ 'e', 1 }, |
125 |
{ 'f', 1 }, |
126 |
{ 'g', 1 }, |
127 |
}; |
128 |
#define numAllFields (sizeof(fieldDepths)/sizeof(dataFieldDepth_t)) |
129 |
|
130 |
|
131 |
|
132 |
/////////////////////////////////////////////////////////////////////// |
133 |
// Info about the various kinds of i/o epochs |
134 |
typedef struct epochFieldInfo { |
135 |
char dataFieldID; |
136 |
MPI_Comm registrationIntercomm; |
137 |
MPI_Comm dataIntercomm; |
138 |
MPI_Comm ioRanksIntracomm; |
139 |
int tileCount; |
140 |
int zDepth; // duplicates the fieldDepth entry; filled in automatically |
141 |
char filenameTemplate[128]; |
142 |
long int offset; |
143 |
int pickup; |
144 |
} fieldInfoThisEpoch_t; |
145 |
|
146 |
// The normal i/o dump |
147 |
fieldInfoThisEpoch_t fieldsForEpochStyle_0[] = { |
148 |
{ 'u', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "U.%010d.%s", 0, 0 }, |
149 |
{ 'v', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "V.%010d.%s", 0, 0 }, |
150 |
{ 't', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "T.%010d.%s", 0,0 }, |
151 |
{ 'n', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "Eta.%010d.%s", 0,0 }, |
152 |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0,0 }, |
153 |
}; |
154 |
|
155 |
|
156 |
// pickup file |
157 |
fieldInfoThisEpoch_t fieldsForEpochStyle_1[] = { |
158 |
{ 'u', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 1}, |
159 |
{ 'v', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 1}, |
160 |
{ 't', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 2 + twoDFieldSizeInBytes * 0, 1}, |
161 |
{ 's', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 3 + twoDFieldSizeInBytes * 0, 1}, |
162 |
{ 'x', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 4 + twoDFieldSizeInBytes * 0, 1}, |
163 |
{ 'y', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 5 + twoDFieldSizeInBytes * 0, 1}, |
164 |
{ 'n', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 0, 1}, |
165 |
{ 'd', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 1, 1}, |
166 |
{ 'h', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 2, 1}, |
167 |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0 ,1}, |
168 |
}; |
169 |
|
170 |
|
171 |
// seaice pickup |
172 |
fieldInfoThisEpoch_t fieldsForEpochStyle_2[] = { |
173 |
{ 'a', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 2}, |
174 |
{ 'b', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 2}, |
175 |
{ 'c', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 1, 2}, |
176 |
{ 'd', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 2, 2}, |
177 |
{ 'g', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 3, 2}, |
178 |
{ 'e', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 4, 2}, |
179 |
{ 'f', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 5, 2}, |
180 |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0 }, |
181 |
}; |
182 |
|
183 |
|
184 |
fieldInfoThisEpoch_t *epochStyles[] = { |
185 |
fieldsForEpochStyle_0, |
186 |
fieldsForEpochStyle_1, |
187 |
fieldsForEpochStyle_2, |
188 |
}; |
189 |
int numEpochStyles = (sizeof(epochStyles) / sizeof(fieldInfoThisEpoch_t *)); |
190 |
|
191 |
|
192 |
typedef enum { |
193 |
cmd_illegal, |
194 |
cmd_newEpoch, |
195 |
cmd_epochComplete, |
196 |
cmd_exit, |
197 |
} epochCmd_t; |
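// The epoch commands travel as a 4-int message (see the MPI_Recv and
// MPI_Bcast of "cmd" in ioRankMain below):
//   cmd[0] = epochCmd_t value (cmd_newEpoch, cmd_exit, ...)
//   cmd[1] = epoch ID
//   cmd[2] = epoch style index (into epochStyles[])
//   cmd[3] = gcmIter (model timestep number, used in the output filenames)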
198 |
|
199 |
|
200 |
/////////////////////////////////////////////////////////////////////// |
201 |
|
202 |
// Note that a rank will only have access to one of the Intracomms, |
203 |
// but all ranks will define the Intercomm. |
204 |
MPI_Comm ioIntracomm = MPI_COMM_NULL; |
205 |
int ioIntracommRank = -1; |
206 |
MPI_Comm computeIntracomm = MPI_COMM_NULL; |
207 |
int computeIntracommRank = -1; |
208 |
MPI_Comm globalIntercomm = MPI_COMM_NULL; |
209 |
|
210 |
|
211 |
#define divCeil(_x,_y) (((_x) + ((_y) - 1)) / (_y)) |
212 |
#define roundUp(_x,_y) (divCeil((_x),(_y)) * (_y)) |
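// e.g. divCeil(90,16) == 6 and roundUp(90,16) == 96; both macros assume
// non-negative integral arguments.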
213 |
|
214 |
|
215 |
|
216 |
typedef enum { |
217 |
bufState_illegal, |
218 |
bufState_Free, |
219 |
bufState_InUse, |
220 |
} bufState_t; |
221 |
|
222 |
|
223 |
typedef struct buf_header{ |
224 |
struct buf_header *next; |
225 |
bufState_t state; |
226 |
int requestsArraySize; |
227 |
MPI_Request *requests; |
228 |
uint64_t ck0; |
229 |
char *payload; // [tileOneZLevelSizeInBytes * NUM_Z]; // Max payload size |
230 |
uint64_t ck1; // now rec'vd from compute ranks & dynamically allocated in |
231 |
} bufHdr_t; // allocateTileBufs |
232 |
|
233 |
|
234 |
bufHdr_t *freeTileBufs_ptr = NULL; |
235 |
bufHdr_t *inUseTileBufs_ptr = NULL; |
236 |
|
237 |
int maxTagValue = -1; |
238 |
int totalNumTiles = -1; |
239 |
|
240 |
// routine to convert double to float during memcpy |
241 |
// need to get byteswapping in here as well |
242 |
void memcpy_r8_2_r4 (float *f, double *d, long long *n)
243 |
{ |
244 |
long long i, rem; |
245 |
rem = *n%16LL; |
246 |
for (i = 0; i < rem; i++) { |
247 |
f [i] = d [i]; |
248 |
} |
249 |
for (i = rem; i < *n; i += 16) { |
250 |
__asm__ __volatile__ ("prefetcht0 %0 # memcpy_r8_2_r4.c 10" : : "m" (d [i + 256 + 0]) ); |
251 |
__asm__ __volatile__ ("prefetcht0 %0 # memcpy_r8_2_r4.c 11" : : "m" (f [i + 256 + 0]) ); |
252 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm0 # memcpy_r8_2_r4.c 12" : : "m" (d [i + 0]) : "%xmm0"); |
253 |
__asm__ __volatile__ ("movss %%xmm0, %0 # memcpy_r8_2_r4.c 13" : "=m" (f [i + 0]) : : "memory"); |
254 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm1 # memcpy_r8_2_r4.c 14" : : "m" (d [i + 1]) : "%xmm1"); |
255 |
__asm__ __volatile__ ("movss %%xmm1, %0 # memcpy_r8_2_r4.c 15" : "=m" (f [i + 1]) : : "memory"); |
256 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm2 # memcpy_r8_2_r4.c 16" : : "m" (d [i + 2]) : "%xmm2"); |
257 |
__asm__ __volatile__ ("movss %%xmm2, %0 # memcpy_r8_2_r4.c 17" : "=m" (f [i + 2]) : : "memory"); |
258 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm3 # memcpy_r8_2_r4.c 18" : : "m" (d [i + 3]) : "%xmm3"); |
259 |
__asm__ __volatile__ ("movss %%xmm3, %0 # memcpy_r8_2_r4.c 19" : "=m" (f [i + 3]) : : "memory"); |
260 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm4 # memcpy_r8_2_r4.c 20" : : "m" (d [i + 4]) : "%xmm4"); |
261 |
__asm__ __volatile__ ("movss %%xmm4, %0 # memcpy_r8_2_r4.c 21" : "=m" (f [i + 4]) : : "memory"); |
262 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm5 # memcpy_r8_2_r4.c 22" : : "m" (d [i + 5]) : "%xmm5"); |
263 |
__asm__ __volatile__ ("movss %%xmm5, %0 # memcpy_r8_2_r4.c 23" : "=m" (f [i + 5]) : : "memory"); |
264 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm6 # memcpy_r8_2_r4.c 24" : : "m" (d [i + 6]) : "%xmm6"); |
265 |
__asm__ __volatile__ ("movss %%xmm6, %0 # memcpy_r8_2_r4.c 25" : "=m" (f [i + 6]) : : "memory"); |
266 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm7 # memcpy_r8_2_r4.c 26" : : "m" (d [i + 7]) : "%xmm7"); |
267 |
__asm__ __volatile__ ("prefetcht0 %0 # memcpy_r8_2_r4.c 27" : : "m" (d [i + 256 + 8 + 0]) ); |
268 |
__asm__ __volatile__ ("movss %%xmm7, %0 # memcpy_r8_2_r4.c 28" : "=m" (f [i + 7]) : : "memory"); |
269 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm8 # memcpy_r8_2_r4.c 29" : : "m" (d [i + 8]) : "%xmm8"); |
270 |
__asm__ __volatile__ ("movss %%xmm8, %0 # memcpy_r8_2_r4.c 30" : "=m" (f [i + 8]) : : "memory"); |
271 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm9 # memcpy_r8_2_r4.c 31" : : "m" (d [i + 9]) : "%xmm9"); |
272 |
__asm__ __volatile__ ("movss %%xmm9, %0 # memcpy_r8_2_r4.c 32" : "=m" (f [i + 9]) : : "memory"); |
273 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm10 # memcpy_r8_2_r4.c 33" : : "m" (d [i + 10]) : "%xmm10"); |
274 |
__asm__ __volatile__ ("movss %%xmm10, %0 # memcpy_r8_2_r4.c 34" : "=m" (f [i + 10]) : : "memory"); |
275 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm11 # memcpy_r8_2_r4.c 35" : : "m" (d [i + 11]) : "%xmm11"); |
276 |
__asm__ __volatile__ ("movss %%xmm11, %0 # memcpy_r8_2_r4.c 36" : "=m" (f [i + 11]) : : "memory"); |
277 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm12 # memcpy_r8_2_r4.c 37" : : "m" (d [i + 12]) : "%xmm12"); |
278 |
__asm__ __volatile__ ("movss %%xmm12, %0 # memcpy_r8_2_r4.c 38" : "=m" (f [i + 12]) : : "memory"); |
279 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm13 # memcpy_r8_2_r4.c 39" : : "m" (d [i + 13]) : "%xmm13"); |
280 |
__asm__ __volatile__ ("movss %%xmm13, %0 # memcpy_r8_2_r4.c 40" : "=m" (f [i + 13]) : : "memory"); |
281 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm14 # memcpy_r8_2_r4.c 41" : : "m" (d [i + 14]) : "%xmm14"); |
282 |
__asm__ __volatile__ ("movss %%xmm14, %0 # memcpy_r8_2_r4.c 42" : "=m" (f [i + 14]) : : "memory"); |
283 |
__asm__ __volatile__ ("cvtsd2ss %0, %%xmm15 # memcpy_r8_2_r4.c 43" : : "m" (d [i + 15]) : "%xmm15"); |
284 |
__asm__ __volatile__ ("movss %%xmm15, %0 # memcpy_r8_2_r4.c 44" : "=m" (f [i + 15]) : : "memory"); |
285 |
} |
286 |
} |
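// For reference, a portable (unoptimized) equivalent of the routine above --
// minus the prefetching, the SSE unrolling, and any future byte-swapping --
// would be simply:
//
//   void memcpy_r8_2_r4_portable(float *f, double *d, long long *n)
//   {
//       long long i;
//       for (i = 0; i < *n; ++i)
//           f[i] = (float) d[i];
//   }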
287 |
|
288 |
// Debug routine |
289 |
void countBufs(int nbufs)
290 |
{ |
291 |
int nInUse, nFree; |
292 |
bufHdr_t *bufPtr; |
293 |
static int target = -1; |
294 |
|
295 |
if (-1 == target) { |
296 |
ASSERT(-1 != nbufs); |
297 |
target = nbufs; |
298 |
} |
299 |
|
300 |
bufPtr = freeTileBufs_ptr; |
301 |
for (nFree = 0; bufPtr != NULL; bufPtr = bufPtr->next) nFree += 1; |
302 |
|
303 |
bufPtr = inUseTileBufs_ptr; |
304 |
for (nInUse = 0; bufPtr != NULL; bufPtr = bufPtr->next) nInUse += 1; |
305 |
|
306 |
if (nInUse + nFree != target) { |
307 |
fprintf(stderr, "Rank %d: bad number of buffs: free %d, inUse %d, should be %d\n", |
308 |
ioIntracommRank, nFree, nInUse, target); |
309 |
sleep(5000); |
310 |
} |
311 |
} |
312 |
|
313 |
int readn(int fd, void *p, int nbytes) |
314 |
{ |
315 |
|
316 |
char *ptr = (char*)(p); |
317 |
|
318 |
int nleft, nread; |
319 |
|
320 |
nleft = nbytes; |
321 |
while (nleft > 0){ |
322 |
nread = read(fd, ptr, nleft); |
323 |
if (nread < 0) |
324 |
return(nread); // error |
325 |
else if (nread == 0) |
326 |
break; // EOF |
327 |
|
328 |
nleft -= nread; |
329 |
ptr += nread; |
330 |
} |
331 |
return (nbytes - nleft); |
332 |
} |
333 |
|
334 |
|
335 |
ssize_t writen(int fd, void *p, size_t nbytes) |
336 |
{ |
337 |
char *ptr = (char*)(p); |
338 |
|
339 |
size_t nleft; |
340 |
ssize_t nwritten; |
341 |
|
342 |
nleft = nbytes; |
343 |
while (nleft > 0){ |
344 |
nwritten = write(fd, ptr, nleft); |
345 |
if (nwritten <= 0){ |
346 |
if (errno==EINTR) continue; // POSIX, not SVr4 |
347 |
return(nwritten); // non-EINTR error |
348 |
} |
349 |
|
350 |
nleft -= nwritten; |
351 |
ptr += nwritten; |
352 |
} |
353 |
return(nbytes - nleft); |
354 |
} |
355 |
|
356 |
|
357 |
void write_pickup_meta(FILE *fp, int gcmIter, int pickup) |
358 |
{ |
359 |
int i; |
360 |
int ndims = 2; |
361 |
int nrecords,nfields; |
362 |
char**f; |
363 |
char*fld1[] = {"Uvel","Vvel","Theta","Salt","GuNm1","GvNm1","EtaN","dEtaHdt","EtaH"}; |
364 |
char*fld2[] = {"siTICES","siAREA","siHEFF","siHSNOW","siHSALT","siUICE","siVICE"}; |
365 |
// for now, just list the fields here. When the whole field specification apparatus |
366 |
// is cleaned up, pull the names out of the epochstyle definition or whatever |
367 |
|
368 |
ASSERT(1==pickup||2==pickup); |
369 |
|
370 |
if (1==pickup){ |
371 |
nrecords = 6*NUM_Z+3; |
372 |
nfields = sizeof(fld1)/sizeof(char*); |
373 |
f = fld1; |
374 |
} |
375 |
else if (2==pickup){ |
376 |
nrecords = MULTDIM+6; |
377 |
nfields = sizeof(fld2)/sizeof(char*); |
378 |
f = fld2; |
379 |
} |
380 |
|
381 |
fprintf(fp," nDims = [ %3d ];\n",ndims); |
382 |
fprintf(fp," dimList = [\n"); |
383 |
fprintf(fp," %10u,%10d,%10u,\n",NUM_X,1,NUM_X); |
384 |
fprintf(fp," %10ld,%10d,%10ld\n",NUM_Y,1,NUM_Y); |
385 |
fprintf(fp," ];\n"); |
386 |
fprintf(fp," dataprec = [ 'float64' ];\n"); |
387 |
fprintf(fp," nrecords = [ %5d ];\n",nrecords); |
388 |
fprintf(fp," timeStepNumber = [ %10d ];\n",gcmIter); |
389 |
fprintf(fp," timeInterval = [ %19.12E ];\n",0.0); // what should this be? |
390 |
fprintf(fp," nFlds = [ %4d ];\n",nfields); |
391 |
fprintf(fp," fldList = {\n"); |
392 |
for (i=0;i<nfields;++i) |
393 |
fprintf(fp," '%-8s'",f[i]); |
394 |
fprintf(fp,"\n };\n"); |
395 |
} |
396 |
|
397 |
|
398 |
|
399 |
double *outBuf=NULL;//[NUM_X*NUM_Y*NUM_Z]; // only needs to be myNumZSlabs |
400 |
size_t outBufSize=0; |
401 |
|
402 |
|
403 |
void |
404 |
do_write(int io_epoch, fieldInfoThisEpoch_t *whichField, int firstZ, int numZ, int gcmIter) |
405 |
{ |
406 |
if (0==numZ) return; // this is *not* global NUM_Z : change name of parameter to avoid grief! |
407 |
|
408 |
int pickup = whichField->pickup; |
409 |
|
410 |
/////////////////////////////// |
411 |
// swap here, if necessary |
412 |
// int i,j; |
413 |
|
414 |
//i = NUM_X*NUM_Y*numZ; |
415 |
//mds_byteswapr8_(&i,outBuf); |
416 |
|
417 |
// mds_byteswapr8 expects an integer count, which is gonna overflow someday |
418 |
// can't redefine to long without affecting a bunch of other calls |
419 |
// so do a slab at a time here, to delay the inevitable |
420 |
// i = NUM_X*NUM_Y; |
421 |
//for (j=0;j<numZ;++j) |
422 |
// mds_byteswapr8_(&i,&outBuf[i*j]); |
423 |
|
424 |
// gnu builtin evidently honored by intel compilers |
425 |
|
426 |
if (pickup) { |
427 |
uint64_t *alias = (uint64_t*)outBuf; |
428 |
size_t i; |
429 |
for (i=0;i<NUM_X*NUM_Y*numZ;++i) |
430 |
alias[i] = __builtin_bswap64(alias[i]); |
431 |
} |
432 |
else { |
433 |
uint32_t *alias = (uint32_t*)outBuf; |
434 |
size_t i; |
435 |
for (i=0;i<NUM_X*NUM_Y*numZ;++i) |
436 |
alias[i] = __builtin_bswap32(alias[i]); |
437 |
} |
438 |
|
439 |
// end of swappiness |
440 |
////////////////////////////////// |
441 |
|
442 |
char s[1024]; |
443 |
//sprintf(s,"henze_%d_%d_%c.dat",io_epoch,gcmIter,whichField->dataFieldID); |
444 |
|
445 |
sprintf(s,whichField->filenameTemplate,gcmIter,"data"); |
446 |
|
447 |
int fd = open(s,O_CREAT|O_WRONLY,S_IRWXU|S_IRGRP); |
448 |
ASSERT(fd!=-1); |
449 |
|
450 |
size_t nbytes; |
451 |
|
452 |
if (pickup) { |
453 |
lseek(fd,whichField->offset,SEEK_SET); |
454 |
lseek(fd,firstZ*NUM_X*NUM_Y*datumSize,SEEK_CUR); |
455 |
nbytes = NUM_X*NUM_Y*numZ*datumSize; |
456 |
} |
457 |
else { |
458 |
lseek(fd,firstZ*NUM_X*NUM_Y*sizeof(float),SEEK_CUR); |
459 |
nbytes = NUM_X*NUM_Y*numZ*sizeof(float); |
460 |
} |
461 |
|
462 |
ssize_t bwrit = writen(fd,outBuf,nbytes); |
463 |
|
464 |
if (-1==bwrit) perror("Henze : file write problem : "); |
465 |
|
466 |
FPRINTF(stderr,"Wrote %d of %d bytes (%d -> %d) \n",bwrit,nbytes,firstZ,numZ); |
467 |
|
468 |
// ASSERT(nbytes == bwrit); |
469 |
|
470 |
if (nbytes!=bwrit) |
471 |
fprintf(stderr,"WROTE %ld /%ld\n",bwrit,nbytes); |
472 |
|
473 |
close(fd); |
474 |
|
475 |
|
476 |
return; |
477 |
} |
478 |
|
479 |
|
480 |
int NTILES = -1; |
481 |
|
482 |
typedef struct { |
483 |
int off; |
484 |
int skip; |
485 |
} tile_layout_t; |
486 |
|
487 |
tile_layout_t *offsetTable; |
488 |
|
489 |
void |
490 |
processSlabSection( |
491 |
fieldInfoThisEpoch_t *whichField, |
492 |
int tileID, |
493 |
void *data, |
494 |
int myNumZSlabs) |
495 |
{ |
496 |
int intracommSize,intracommRank; |
497 |
// MPI_Comm_size(whichField->ioRanksIntracomm, &intracommSize); |
498 |
MPI_Comm_rank(whichField->ioRanksIntracomm, &intracommRank); |
499 |
//printf("i/o rank %d/%d recv'd %d::%d (%d->%d)\n",intracommRank,intracommSize,whichField,tileID,firstZ,lastZ); |
500 |
// printf("rank %d : tile %d is gonna go at %d and stride with %d, z = %d\n",intracommRank,tileID,offsetTable[tileID].off, |
501 |
// offsetTable[tileID].skip,myNumZSlabs); |
502 |
|
503 |
int z; |
504 |
|
505 |
int pickup = whichField->pickup; |
506 |
|
507 |
//ASSERT((tileID > 0) && (tileID < (sizeof(offsetTable)/sizeof(tile_layout_t)))); |
508 |
ASSERT( (tileID > 0) && (tileID <= NTILES) ); // offsetTable now dynamically allocated |
509 |
|
510 |
if (myNumZSlabs * twoDFieldSizeInBytes > outBufSize){ |
511 |
|
512 |
free(outBuf); |
513 |
|
514 |
outBufSize = myNumZSlabs * twoDFieldSizeInBytes; |
515 |
|
516 |
outBuf = malloc(outBufSize); |
517 |
ASSERT(outBuf); |
518 |
|
519 |
memset(outBuf,0,outBufSize); |
520 |
} |
521 |
|
522 |
for (z=0;z<myNumZSlabs;++z){ |
523 |
|
524 |
off_t zoffset = z*TILE_X*TILE_Y*NTILES; //NOT totalNumTiles; |
525 |
off_t hoffset = offsetTable[tileID].off; |
526 |
off_t skipdst = offsetTable[tileID].skip; |
527 |
|
528 |
//double *dst = outBuf + zoffset + hoffset; |
529 |
void *dst; |
530 |
if (pickup) |
531 |
dst = outBuf + zoffset + hoffset; |
532 |
else |
533 |
dst = (float*)outBuf + zoffset + hoffset; |
534 |
|
535 |
off_t zoff = z*(TILE_X+2*XGHOSTS)*(TILE_Y+2*YGHOSTS); |
536 |
off_t hoff = YGHOSTS*(TILE_X+2*XGHOSTS) + YGHOSTS; |
537 |
double *src = (double*)data + zoff + hoff; |
538 |
|
539 |
off_t skipsrc = TILE_X+2*XGHOSTS; |
540 |
|
541 |
//fprintf(stderr,"rank %d tile %d offset %d skip %d dst %x src %x\n",intracommRank,tileID,hoffset,skipdst,dst,src); |
542 |
|
543 |
long long n = TILE_X; |
544 |
|
545 |
int y; |
546 |
if (pickup) |
547 |
for (y=0;y<TILE_Y;++y) |
548 |
memcpy((double*)dst + y * skipdst, src + y * skipsrc, TILE_X*datumSize); |
549 |
else |
550 |
for (y=0;y<TILE_Y;++y) |
551 |
memcpy_r8_2_r4((float*)dst + y * skipdst, src + y * skipsrc, &n); |
552 |
} |
553 |
|
554 |
return; |
555 |
} |
556 |
|
557 |
|
558 |
|
559 |
// Allocate some buffers to receive tile-data from the compute ranks |
560 |
void |
561 |
allocateTileBufs(int numTileBufs, int maxIntracommSize) |
562 |
{ |
563 |
|
564 |
ASSERT(tileOneZLevelSizeInBytes>0); // be sure we have rec'vd values by now |
565 |
|
566 |
int i, j; |
567 |
for (i = 0; i < numTileBufs; ++i) { |
568 |
|
569 |
bufHdr_t *newBuf = malloc(sizeof(bufHdr_t)); |
570 |
ASSERT(NULL != newBuf); |
571 |
|
572 |
newBuf->payload = malloc(tileOneZLevelSizeInBytes * NUM_Z); |
573 |
ASSERT(NULL != newBuf->payload); |
574 |
|
575 |
newBuf->requests = malloc(maxIntracommSize * sizeof(MPI_Request)); |
576 |
ASSERT(NULL != newBuf->requests); |
577 |
|
578 |
// Init some values |
579 |
newBuf->requestsArraySize = maxIntracommSize; |
580 |
for (j = 0; j < maxIntracommSize; ++j) { |
581 |
newBuf->requests[j] = MPI_REQUEST_NULL; |
582 |
} |
583 |
newBuf->ck0 = newBuf->ck1 = 0xdeadbeefdeadbeef; |
584 |
|
585 |
// Put the buf into the free list |
586 |
newBuf->state = bufState_Free; |
587 |
newBuf->next = freeTileBufs_ptr; |
588 |
freeTileBufs_ptr = newBuf; |
589 |
} |
590 |
} |
591 |
|
592 |
|
593 |
|
594 |
bufHdr_t * |
595 |
getFreeBuf() |
596 |
{ |
597 |
int j; |
598 |
bufHdr_t *rtnValue = freeTileBufs_ptr; |
599 |
|
600 |
if (NULL != rtnValue) { |
601 |
ASSERT(bufState_Free == rtnValue->state); |
602 |
freeTileBufs_ptr = rtnValue->next; |
603 |
rtnValue->next = NULL; |
604 |
|
605 |
// Paranoia. This should already be the case |
606 |
for (j = 0; j < rtnValue->requestsArraySize; ++j) { |
607 |
rtnValue->requests[j] = MPI_REQUEST_NULL; |
608 |
} |
609 |
} |
610 |
return rtnValue; |
611 |
} |
612 |
|
613 |
|
614 |
///////////////////////////////////////////////////////////////// |
615 |
|
616 |
|
617 |
bufHdr_t * |
618 |
tryToReceiveDataTile( |
619 |
MPI_Comm dataIntercomm, |
620 |
int epochID, |
621 |
size_t expectedMsgSize, |
622 |
int *tileID) |
623 |
{ |
624 |
bufHdr_t *bufHdr; |
625 |
int pending, i, count; |
626 |
MPI_Status mpiStatus; |
627 |
|
628 |
MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, dataIntercomm, |
629 |
&pending, &mpiStatus); |
630 |
|
631 |
// If no data are pending, or if we can't get a buffer to |
632 |
// put the pending data into, return NULL |
633 |
if (!pending) return NULL; |
634 |
if (((bufHdr = getFreeBuf()) == NULL)) { |
635 |
FPRINTF(stderr,"tile %d(%d) pending, but no buf to recv it\n", |
636 |
mpiStatus.MPI_TAG, mpiStatus.MPI_TAG >> numCheckBits); |
637 |
return NULL; |
638 |
} |
639 |
|
640 |
// Do sanity checks on the pending message |
641 |
MPI_Get_count(&mpiStatus, MPI_BYTE, &count); |
642 |
ASSERT(count == expectedMsgSize); |
643 |
ASSERT((mpiStatus.MPI_TAG & bitMask) == (epochID & bitMask)); |
644 |
|
645 |
    // Receive the data we saw in the iprobe
646 |
MPI_Recv(bufHdr->payload, expectedMsgSize, MPI_BYTE, |
647 |
mpiStatus.MPI_SOURCE, mpiStatus.MPI_TAG, |
648 |
dataIntercomm, &mpiStatus); |
649 |
bufHdr->state = bufState_InUse; |
650 |
|
651 |
// Overrun check |
652 |
ASSERT(bufHdr->ck0==0xdeadbeefdeadbeef); |
653 |
ASSERT(bufHdr->ck0==bufHdr->ck1); |
654 |
|
655 |
// Return values |
656 |
*tileID = mpiStatus.MPI_TAG >> numCheckBits; |
657 |
|
658 |
|
659 |
FPRINTF(stderr, "recv tile %d(%d) from compute rank %d\n", |
660 |
mpiStatus.MPI_TAG, *tileID, mpiStatus.MPI_SOURCE); |
661 |
|
662 |
return bufHdr; |
663 |
} |
664 |
|
665 |
|
666 |
|
667 |
int |
668 |
tryToReceiveZSlab( |
669 |
void *buf, |
670 |
int expectedMsgSize, |
671 |
MPI_Comm intracomm) |
672 |
{ |
673 |
MPI_Status mpiStatus; |
674 |
int pending, count; |
675 |
|
676 |
MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, intracomm, &pending, &mpiStatus); |
677 |
if (!pending) return -1; |
678 |
|
679 |
MPI_Get_count(&mpiStatus, MPI_BYTE, &count); |
680 |
ASSERT(count == expectedMsgSize); |
681 |
|
682 |
MPI_Recv(buf, count, MPI_BYTE, mpiStatus.MPI_SOURCE, |
683 |
mpiStatus.MPI_TAG, intracomm, &mpiStatus); |
684 |
|
685 |
FPRINTF(stderr, "recv slab %d from rank %d\n", |
686 |
mpiStatus.MPI_TAG, mpiStatus.MPI_SOURCE); |
687 |
|
688 |
// return the tileID |
689 |
return mpiStatus.MPI_TAG; |
690 |
} |
691 |
|
692 |
|
693 |
|
694 |
void |
695 |
redistributeZSlabs( |
696 |
bufHdr_t *bufHdr, |
697 |
int tileID, |
698 |
int zSlabsPer, |
699 |
int thisFieldNumZ, |
700 |
MPI_Comm intracomm) |
701 |
{ |
702 |
int pieceSize = zSlabsPer * tileOneZLevelSizeInBytes; |
703 |
int tileSizeInBytes = tileOneZLevelSizeInBytes * thisFieldNumZ; |
704 |
int offset = 0; |
705 |
int recvRank = 0; |
706 |
|
707 |
while ((offset + pieceSize) <= tileSizeInBytes) { |
708 |
|
709 |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
710 |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
711 |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
712 |
|
713 |
offset += pieceSize; |
714 |
recvRank += 1; |
715 |
} |
716 |
|
717 |
// There might be one last odd-sized piece |
718 |
if (offset < tileSizeInBytes) { |
719 |
pieceSize = tileSizeInBytes - offset; |
720 |
ASSERT(pieceSize % tileOneZLevelSizeInBytes == 0); |
721 |
|
722 |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
723 |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
724 |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
725 |
|
726 |
offset += pieceSize; |
727 |
recvRank += 1; |
728 |
} |
729 |
|
730 |
// Sanity check |
731 |
ASSERT(recvRank <= bufHdr->requestsArraySize); |
732 |
while (recvRank < bufHdr->requestsArraySize) { |
733 |
ASSERT(MPI_REQUEST_NULL == bufHdr->requests[recvRank++]) |
734 |
} |
735 |
} |
736 |
|
737 |
|
738 |
|
739 |
///////////////////////////////////////////////////////////////// |
740 |
// This is called by the i/o ranks |
741 |
void |
742 |
doNewEpoch(int epochID, int epochStyleIndex, int gcmIter) |
743 |
{ |
744 |
// In an i/o epoch, the i/o ranks are partitioned into groups, |
745 |
// each group dealing with one field. The ranks within a group: |
746 |
// (1) receive data tiles from the compute ranks |
747 |
// (2) slice the received data tiles into slabs in the 'z' |
748 |
// dimension, and redistribute the tile-slabs among the group. |
749 |
// (3) receive redistributed tile-slabs, and reconstruct a complete |
750 |
// field-slab for the whole field. |
751 |
// (4) Write the completed field-slab to disk. |
752 |
|
753 |
fieldInfoThisEpoch_t *fieldInfo; |
754 |
int intracommRank, intracommSize; |
755 |
|
756 |
int zSlabsPer; |
757 |
int myNumZSlabs, myFirstZSlab, myLastZSlab, myNumSlabPiecesToRecv; |
758 |
|
759 |
int numTilesRecvd, numSlabPiecesRecvd; |
760 |
|
761 |
|
762 |
// Find the descriptor for my assigned field for this epoch style. |
763 |
// It is the one whose dataIntercomm is not null |
764 |
fieldInfo = epochStyles[epochStyleIndex]; |
765 |
while (MPI_COMM_NULL == fieldInfo->dataIntercomm) ++fieldInfo; |
766 |
ASSERT('\0' != fieldInfo->dataFieldID); |
767 |
|
768 |
|
769 |
|
770 |
MPI_Comm_rank(fieldInfo->ioRanksIntracomm, &intracommRank); |
771 |
MPI_Comm_size(fieldInfo->ioRanksIntracomm, &intracommSize); |
772 |
|
773 |
|
774 |
// Compute which z slab(s) we will be reassembling. |
775 |
zSlabsPer = divCeil(fieldInfo->zDepth, intracommSize); |
776 |
myNumZSlabs = zSlabsPer; |
777 |
|
778 |
// Adjust myNumZSlabs in case it didn't divide evenly |
779 |
myFirstZSlab = intracommRank * myNumZSlabs; |
780 |
if (myFirstZSlab >= fieldInfo->zDepth) { |
781 |
myNumZSlabs = 0; |
782 |
} else if ((myFirstZSlab + myNumZSlabs) > fieldInfo->zDepth) { |
783 |
myNumZSlabs = fieldInfo->zDepth - myFirstZSlab; |
784 |
} else { |
785 |
myNumZSlabs = zSlabsPer; |
786 |
} |
787 |
myLastZSlab = myFirstZSlab + myNumZSlabs - 1; |
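    // Example: with zDepth == 90 and 16 i/o ranks in this group,
    // zSlabsPer == divCeil(90,16) == 6, so ranks 0..14 each reassemble
    // six z-levels (rank 14 covers levels 84..89) while rank 15 gets none
    // (its first slab, 15*6 == 90, is already past the end of the field).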
788 |
|
789 |
// If we were not assigned any z-slabs, we don't get any redistributed |
790 |
// tile-slabs. If we were assigned one or more slabs, we get |
791 |
// redistributed tile-slabs for every tile. |
792 |
myNumSlabPiecesToRecv = (0 == myNumZSlabs) ? 0 : totalNumTiles; |
793 |
|
794 |
|
795 |
numTilesRecvd = 0; |
796 |
numSlabPiecesRecvd = 0; |
797 |
|
798 |
//////////////////////////////////////////////////////////////////// |
799 |
// Main loop. Handle tiles from the current epoch |
800 |
for(;;) { |
801 |
bufHdr_t *bufHdr; |
802 |
|
803 |
//////////////////////////////////////////////////////////////// |
804 |
// Check for tiles from the computational tasks |
805 |
while (numTilesRecvd < fieldInfo->tileCount) { |
806 |
int tileID = -1; |
807 |
size_t msgSize = tileOneZLevelSizeInBytes * fieldInfo->zDepth; |
808 |
bufHdr = tryToReceiveDataTile(fieldInfo->dataIntercomm, epochID, |
809 |
msgSize, &tileID); |
810 |
if (NULL == bufHdr) break; // No tile was received |
811 |
|
812 |
numTilesRecvd += 1; |
813 |
redistributeZSlabs(bufHdr, tileID, zSlabsPer, |
814 |
fieldInfo->zDepth, fieldInfo->ioRanksIntracomm); |
815 |
|
816 |
// Add the bufHdr to the "in use" list |
817 |
bufHdr->next = inUseTileBufs_ptr; |
818 |
inUseTileBufs_ptr = bufHdr; |
819 |
|
820 |
|
821 |
} |
822 |
|
823 |
|
824 |
//////////////////////////////////////////////////////////////// |
825 |
// Check for tile-slabs redistributed from the i/o processes |
826 |
while (numSlabPiecesRecvd < myNumSlabPiecesToRecv) { |
827 |
int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs; |
828 |
char data[msgSize]; |
829 |
|
830 |
int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm); |
831 |
if (tileID < 0) break; // No slab was received |
832 |
|
833 |
|
834 |
numSlabPiecesRecvd += 1; |
835 |
processSlabSection(fieldInfo, tileID, data, myNumZSlabs); |
836 |
|
837 |
|
838 |
// Can do the write here, or at the end of the epoch. |
839 |
// Probably want to make it asynchronous (waiting for |
840 |
// completion at the barrier at the start of the next epoch). |
841 |
//if (numSlabPiecesRecvd >= myNumSlabPiecesToRecv) { |
842 |
// ASSERT(numSlabPiecesRecvd == myNumSlabPiecesToRecv); |
843 |
// do_write(io_epoch,whichField,myFirstZSlab,myNumZSlabs); |
844 |
//} |
845 |
} |
846 |
|
847 |
|
848 |
//////////////////////////////////////////////////////////////// |
849 |
// Check if we can release any of the inUse buffers (i.e. the |
850 |
// isends we used to redistribute those z-slabs have completed). |
851 |
// We do this by detaching the inUse list, then examining each |
852 |
// element in the list, putting each element either into the |
853 |
// free list or back into the inUse list, as appropriate. |
854 |
bufHdr = inUseTileBufs_ptr; |
855 |
inUseTileBufs_ptr = NULL; |
856 |
while (NULL != bufHdr) { |
857 |
int count = 0; |
858 |
int completions[intracommSize]; |
859 |
MPI_Status statuses[intracommSize]; |
860 |
|
861 |
// Acquire the "next" bufHdr now, before we change anything. |
862 |
bufHdr_t *nextBufHeader = bufHdr->next; |
863 |
|
864 |
// Check to see if the Isend requests have completed for this buf |
865 |
MPI_Testsome(intracommSize, bufHdr->requests, |
866 |
&count, completions, statuses); |
867 |
|
868 |
if (MPI_UNDEFINED == count) { |
869 |
// A return of UNDEFINED means that none of the requests |
870 |
// is still outstanding, i.e. they have all completed. |
871 |
// Put the buf on the free list. |
872 |
bufHdr->state = bufState_Free; |
873 |
bufHdr->next = freeTileBufs_ptr; |
874 |
freeTileBufs_ptr = bufHdr; |
875 |
FPRINTF(stderr,"Rank %d freed a tile buffer\n", intracommRank); |
876 |
} else { |
877 |
// At least one request is still outstanding. |
878 |
// Put the buf on the inUse list. |
879 |
bufHdr->next = inUseTileBufs_ptr; |
880 |
inUseTileBufs_ptr = bufHdr; |
881 |
} |
882 |
|
883 |
        // Continue with the next buffer
884 |
bufHdr = nextBufHeader; |
885 |
} |
886 |
|
887 |
|
888 |
//////////////////////////////////////////////////////////////// |
889 |
// Check if the epoch is complete |
890 |
if ((numTilesRecvd >= fieldInfo->tileCount) && |
891 |
(numSlabPiecesRecvd >= myNumSlabPiecesToRecv) && |
892 |
(NULL == inUseTileBufs_ptr)) |
893 |
{ |
894 |
ASSERT(numTilesRecvd == fieldInfo->tileCount); |
895 |
ASSERT(numSlabPiecesRecvd == myNumSlabPiecesToRecv); |
896 |
|
897 |
//fprintf(stderr,"rank %d %d %d %d\n",intracommRank,numTilesRecvd,numSlabPiecesRecvd,myNumZSlabs); |
898 |
|
899 |
// Ok, wrap up the current i/o epoch |
900 |
// Probably want to make this asynchronous (waiting for |
901 |
// completion at the start of the next epoch). |
902 |
do_write(epochID, fieldInfo, myFirstZSlab, myNumZSlabs, gcmIter); |
903 |
|
904 |
// The epoch is complete |
905 |
return; |
906 |
} |
907 |
} |
908 |
|
909 |
} |
910 |
|
911 |
|
912 |
|
913 |
|
914 |
|
915 |
|
916 |
|
917 |
// Main routine for the i/o tasks. Callers DO NOT RETURN from |
918 |
// this routine; they MPI_Finalize and exit. |
919 |
void |
920 |
ioRankMain(int totalNumTiles)   // N.B. this parameter shadows the global of the same name
921 |
{ |
922 |
// This is one of a group of i/o processes that will be receiving tiles |
923 |
// from the computational processes. This same process is also |
924 |
// responsible for compositing slices of tiles together into one or |
925 |
// more complete slabs, and then writing those slabs out to disk. |
926 |
// |
927 |
// When a tile is received, it is cut up into slices, and each slice is |
928 |
// sent to the appropriate compositing task that is dealing with those |
929 |
// "z" level(s) (possibly including ourselves). These are tagged with |
930 |
// the tile-id. When we *receive* such a message (from ourselves, or |
931 |
// from any other process in this group), we unpack the data (stripping |
932 |
// the ghost cells), and put them into the slab we are assembling. Once |
933 |
// a slab is fully assembled, it is written to disk. |
934 |
|
935 |
int i, size, count, numTileBufs; |
936 |
MPI_Status status; |
937 |
int currentEpochID = 0; |
938 |
int maxTileCount = 0, max3DTileCount = 0, maxIntracommSize = 0; |
939 |
|
940 |
int numComputeRanks, epochStyleIndex; |
941 |
MPI_Comm_remote_size(globalIntercomm, &numComputeRanks); |
942 |
|
943 |
|
944 |
//////////////////////////////////////////////////////////// |
945 |
    //// get global tile info via bcast over the global intercomm
946 |
|
947 |
|
948 |
// int ierr = MPI_Bcast(&NTILES,1,MPI_INT,0,globalIntercomm); |
949 |
|
950 |
int data[9]; |
951 |
|
952 |
int ierr = MPI_Bcast(data,9,MPI_INT,0,globalIntercomm); |
953 |
|
954 |
NTILES = data[3]; |
955 |
TILE_X = data[5]; |
956 |
TILE_Y = data[6]; |
957 |
XGHOSTS = data[7]; |
958 |
YGHOSTS = data[8]; |
959 |
|
960 |
tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS)); |
961 |
tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize); // also calculated in compute ranks, urghh |
962 |
|
963 |
int *xoff = malloc(NTILES*sizeof(int)); |
964 |
ASSERT(xoff); |
965 |
int *yoff = malloc(NTILES*sizeof(int)); |
966 |
ASSERT(yoff); |
967 |
int *xskip = malloc(NTILES*sizeof(int)); |
968 |
ASSERT(xskip); |
969 |
|
970 |
offsetTable = malloc((NTILES+1)*sizeof(tile_layout_t)); |
971 |
ASSERT(offsetTable); |
972 |
|
973 |
ierr = MPI_Bcast(xoff,NTILES,MPI_INT,0,globalIntercomm); |
974 |
ierr = MPI_Bcast(yoff,NTILES,MPI_INT,0,globalIntercomm); |
975 |
ierr = MPI_Bcast(xskip,NTILES,MPI_INT,0,globalIntercomm); |
976 |
|
977 |
for (i=0;i<NTILES;++i){ |
978 |
offsetTable[i+1].off = NUM_X*(yoff[i]-1) + xoff[i] - 1; |
979 |
offsetTable[i+1].skip = xskip[i]; |
980 |
} |
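    // offsetTable[t] (indexed by tileID, 1-based) records where tile t lands
    // in the output slab: .off = NUM_X*(yoff-1) + xoff-1 converts the 1-based
    // Fortran corner (xoff,yoff) to a 0-based linear index (a tile at
    // xoff == 1, yoff == 1 gets off == 0), and .skip is the destination row
    // stride later used by processSlabSection.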
981 |
|
982 |
free(xoff); |
983 |
free(yoff); |
984 |
free(xskip); |
985 |
|
986 |
/////////////////////////////////////////////////////////////// |
987 |
|
988 |
|
989 |
// At this point, the multitude of various inter-communicators have |
990 |
// been created and stored in the various epoch "style" arrays. |
991 |
// The next step is to have the computational tasks "register" the |
992 |
// data tiles they will send. In this version of the code, we only |
993 |
// track and store *counts*, not the actual tileIDs. |
994 |
for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) { |
995 |
fieldInfoThisEpoch_t *p = NULL; |
996 |
fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex]; |
997 |
int thisIntracommSize; |
998 |
|
999 |
// Find the field we were assigned for this epoch style |
1000 |
int curFieldIndex = -1; |
1001 |
while ('\0' != thisEpochStyle[++curFieldIndex].dataFieldID) { |
1002 |
p = &(thisEpochStyle[curFieldIndex]); |
1003 |
if (MPI_COMM_NULL != p->registrationIntercomm) break; |
1004 |
} |
1005 |
ASSERT(NULL != p); |
1006 |
ASSERT('\0' != p->dataFieldID); |
1007 |
MPI_Comm_size(p->ioRanksIntracomm, &size); |
1008 |
if (size > maxIntracommSize) maxIntracommSize = size; |
1009 |
|
1010 |
|
1011 |
// Receive a message from *each* computational rank telling us |
1012 |
// a count of how many tiles that rank will send during epochs |
1013 |
// of this style. Note that some of these counts may be zero |
1014 |
for (i = 0; i < numComputeRanks; ++i) { |
1015 |
MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, MPI_ANY_TAG, |
1016 |
p->registrationIntercomm, &status); |
1017 |
p->tileCount += status.MPI_TAG; |
1018 |
} |
1019 |
if (p->tileCount > maxTileCount) maxTileCount = p->tileCount; |
1020 |
if (p->zDepth > 1) { |
1021 |
if (p->tileCount > max3DTileCount) max3DTileCount = p->tileCount; |
1022 |
} |
1023 |
|
1024 |
// Sanity check |
1025 |
MPI_Allreduce (&(p->tileCount), &count, 1, |
1026 |
MPI_INT, MPI_SUM, p->ioRanksIntracomm); |
1027 |
ASSERT(count == totalNumTiles); |
1028 |
} |
1029 |
|
1030 |
|
1031 |
    // In some as-yet-undetermined fashion, we decide how many
1032 |
// buffers to allocate to hold tiles received from the computational |
1033 |
// tasks. The number of such buffers is more-or-less arbitrary (the |
1034 |
// code should be able to function with any number greater than zero). |
1035 |
// More buffers means more parallelism, but uses more space. |
1036 |
// Note that maxTileCount is an upper bound on the number of buffers |
1037 |
// that we can usefully use. Note also that if we are assigned a 2D |
1038 |
    // field, we might be assigned to receive a very large number of tiles
1039 |
// for that field in that epoch style. |
1040 |
|
1041 |
// Hack - for now, just pick a value for numTileBufs |
1042 |
numTileBufs = (max3DTileCount > 0) ? max3DTileCount : maxTileCount; |
1043 |
if (numTileBufs < 4) numTileBufs = 4; |
1044 |
if (numTileBufs > 15) numTileBufs = 15; |
1045 |
// if (numTileBufs > 25) numTileBufs = 25; |
1046 |
|
1047 |
allocateTileBufs(numTileBufs, maxIntracommSize); |
1048 |
countBufs(numTileBufs); |
1049 |
|
1050 |
|
1051 |
//////////////////////////////////////////////////////////////////// |
1052 |
// Main loop. |
1053 |
//////////////////////////////////////////////////////////////////// |
1054 |
|
1055 |
for (;;) { |
1056 |
int cmd[4]; |
1057 |
|
1058 |
// Make sure all the i/o ranks have completed processing the prior |
1059 |
// cmd (i.e. the prior i/o epoch is complete). |
1060 |
MPI_Barrier(ioIntracomm); |
1061 |
|
1062 |
if (0 == ioIntracommRank) { |
1063 |
fprintf(stderr, "I/O ranks waiting for new epoch\n"); |
1064 |
MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm); |
1065 |
|
1066 |
MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE); |
1067 |
fprintf(stderr, "I/O ranks begining new epoch: %d, gcmIter = %d\n", cmd[1],cmd[3]); |
1068 |
|
1069 |
// before we start a new epoch, have i/o rank 0: |
1070 |
// determine output filenames for this epoch |
1071 |
// clean up any extant files with same names |
1072 |
// write .meta files |
1073 |
|
1074 |
if (cmd_exit != cmd[0]){ |
1075 |
|
1076 |
fprintf(stderr,"new epoch: epoch %d, style %d, gcmIter %d\n", cmd[1],cmd[2],cmd[3]); |
1077 |
|
1078 |
int epochStyle = cmd[2]; |
1079 |
int gcmIter = cmd[3]; |
1080 |
|
1081 |
fieldInfoThisEpoch_t *fieldInfo; |
1082 |
fieldInfo = epochStyles[epochStyle]; |
1083 |
char s[1024]; |
1084 |
int res; |
1085 |
FILE *fp; |
1086 |
|
1087 |
if (fieldInfo->pickup==0){ // for non-pickups, need to loop over individual fields |
1088 |
char f; |
1089 |
while (f = fieldInfo->dataFieldID){ |
1090 |
sprintf(s,fieldInfo->filenameTemplate,gcmIter,"data"); |
1091 |
fprintf(stderr,"%s\n",s); |
1092 |
res = unlink(s); |
1093 |
if (-1==res && ENOENT!=errno) fprintf(stderr,"unable to rm %s\n",s); |
1094 |
|
1095 |
// skip writing meta files for non-pickup fields |
1096 |
/* |
1097 |
sprintf(s,fieldInfo->filenameTemplate,gcmIter,"meta"); |
1098 |
fp = fopen(s,"w+"); |
1099 |
fclose(fp); |
1100 |
*/ |
1101 |
|
1102 |
++fieldInfo; |
1103 |
|
1104 |
} |
1105 |
} |
1106 |
|
1107 |
else { // single pickup or pickup_seaice file |
1108 |
|
1109 |
sprintf(s,fieldInfo->filenameTemplate,gcmIter,"data"); |
1110 |
fprintf(stderr,"%s\n",s); |
1111 |
res = unlink(s); |
1112 |
if (-1==res && ENOENT!=errno) fprintf(stderr,"unable to rm %s\n",s); |
1113 |
|
1114 |
sprintf(s,fieldInfo->filenameTemplate,gcmIter,"meta"); |
1115 |
fp = fopen(s,"w+"); |
1116 |
write_pickup_meta(fp, gcmIter, fieldInfo->pickup); |
1117 |
fclose(fp); |
1118 |
|
1119 |
} |
1120 |
} |
1121 |
} |
1122 |
MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm); |
1123 |
|
1124 |
switch (cmd[0]) { |
1125 |
|
1126 |
case cmd_exit: |
1127 |
// Don't bother trying to disconnect and free the |
1128 |
// plethora of communicators; just exit. |
1129 |
FPRINTF(stderr,"Received shutdown message\n"); |
1130 |
MPI_Finalize(); |
1131 |
exit(0); |
1132 |
break; |
1133 |
|
1134 |
case cmd_newEpoch: |
1135 |
if ((currentEpochID + 1) != cmd[1]) { |
1136 |
fprintf(stderr, "ERROR: Missing i/o epoch? was %d, " |
1137 |
"but is now %d ??\n", currentEpochID, cmd[1]); |
1138 |
sleep(1); // Give the message a chance to propagate |
1139 |
abort(); |
1140 |
} |
1141 |
currentEpochID = cmd[1]; |
1142 |
|
1143 |
memset(outBuf,0,outBufSize); // zero the outBuf, so dry tiles are well defined |
1144 |
|
1145 |
doNewEpoch(cmd[1], cmd[2], cmd[3]); |
1146 |
break; |
1147 |
|
1148 |
default: |
1149 |
fprintf(stderr, "Unexpected epoch command: %d %d %d %d\n", |
1150 |
cmd[0], cmd[1], cmd[2], cmd[3]); |
1151 |
sleep(1); |
1152 |
abort(); |
1153 |
break; |
1154 |
} |
1155 |
|
1156 |
} |
1157 |
|
1158 |
} |
1159 |
|
1160 |
|
1161 |
|
1162 |
/////////////////////////////////////////////////////////////////////////////////// |
1163 |
|
1164 |
int |
1165 |
findField(const char c) |
1166 |
{ |
1167 |
int i; |
1168 |
for (i = 0; i < numAllFields; ++i) { |
1169 |
if (c == fieldDepths[i].dataFieldID) return i; |
1170 |
} |
1171 |
|
1172 |
// Give the error message a chance to propagate before exiting. |
1173 |
fprintf(stderr, "ERROR: Field not found: '%c'\n", c); |
1174 |
sleep(1); |
1175 |
abort(); |
1176 |
} |
1177 |
|
1178 |
|
1179 |
|
1180 |
// Given a number of ranks and a set of fields we will want to output, |
1181 |
// figure out how to distribute the ranks among the fields. |
1182 |
void |
1183 |
computeIORankAssigments( |
1184 |
int numComputeRanks, |
1185 |
int numIORanks, |
1186 |
int numFields, |
1187 |
fieldInfoThisEpoch_t *fields, |
1188 |
int assignments[]) |
1189 |
{ |
1190 |
int i,j,k, sum, count; |
1191 |
|
1192 |
int numIONodes = numIORanks / numRanksPerNode; |
1193 |
int numIORanksThisField[numFields]; |
1194 |
long int bytesPerIORankThisField[numFields]; |
1195 |
int expectedMessagesPerRankThisField[numFields]; |
1196 |
long int fieldSizes[numFields]; |
1197 |
|
1198 |
struct ioNodeInfo_t { |
1199 |
int expectedNumMessagesThisNode; |
1200 |
int numCoresAssigned; |
1201 |
int *assignedFieldThisCore; |
1202 |
} ioNodeInfo[numIONodes]; |
1203 |
|
1204 |
// Since numRanksPerNode might be dynamically determined, we have |
1205 |
// to dynamically allocate the assignedFieldThisCore arrays. |
1206 |
for (i = 0; i < numIONodes; ++i) { |
1207 |
ioNodeInfo[i].assignedFieldThisCore = malloc(sizeof(int)*numRanksPerNode); |
1208 |
ASSERT(NULL != ioNodeInfo[i].assignedFieldThisCore); |
1209 |
} |
1210 |
|
1211 |
ASSERT((numIONodes*numRanksPerNode) == numIORanks); |
1212 |
|
1213 |
|
1214 |
////////////////////////////////////////////////////////////// |
1215 |
// Compute the size for each field in this epoch style |
1216 |
for (i = 0; i < numFields; ++i) { |
1217 |
ASSERT((1 == fields[i].zDepth) || (NUM_Z == fields[i].zDepth) || (MULTDIM == fields[i].zDepth)); |
1218 |
fieldSizes[i] = twoDFieldSizeInBytes * fields[i].zDepth; |
1219 |
} |
1220 |
|
1221 |
|
1222 |
///////////////////////////////////////////////////////// |
1223 |
// Distribute the available i/o ranks among the fields, |
1224 |
// proportionally based on field size. |
1225 |
|
1226 |
// First, assign one rank per field |
1227 |
ASSERT(numIORanks >= numFields); |
1228 |
for (i = 0; i < numFields; ++i) { |
1229 |
numIORanksThisField[i] = 1; |
1230 |
bytesPerIORankThisField[i] = fieldSizes[i]; |
1231 |
} |
1232 |
|
1233 |
// Now apportion any extra ranks |
1234 |
for (; i < numIORanks; ++i) { |
1235 |
|
1236 |
// Find the field 'k' with the most bytesPerIORank |
1237 |
k = 0; |
1238 |
for (j = 1; j < numFields; ++j) { |
1239 |
if (bytesPerIORankThisField[j] > bytesPerIORankThisField[k]) { |
1240 |
k = j; |
1241 |
} |
1242 |
} |
1243 |
|
1244 |
// Assign an i/o rank to that field |
1245 |
numIORanksThisField[k] += 1; |
1246 |
bytesPerIORankThisField[k] = fieldSizes[k] / numIORanksThisField[k]; |
1247 |
} |
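    // Example with hypothetical numbers: three fields of 90, 90 and 1
    // z-levels and five i/o ranks.  After the one-rank-per-field pass the
    // per-rank loads are 90, 90, 1 (in 2-D-field units); the two spare ranks
    // then go to whichever field has the largest remaining bytes-per-rank,
    // giving final counts of 2, 2 and 1 ranks (loads 45, 45, 1).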
1248 |
|
1249 |
//////////////////////////////////////////////////////////// |
1250 |
// At this point, all the i/o ranks should be apportioned |
1251 |
// among the fields. Check we didn't screw up the count. |
1252 |
for (sum = 0, i = 0; i < numFields; ++i) { |
1253 |
sum += numIORanksThisField[i]; |
1254 |
} |
1255 |
ASSERT(sum == numIORanks); |
1256 |
|
1257 |
|
1258 |
|
1259 |
////////////////////////////////////////////////////////////////// |
1260 |
// The *number* of i/o ranks assigned to a field is based on the |
1261 |
// field size. But the *placement* of those ranks is based on |
1262 |
// the expected number of messages the process will receive. |
1263 |
// [In particular, if we have several fields each with only one |
1264 |
// assigned i/o rank, we do not want to place them all on the |
1265 |
// same node if we don't have to.] This traffic-load balance |
1266 |
// strategy is an approximation at best since the messages may |
1267 |
// not all be the same size (e.g. 2D fields vs. 3D fields), so |
1268 |
// "number of messages" is not the same thing as "traffic". |
1269 |
// But it should suffice. |
1270 |
|
1271 |
// Init a couple of things |
1272 |
for (i = 0; i < numFields; ++i) { |
1273 |
expectedMessagesPerRankThisField[i] = |
1274 |
numComputeRanks / numIORanksThisField[i]; |
1275 |
} |
1276 |
for (i = 0; i < numIONodes; ++i) { |
1277 |
ioNodeInfo[i].expectedNumMessagesThisNode = 0; |
1278 |
ioNodeInfo[i].numCoresAssigned = 0; |
1279 |
for (j = 0; j < numRanksPerNode; ++j) { |
1280 |
ioNodeInfo[i].assignedFieldThisCore[j] = -1; |
1281 |
} |
1282 |
} |
1283 |
|
1284 |
/////////////////////////////////////////////////////////////////// |
1285 |
// Select the i/o node with the smallest expectedNumMessages, and |
1286 |
// assign it a rank from the field with the highest remaining |
1287 |
// expectedMessagesPerRank. Repeat until everything is assigned. |
1288 |
// (Yes, this could be a lot faster, but isn't worth the trouble.) |
1289 |
|
1290 |
for (count = 0; count < numIORanks; ++count) { |
1291 |
|
1292 |
// Make an initial choice of node |
1293 |
for (i = 0; i < numIONodes; ++i) { |
1294 |
if (ioNodeInfo[i].numCoresAssigned < numRanksPerNode) break; |
1295 |
} |
1296 |
j = i; |
1297 |
ASSERT(j < numIONodes); |
1298 |
|
1299 |
// Search for a better choice |
1300 |
for (++i; i < numIONodes; ++i) { |
1301 |
if (ioNodeInfo[i].numCoresAssigned >= numRanksPerNode) continue; |
1302 |
if (ioNodeInfo[i].expectedNumMessagesThisNode < |
1303 |
ioNodeInfo[j].expectedNumMessagesThisNode) |
1304 |
{ |
1305 |
j = i; |
1306 |
} |
1307 |
} |
1308 |
|
1309 |
|
1310 |
// Make an initial choice of field |
1311 |
for (i = 0; i < numFields; ++i) { |
1312 |
if (numIORanksThisField[i] > 0) break; |
1313 |
} |
1314 |
k = i; |
1315 |
ASSERT(k < numFields); |
1316 |
|
1317 |
// Search for a better choice |
1318 |
for (++i; i < numFields; ++i) { |
1319 |
if (numIORanksThisField[i] <= 0) continue; |
1320 |
if (expectedMessagesPerRankThisField[i] > |
1321 |
expectedMessagesPerRankThisField[k]) |
1322 |
{ |
1323 |
k = i; |
1324 |
} |
1325 |
} |
1326 |
|
1327 |
// Put one rank from the chosen field onto the chosen node |
1328 |
ioNodeInfo[j].expectedNumMessagesThisNode += expectedMessagesPerRankThisField[k]; |
1329 |
ioNodeInfo[j].assignedFieldThisCore[ioNodeInfo[j].numCoresAssigned] = k; |
1330 |
ioNodeInfo[j].numCoresAssigned += 1; |
1331 |
numIORanksThisField[k] -= 1; |
1332 |
} |
1333 |
|
1334 |
// Sanity check - all ranks were assigned to a core |
1335 |
for (i = 1; i < numFields; ++i) { |
1336 |
ASSERT(0 == numIORanksThisField[i]); |
1337 |
} |
1338 |
// Sanity check - all cores were assigned a rank |
1339 |
for (i = 1; i < numIONodes; ++i) { |
1340 |
ASSERT(numRanksPerNode == ioNodeInfo[i].numCoresAssigned); |
1341 |
} |
1342 |
|
1343 |
|
1344 |
///////////////////////////////////// |
1345 |
// Return the computed assignments |
1346 |
for (i = 0; i < numIONodes; ++i) { |
1347 |
for (j = 0; j < numRanksPerNode; ++j) { |
1348 |
assignments[i*numRanksPerNode + j] = |
1349 |
ioNodeInfo[i].assignedFieldThisCore[j]; |
1350 |
} |
1351 |
} |
1352 |
|
1353 |
// Clean up |
1354 |
for (i = 0; i < numIONodes; ++i) { |
1355 |
free(ioNodeInfo[i].assignedFieldThisCore); |
1356 |
} |
1357 |
|
1358 |
} |
1359 |
|
1360 |
////////////////////////////////////////////////////////////////////////////////// |
1361 |
|
1362 |
|
1363 |
|
1364 |
|
1365 |
int |
1366 |
isIORank(int commRank, int totalNumNodes, int numIONodes) |
1367 |
{ |
1368 |
// Figure out if this rank is on a node that will do i/o. |
1369 |
// Note that the i/o nodes are distributed throughout the |
1370 |
// task, not clustered together. |
1371 |
int ioNodeStride = totalNumNodes / numIONodes; |
1372 |
int thisRankNode = commRank / numRanksPerNode; |
1373 |
int n = thisRankNode / ioNodeStride; |
1374 |
return (((n * ioNodeStride) == thisRankNode) && (n < numIONodes)) ? 1 : 0; |
1375 |
} |
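// Example: 12 nodes in the job and 3 i/o nodes gives ioNodeStride == 4,
// so the ranks on nodes 0, 4 and 8 do i/o and all other ranks compute;
// i.e. the i/o nodes are spread through the job rather than clustered.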
1376 |
|
1377 |
|
1378 |
// Find the number of *physical cores* on the current node |
1379 |
// (ignore hyperthreading). This should work for pretty much |
1380 |
// any Linux based system (and fail horribly for anything else). |
1381 |
int |
1382 |
getNumCores(void) |
1383 |
{ |
1384 |
return 20; // until we figure out why this cratered |
1385 |
|
1386 |
char *magic = "cat /proc/cpuinfo | egrep \"core id|physical id\" | " |
1387 |
"tr -d \"\\n\" | sed s/physical/\\\\nphysical/g | " |
1388 |
"grep -v ^$ | sort -u | wc -l"; |
1389 |
|
1390 |
FILE *fp = popen(magic,"r"); |
1391 |
ASSERT(fp); |
1392 |
|
1393 |
int rtnValue = -1; |
1394 |
|
1395 |
int res = fscanf(fp,"%d",&rtnValue); |
1396 |
|
1397 |
ASSERT(1==res); |
1398 |
|
1399 |
pclose(fp); |
1400 |
|
1401 |
ASSERT(rtnValue > 0); |
1402 |
return rtnValue; |
1403 |
} |
1404 |
|
1405 |
|
1406 |
|
1407 |
//////////////////////////////////////////////////////////////////////// |
1408 |
//////////////////////////////////////////////////////////////////////// |
1409 |
// Routines called by the mitGCM code |
1410 |
|
1411 |
|
1412 |
//////////////////////////////////////// |
1413 |
// Various "one time" initializations |
1414 |
void |
1415 |
f1( |
1416 |
MPI_Comm parentComm, |
1417 |
int numComputeRanks, |
1418 |
int numTiles, |
1419 |
MPI_Comm *rtnComputeComm) |
1420 |
{ |
1421 |
MPI_Comm newIntracomm, newIntercomm, dupParentComm; |
1422 |
int newIntracommRank, thisIsIORank; |
1423 |
int parentSize, parentRank; |
1424 |
int numIONodes, numIORanks; |
1425 |
int mpiIsInitialized, *intPtr, tagUBexists; |
1426 |
int i, j, nF, compRoot, fieldIndex, epochStyleIndex; |
1427 |
int totalNumNodes, numComputeNodes, newIntracommSize; |
1428 |
|
1429 |
// Init globals |
1430 |
totalNumTiles = numTiles; |
1431 |
if (numRanksPerNode <= 0) { |
1432 |
// Might also want to check for an env var (or something) |
1433 |
numRanksPerNode = getNumCores(); |
1434 |
} |
1435 |
|
1436 |
// Fill in the zDepth field of the fieldInfoThisEpoch_t descriptors |
1437 |
for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) { |
1438 |
fieldInfoThisEpoch_t *f, *thisEpochStyle = epochStyles[epochStyleIndex]; |
1439 |
|
1440 |
int curFieldIndex = -1; |
1441 |
while ((f = &(thisEpochStyle[++curFieldIndex]))->dataFieldID != '\0') { |
1442 |
i = findField(f->dataFieldID); |
1443 |
if (-1 != f->zDepth) { |
1444 |
if (f->zDepth != fieldDepths[i].numZ) { |
1445 |
fprintf(stderr, "Inconsistent z-depth for field '%c': " |
1446 |
"fieldDepths[%d] and epoch style %d\n", |
1447 |
f->dataFieldID, i, epochStyleIndex); |
1448 |
} |
1449 |
} |
1450 |
f->zDepth = fieldDepths[i].numZ; |
1451 |
} |
1452 |
} |
1453 |
|
1454 |
|
1455 |
// Find the max MPI "tag" value |
1456 |
MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, &intPtr, &tagUBexists); |
1457 |
ASSERT(tagUBexists); |
1458 |
maxTagValue = *intPtr; |
1459 |
|
1460 |
|
1461 |
// Info about the parent communicator |
1462 |
MPI_Initialized(&mpiIsInitialized); |
1463 |
ASSERT(mpiIsInitialized); |
1464 |
MPI_Comm_size(parentComm, &parentSize); |
1465 |
MPI_Comm_rank(parentComm, &parentRank); |
1466 |
|
1467 |
|
1468 |
// Figure out how many nodes we can use for i/o. |
1469 |
// To make things (e.g. memory calculations) simpler, we want |
1470 |
// a node to have either all compute ranks, or all i/o ranks. |
1471 |
// If numComputeRanks does not evenly divide numRanksPerNode, we have |
1472 |
// to round up in favor of the compute side. |
1473 |
|
1474 |
totalNumNodes = divCeil(parentSize, numRanksPerNode); |
1475 |
numComputeNodes = divCeil(numComputeRanks, numRanksPerNode); |
1476 |
numIONodes = (parentSize - numComputeRanks) / numRanksPerNode; |
1477 |
ASSERT(numIONodes > 0); |
1478 |
ASSERT(numIONodes <= (totalNumNodes - numComputeNodes)); |
1479 |
numIORanks = numIONodes * numRanksPerNode; |
1480 |
|
1481 |
|
1482 |
    // It is surprisingly easy to launch the job with the wrong number
    // of ranks, particularly if the number of compute ranks is not a
    // multiple of numRanksPerNode (the tendency is to just launch one
    // rank per core for all available cores).  So we introduce a third
    // option for a rank: besides being an i/o rank or a compute rank,
    // it might instead be an "excess" rank, in which case it just
    // calls MPI_Finalize and exits.

    typedef enum { isCompute, isIO, isExcess } rankType;
    rankType thisRanksType;

    if (isIORank(parentRank, totalNumNodes, numIONodes)) {
        thisRanksType = isIO;
    } else if (parentRank >= numComputeRanks + numIORanks) {
        thisRanksType = isExcess;
    } else {
        thisRanksType = isCompute;
    }

    // Split the parent communicator into parts
    MPI_Comm_split(parentComm, thisRanksType, parentRank, &newIntracomm);
    MPI_Comm_rank(newIntracomm, &newIntracommRank);

    // Excess ranks disappear.
    // N.B. "parentSize" still includes these ranks.
    if (isExcess == thisRanksType) {
        MPI_Finalize();
        exit(0);
    }

    // Sanity check
    MPI_Comm_size(newIntracomm, &newIntracommSize);
    if (isIO == thisRanksType) {
        ASSERT(newIntracommSize == numIORanks);
    } else {
        ASSERT(newIntracommSize == numComputeRanks);
    }

    // Create a special intercomm from the i/o and compute parts
    if (isIO == thisRanksType) {
        // Set globals
        ioIntracomm = newIntracomm;
        MPI_Comm_rank(ioIntracomm, &ioIntracommRank);

        // Find the lowest compute rank
        for (i = 0; i < parentSize; ++i) {
            if (!isIORank(i, totalNumNodes, numIONodes)) break;
        }
    } else {  // isCompute
        // Set globals
        computeIntracomm = newIntracomm;
        MPI_Comm_rank(computeIntracomm, &computeIntracommRank);

        // Find the lowest i/o rank
        for (i = 0; i < parentSize; ++i) {
            if (isIORank(i, totalNumNodes, numIONodes)) break;
        }
    }
    // "i" is now the lowest rank on the *other* side; use it as the
    // remote leader when creating the global intercomm.
    MPI_Intercomm_create(newIntracomm, 0, parentComm, i, 0, &globalIntercomm);


    ///////////////////////////////////////////////////////////////////
    // For each different i/o epoch style, split the i/o ranks among
    // the fields, and create an inter-communicator for each split.

    for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) {
        fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex];
        int fieldAssignments[numIORanks];
        int rankAssignments[numComputeRanks + numIORanks];

        // Count the number of fields in this epoch style
        for (nF = 0; thisEpochStyle[nF].dataFieldID != '\0'; ++nF) ;

        // Decide how to apportion the i/o ranks among the fields
        for (i = 0; i < numIORanks; ++i) fieldAssignments[i] = -1;
        computeIORankAssigments(numComputeRanks, numIORanks, nF,
                                thisEpochStyle, fieldAssignments);
        // Sanity check
        for (i = 0; i < numIORanks; ++i) {
            ASSERT((fieldAssignments[i] >= 0) && (fieldAssignments[i] < nF));
        }

        // Embed the i/o rank assignments into an array holding
        // the assignments for *all* the ranks (i/o and compute).
        // Rank assignment of "-1" means "compute".
        j = 0;
        for (i = 0; i < numComputeRanks + numIORanks; ++i) {
            rankAssignments[i] = isIORank(i, totalNumNodes, numIONodes) ?
                                     fieldAssignments[j++] : -1;
        }
        // Sanity check.  Check the assignment for this rank.
        if (isIO == thisRanksType) {
            ASSERT(fieldAssignments[newIntracommRank] == rankAssignments[parentRank]);
        } else {
            ASSERT(-1 == rankAssignments[parentRank]);
        }
        ASSERT(j == numIORanks);

        if (0 == parentRank) {
            printf("\ncpu assignments, epoch style %d\n", epochStyleIndex);
            for (i = 0; i < numComputeNodes + numIONodes; ++i) {
                if (rankAssignments[i*numRanksPerNode] >= 0) {
                    // i/o node: print each rank's field assignment
                    for (j = 0; j < numRanksPerNode; ++j) {
                        printf(" %1d", rankAssignments[i*numRanksPerNode + j]);
                    }
                } else {
                    // compute node: print one "#", plus an "X" per excess rank
                    for (j = 0; j < numRanksPerNode; ++j) {
                        if ((i*numRanksPerNode + j) >= (numComputeRanks + numIORanks)) break;
                        ASSERT(-1 == rankAssignments[i*numRanksPerNode + j]);
                    }
                    printf(" #");
                    for (; j < numRanksPerNode; ++j) {  // "excess" ranks (if any)
                        printf("X");
                    }
                }
                printf(" ");
            }
            printf("\n\n");
        }

        // Find the lowest rank assigned to computation; use it as
        // the "root" for the upcoming intercomm creates.
        for (compRoot = 0; rankAssignments[compRoot] != -1; ++compRoot) ;
        // Sanity check
        if ((isCompute == thisRanksType) && (0 == newIntracommRank)) ASSERT(compRoot == parentRank);


        // If this is an i/o rank, split the newIntracomm (which, for
        // the i/o ranks, is a communicator among all the i/o ranks)
        // into the pieces as assigned.

        if (isIO == thisRanksType) {
            MPI_Comm fieldIntracomm;
            int myField = rankAssignments[parentRank];
            ASSERT(myField >= 0);

            MPI_Comm_split(newIntracomm, myField, parentRank, &fieldIntracomm);
            thisEpochStyle[myField].ioRanksIntracomm = fieldIntracomm;

            // Now create an inter-communicator between the computational
            // ranks and each of the newly split-off field communicators.
            for (i = 0; i < nF; ++i) {

                // Do one field at a time to avoid clashes on parentComm
                MPI_Barrier(newIntracomm);

                if (myField != i) continue;

                // Create the intercomm for this field for this epoch style
                MPI_Intercomm_create(fieldIntracomm, 0, parentComm,
                                     compRoot, i, &(thisEpochStyle[myField].dataIntercomm));

                // ... and dup a separate one for tile registrations
                MPI_Comm_dup(thisEpochStyle[myField].dataIntercomm,
                             &(thisEpochStyle[myField].registrationIntercomm));
            }
        }
        else {
            // This is a computational rank; create the intercommunicators
            // to the various split-off field communicators.

            for (fieldIndex = 0; fieldIndex < nF; ++fieldIndex) {

                // Find the remote "root" process for this field
                int fieldRoot = -1;
                while (rankAssignments[++fieldRoot] != fieldIndex) ;

                // Create the intercomm for this field for this epoch style
                MPI_Intercomm_create(newIntracomm, 0, parentComm, fieldRoot,
                                     fieldIndex, &(thisEpochStyle[fieldIndex].dataIntercomm));

                // ... and dup a separate one for tile registrations
                MPI_Comm_dup(thisEpochStyle[fieldIndex].dataIntercomm,
                             &(thisEpochStyle[fieldIndex].registrationIntercomm));
            }
        }

    } // epoch style loop

    // I/O processes split off and start receiving data.
    // NOTE: the i/o processes DO NOT RETURN from this call.
    if (isIO == thisRanksType) ioRankMain(totalNumTiles);


    // Only the "compute" processes get here.  Initialize the mpiio
    // sizes and types, then return with the new intra-communicator.
    *rtnComputeComm = newIntracomm;

    initSizesAndTypes();

    return;
}
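
// ---------------------------------------------------------------------
// Illustrative only: a minimal sketch of how a compute rank is expected
// to drive this interface (split communicators, register tiles, then one
// f3() send per tile per epoch, and finally shut the i/o side down).
// The real driver is the Fortran main code; the function and variable
// names below (exampleComputeRankDriver, exampleTileIDs, exampleTileBuf)
// are hypothetical and exist only for this sketch.
#if 0
// Forward declarations of the interfaces exercised below (defined later
// in this file).
void f2(int tileID);
void beginNewEpoch(int newEpochID, int gcmIter, int epochStyle);
void f3(char dataFieldID, int tileID, int epochID, void *data);
void f4(int epochID);

void
exampleComputeRankDriver(MPI_Comm parentComm, int numComputeRanks,
                         int totalNumTiles, int *exampleTileIDs,
                         int exampleNumTiles, void *exampleTileBuf)
{
    MPI_Comm computeComm;
    int t, epoch;

    // Split off the i/o ranks; only compute ranks return from f1()
    f1(parentComm, numComputeRanks, totalNumTiles, &computeComm);

    // Register a count of the tiles this rank will send; -1 ends registration
    for (t = 0; t < exampleNumTiles; ++t) f2(exampleTileIDs[t]);
    f2(-1);

    // Epoch ids must increase by exactly one each time
    for (epoch = 1; epoch <= 3; ++epoch) {
        beginNewEpoch(epoch, /* gcmIter */ epoch, /* epochStyle */ 0);
        for (t = 0; t < exampleNumTiles; ++t) {
            // 'U' is a stand-in field id; exampleTileBuf must hold one full tile
            f3('U', exampleTileIDs[t], epoch, exampleTileBuf);
        }
    }

    f4(3);   // tell the i/o ranks to finish up and exit
}
#endif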


// "Register" the tile-id(s) that this process will be sending |
1684 |
void |
1685 |
f2(int tileID) |
1686 |
{ |
1687 |
int i, epochStyleIndex; |
1688 |
|
1689 |
static int count = 0; |
1690 |
|
1691 |
// This code assumes that the same tileID will apply to all the |
1692 |
// fields. Note that we use the tileID as a tag, so we require it |
1693 |
// be an int with a legal tag value, but we do *NOT* actually use |
1694 |
// the tag *value* for anything (e.g. it is NOT used as an index |
1695 |
// into an array). It is treated as an opaque handle that is |
1696 |
// passed from the compute task(s) to the compositing routines. |
1697 |
// Those two end-points likely assign meaning to the tileID (i.e. |
1698 |
// use it to identify where the tile belongs within the domain), |
1699 |
// but that is their business. |
1700 |
// |
1701 |
// A tileID of -1 signifies the end of registration. |
1702 |
|
1703 |
ASSERT(computeIntracommRank >= 0); |
1704 |
|
1705 |
if (tileID >= 0) { |
1706 |
// In this version of the code, in addition to the tileID, we also |
1707 |
// multiplex in the low order bit(s) of the epoch number as an |
1708 |
// error check. So the tile value must be small enough to allow that. |
1709 |
ASSERT(((tileID<<numCheckBits) + ((1<<numCheckBits)-1)) < maxTagValue); |
1710 |
|
1711 |
// In this version of the code, we do not send the actual tileID's |
1712 |
// to the i/o processes during the registration procedure. We only |
1713 |
// send a count of the number of tiles that we will send. |
1714 |
++count; |
1715 |
return; |
1716 |
} |
1717 |
|
1718 |
|
1719 |
// We get here when we are called with a negative tileID, signifying |
1720 |
// the end of registration. We now need to figure out and inform |
1721 |
// *each* i/o process in *each* field just how many tiles we will be |
1722 |
// sending them in *each* epoch style. |
1723 |
|
1724 |
for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) { |
1725 |
fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex]; |
1726 |
|
1727 |
int curFieldIndex = -1; |
1728 |
while ('\0' != thisEpochStyle[++curFieldIndex].dataFieldID) { |
1729 |
fieldInfoThisEpoch_t *thisField = &(thisEpochStyle[curFieldIndex]); |
1730 |
int numRemoteRanks, *tileCounts, remainder; |
1731 |
MPI_Comm_remote_size(thisField->dataIntercomm, &numRemoteRanks); |
1732 |
|
1733 |
tileCounts = alloca(numRemoteRanks * sizeof(int)); |
1734 |
|
1735 |
memset(tileCounts,0,numRemoteRanks * sizeof(int)); |
1736 |
|
1737 |
// Distribute the tiles among the i/o ranks. |
1738 |
for (i = 0; i < numRemoteRanks; ++i) { |
1739 |
tileCounts[i] = count / numRemoteRanks; |
1740 |
} |
1741 |
// Deal with any remainder |
1742 |
remainder = count - ((count / numRemoteRanks) * numRemoteRanks); |
1743 |
for (i = 0; i < remainder; ++i) { |
1744 |
int target = (computeIntracommRank + i) % numRemoteRanks; |
1745 |
tileCounts[target] += 1; |
1746 |
} |
1747 |
|
1748 |
// Communicate these counts to the i/o processes for this |
1749 |
// field. Note that we send a message to each process, |
1750 |
// even if the count is zero. |
1751 |
for (i = 0; i < numRemoteRanks; ++i) { |
1752 |
MPI_Send(NULL, 0, MPI_BYTE, i, tileCounts[i], |
1753 |
thisField->registrationIntercomm); |
1754 |
} |
1755 |
|
1756 |
} // field loop |
1757 |
} // epoch-style loop |
1758 |
|
1759 |
} |
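
// As a concrete (hypothetical) illustration of the apportioning above:
// with count = 10 registered tiles, numRemoteRanks = 4 i/o ranks, and
// computeIntracommRank = 1, every i/o rank first gets 10/4 = 2 tiles and
// the remainder of 2 goes to i/o ranks (1+0)%4 = 1 and (1+1)%4 = 2, giving
// per-rank counts {2, 3, 3, 2}.  This matches the receiver chosen in f3()
// below, which also starts its round-robin at computeIntracommRank, so the
// counts registered here agree with the tiles actually sent.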



int currentEpochID = 0;
int currentEpochStyle = 0;

void
beginNewEpoch(int newEpochID, int gcmIter, int epochStyle)
{
    fieldInfoThisEpoch_t *p;

    if (newEpochID != (currentEpochID + 1)) {
        fprintf(stderr, "ERROR: Missing i/o epoch? was %d, is now %d ??\n",
                currentEpochID, newEpochID);
        sleep(1);  // Give the message a chance to propagate
        abort();
    }

    ////////////////////////////////////////////////////////////////////////
    // Need to be sure the i/o tasks are done processing the previous epoch
    // before any compute tasks start sending tiles from a new epoch.

    if (0 == computeIntracommRank) {
        int cmd[4] = { cmd_newEpoch, newEpochID, epochStyle, gcmIter };

        // Wait to get an ack that the i/o tasks are done with the prior epoch.
        MPI_Recv(NULL, 0, MPI_BYTE, 0, cmd_epochComplete,
                 globalIntercomm, MPI_STATUS_IGNORE);

        // Tell the i/o ranks about the new epoch.
        // (Just tell rank 0; it will bcast to the others.)
        MPI_Send(cmd, 4, MPI_INT, 0, 0, globalIntercomm);
    }

    // Compute ranks wait here until rank 0 gets the ack from the i/o ranks
    MPI_Barrier(computeIntracomm);


    currentEpochID = newEpochID;
    currentEpochStyle = epochStyle;

    // Reset the tileCount (used by f3)
    for (p = epochStyles[currentEpochStyle]; p->dataFieldID != '\0'; ++p) {
        p->tileCount = 0;
    }
}
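
// Minimal sketch (an assumption, not the actual code) of the matching
// i/o-side handshake: the i/o root acks completion of the previous epoch,
// picks up the next command, and forwards it to the other i/o ranks.  The
// real logic lives in ioRankMain() and may differ in detail.
#if 0
static void exampleIORootHandshake(void)
{
    int cmd[4];
    MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm);  // "prior epoch done"
    MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE); // next command
    MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm);  // forward to the other i/o ranks
}
#endif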


void
f3(char dataFieldID, int tileID, int epochID, void *data)
{
    int whichField, receiver, tag;

    static char priorDataFieldID = '\0';
    static int priorEpoch = -1;
    static int remoteCommSize;
    static fieldInfoThisEpoch_t *p;

    int flag = 0;

    // Check that this global has been set
    ASSERT(computeIntracommRank >= 0);


    // Figure out some info about this dataFieldID.  It is
    // likely to be another tile from the same field as the last time
    // we were called, in which case we can reuse the "static" values
    // retained from the prior call.  If not, we need to look it up.

    if ((dataFieldID != priorDataFieldID) || (epochID != priorEpoch)) {

        // It's not the same; we need to look it up.
        for (p = epochStyles[currentEpochStyle]; p->dataFieldID != '\0'; ++p) {
            if (p->dataFieldID == dataFieldID) break;
        }

        // Make sure we found a valid field
        ASSERT(p->dataIntercomm != MPI_COMM_NULL);

        MPI_Comm_remote_size(p->dataIntercomm, &remoteCommSize);

        flag = 1;
    }

    ASSERT(p->dataFieldID == dataFieldID);

    // Round-robin this rank's tiles across the i/o ranks for this field
    receiver = (computeIntracommRank + p->tileCount) % remoteCommSize;

    // Multiplex the low-order epoch bits into the tag as an error check
    tag = (tileID << numCheckBits) | (epochID & bitMask);

    //fprintf(stderr,"%d %d\n",flag,tileID);

    /*
    if (tileID == 189) {
        int i, j;
        for (i = 0; i < TILE_Y; ++i) {
            for (j = 0; j < TILE_X; ++j)
                fprintf(stderr, "%f ", ((double*)data)[872 + i*108 + j]);
            fprintf(stderr, "\n");
        }
    }
    */


    FPRINTF(stderr, "Rank %d sends field '%c', tile %d, to i/o task %d with tag %d(%d)\n",
            computeIntracommRank, dataFieldID, tileID, receiver, tag, tag >> numCheckBits);

    MPI_Send(data, tileOneZLevelSizeInBytes * p->zDepth,
             MPI_BYTE, receiver, tag, p->dataIntercomm);

    p->tileCount += 1;
    priorDataFieldID = dataFieldID;
    priorEpoch = epochID;
}
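
// Illustrative only: how a receiver can unpack the tag built in f3() above.
// The i/o-side code is elsewhere in this file and is assumed to follow the
// same convention; buf, maxTileBytes, dataIntercomm and expectedEpochID are
// placeholder names for this sketch.
#if 0
static void exampleUnpackTag(void *buf, int maxTileBytes,
                             MPI_Comm dataIntercomm, int expectedEpochID)
{
    MPI_Status status;
    MPI_Recv(buf, maxTileBytes, MPI_BYTE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             dataIntercomm, &status);
    int tileID     = status.MPI_TAG >> numCheckBits;   // the opaque tile handle
    int epochCheck = status.MPI_TAG &  bitMask;        // low bits of the epoch id
    ASSERT(epochCheck == (expectedEpochID & bitMask));
    FPRINTF(stderr, "received tile %d\n", tileID);
}
#endif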



void
f4(int epochID)
{
    int i;
    ASSERT(computeIntracommRank >= 0);

    if (0 == computeIntracommRank) {
        int cmd[3] = { cmd_exit, -1, -1 };

        // Recv the ack that the prior i/o epoch is complete
        MPI_Recv(NULL, 0, MPI_BYTE, 0, cmd_epochComplete,
                 globalIntercomm, MPI_STATUS_IGNORE);

        // Tell the i/o ranks to exit.
        // (Just tell rank 0; it will bcast to the others.)
        MPI_Send(cmd, 3, MPI_INT, 0, 0, globalIntercomm);
    }

}



void myasyncio_set_global_sizes_(int *nx, int *ny, int *nz,
                                 int *nt, int *nb,
                                 int *tx, int *ty,
                                 int *ox, int *oy)
{
    int data[] = { *nx, *ny, *nz, *nt, *nb, *tx, *ty, *ox, *oy };
    int items = sizeof(data) / sizeof(int);

    NTILES = *nt;  // total number of tiles

    TILE_X = *tx;
    TILE_Y = *ty;
    XGHOSTS = *ox;
    YGHOSTS = *oy;
    tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS));
    tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize);  // needed by compute ranks in f3

    int rank, ierr;
    MPI_Comm_rank(globalIntercomm, &rank);

    if (0 == rank)
        printf("%d %d %d\n%d %d\n%d %d\n%d %d\n", *nx, *ny, *nz, *nt, *nb, *tx, *ty, *ox, *oy);

    /*
    if (0 == rank)
        ierr = MPI_Bcast(&NTILES, 1, MPI_INT, MPI_ROOT, globalIntercomm);
    else
        ierr = MPI_Bcast(&NTILES, 1, MPI_INT, MPI_PROC_NULL, globalIntercomm);
    */

    // Broadcast the sizes across the intercomm to the i/o ranks: the local
    // root passes MPI_ROOT, the other local ranks pass MPI_PROC_NULL.
    if (0 == rank)
        ierr = MPI_Bcast(data, items, MPI_INT, MPI_ROOT, globalIntercomm);
    else
        ierr = MPI_Bcast(data, items, MPI_INT, MPI_PROC_NULL, globalIntercomm);
}
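
// The MPI_ROOT / MPI_PROC_NULL pattern above is the standard way to broadcast
// in one direction across an inter-communicator: rank 0 of the compute group
// acts as the root, the remaining compute ranks contribute nothing, and every
// rank in the remote (i/o) group receives by naming the remote root's rank.
// A matching receive on the i/o side would look roughly like this (sketch
// only; the actual receiving code lives with the i/o ranks):
#if 0
static void exampleReceiveGlobalSizes(void)
{
    int data[9];
    MPI_Bcast(data, 9, MPI_INT, /* root's rank in the remote group */ 0, globalIntercomm);
}
#endif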

void asyncio_tile_arrays_(int *xoff, int *yoff, int *xskip)
{
    int rank, ierr;
    MPI_Comm_rank(globalIntercomm, &rank);

    // Broadcast the per-tile offset arrays to the i/o ranks, one array at
    // a time, using the same intercomm root convention as above.
    if (0 == rank)
        ierr = MPI_Bcast(xoff, NTILES, MPI_INT, MPI_ROOT, globalIntercomm);
    else
        ierr = MPI_Bcast(xoff, NTILES, MPI_INT, MPI_PROC_NULL, globalIntercomm);

    if (0 == rank)
        ierr = MPI_Bcast(yoff, NTILES, MPI_INT, MPI_ROOT, globalIntercomm);
    else
        ierr = MPI_Bcast(yoff, NTILES, MPI_INT, MPI_PROC_NULL, globalIntercomm);

    if (0 == rank)
        ierr = MPI_Bcast(xskip, NTILES, MPI_INT, MPI_ROOT, globalIntercomm);
    else
        ierr = MPI_Bcast(xskip, NTILES, MPI_INT, MPI_PROC_NULL, globalIntercomm);
}




//////////////////////////////////////////////////////////////////////
// Fortran interface

void
bron_f1(
    MPI_Fint *ptr_parentComm,
    int *ptr_numComputeRanks,
    int *ptr_totalNumTiles,
    MPI_Fint *rtnComputeComm)
{
    // Convert the Fortran handle into a C handle
    MPI_Comm newComm, parentComm = MPI_Comm_f2c(*ptr_parentComm);

    f1(parentComm, *ptr_numComputeRanks, *ptr_totalNumTiles, &newComm);

    // Convert the C handle into a Fortran handle
    *rtnComputeComm = MPI_Comm_c2f(newComm);
}



void
bron_f2(int *ptr_tileID)
{
    f2(*ptr_tileID);
}


void
beginnewepoch_(int *newEpochID, int *gcmIter, int *epochStyle)
{
    beginNewEpoch(*newEpochID, *gcmIter, *epochStyle);
}


void
bron_f3(char *ptr_dataFieldID, int *ptr_tileID, int *ptr_epochID, void *data)
{
    f3(*ptr_dataFieldID, *ptr_tileID, *ptr_epochID, data);
}



void
bron_f4(int *ptr_epochID)
{
    f4(*ptr_epochID);
}