1 |
|
|
2 |
// Code to do the m-on-n fan-in, recompositing, and output, of data |
// Code to do the m-on-n fan-in, recompositing, and output, of data |
3 |
// tiles for mitGCM. |
// tiles for MITgcm. |
4 |
// |
// |
5 |
// There are aspects of this code that would be simpler in C++, but |
// There are aspects of this code that would be simpler in C++, but |
6 |
// we deliberately wrote the code in ansi-C to make linking it with |
// we deliberately wrote the code in ansi-C to make linking it with |
9 |
|
|
10 |
|
|
11 |
#include "PACKAGES_CONFIG.h" |
#include "PACKAGES_CONFIG.h" |
12 |
|
#include "SIZE.h" |
13 |
|
|
14 |
#include <stdio.h> |
#include <stdio.h> |
15 |
#include <unistd.h> |
#include <unistd.h> |
18 |
#include <sys/types.h> |
#include <sys/types.h> |
19 |
#include <sys/stat.h> |
#include <sys/stat.h> |
20 |
#include <fcntl.h> |
#include <fcntl.h> |
21 |
|
#include <assert.h> |
22 |
|
|
23 |
#include <mpi.h> |
#include <mpi.h> |
24 |
#include <alloca.h> |
#include <alloca.h> |
25 |
|
|
26 |
|
|
27 |
#include <stdint.h> |
#include <stdint.h> |
28 |
|
#include <limits.h> |
29 |
#include <errno.h> |
#include <errno.h> |
30 |
|
|
31 |
#define DEBUG 1 |
#define DEBUG 1 |
32 |
|
|
33 |
#if (DEBUG >= 2) |
#if (DEBUG >= 3) |
34 |
#define FPRINTF fprintf |
#define FPRINTF fprintf |
35 |
#else |
#else |
36 |
#include <stdarg.h> |
#include <stdarg.h> |
38 |
#endif |
#endif |
39 |
|
|
40 |
|
|
41 |
|
#if (DEBUG >= 2) |
42 |
// Define our own version of "assert", where we sleep rather than abort. |
// Define our own version of "assert", where we sleep rather than abort. |
43 |
// This makes it easier to attach the debugger. |
// This makes it easier to attach the debugger. |
|
#if (DEBUG >= 1) |
|
44 |
#define ASSERT(_expr) \ |
#define ASSERT(_expr) \ |
45 |
if (!(_expr)) { \ |
if (!(_expr)) { \ |
46 |
fprintf(stderr, "ASSERT failed for pid %d : `%s': %s: %d: %s\n", \ |
fprintf(stderr, "ASSERT failed for pid %d : `%s': %s: %d: %s\n", \ |
48 |
sleep(9999); \ |
sleep(9999); \ |
49 |
} |
} |
50 |
#else |
#else |
51 |
|
// Use the standard version of "assert" |
52 |
#define ASSERT assert |
#define ASSERT assert |
53 |
#endif |
#endif |
54 |
|
|
55 |
|
|
56 |
|
// The hostname that each rank is running on. Currently, only rank 0 |
57 |
|
// uses this. It is allocated and deallocated in routine "f1" |
58 |
|
char (*allHostnames)[HOST_NAME_MAX+1] = NULL; |
59 |
|
|
60 |
|
|
61 |
// If numRanksPerNode is set to be > 0, we just use that setting. |
// If numRanksPerNode is set to be > 0, we just use that setting. |
62 |
// If it is <= 0, we dynamically determine the number of cores on |
// If it is <= 0, we dynamically determine the number of cores on |
63 |
// a node, and use that. (N.B.: we assume *all* nodes being used in |
// a node, and use that. (N.B.: we assume *all* nodes being used in |
64 |
// the run have the *same* number of cores.) |
// the run have the *same* number of MPI processes.)
65 |
int numRanksPerNode = 0; |
int numRanksPerNode = 0; |
66 |
|
|
67 |
// Just an error check; can be zero (if you're confident it's all correct). |
// Just an error check; can be zero (if you're confident it's all correct). |
71 |
|
|
72 |
|
|
73 |
/////////////////////////////////////////////////////////////////////// |
/////////////////////////////////////////////////////////////////////// |
74 |
// Info about the data fields |
// Fundamental info about the data fields. Most (but not all) of |
75 |
|
// this is copied or derived from the values in "SIZE.h" |
76 |
|
|
77 |
// double for now. Might also be float someday |
// double for now. Might also be float someday |
78 |
#define datumType double |
#define datumType double |
81 |
|
|
82 |
// Info about the data fields. We assume that all the fields have the |
// Info about the data fields. We assume that all the fields have the |
83 |
// same X and Y characteristics, but may have either 1 or NUM_Z levels. |
// same X and Y characteristics, but may have either 1 or NUM_Z levels. |
84 |
// |
// bcn: or MULTDIM levels (evidently used by "sea ice"). |
85 |
// the following 3 values are required here. Could be sent over or read in |
|
86 |
// with some rearrangement in init routines |
// N.B. Please leave the seemingly unneeded "(long int)" casts in |
87 |
// |
// place. These individual values may not need them, but they get |
88 |
#define NUM_X 90 |
// used in calculations where the extra length might be important. |
89 |
#define NUM_Y 1170L // get rid of this someday |
#define NUM_X ((long int) sFacet) |
90 |
#define NUM_Z 50 |
#define NUM_Y (((long int) sFacet) * 13) |
91 |
#define MULTDIM 7 |
#define NUM_Z ((long int) Nr) |
92 |
|
#define MULTDIM ((long int) 7) |
93 |
|
|
94 |
|
// Some values derived from the above constants |
95 |
#define twoDFieldSizeInBytes (NUM_X * NUM_Y * 1 * datumSize) |
#define twoDFieldSizeInBytes (NUM_X * NUM_Y * 1 * datumSize) |
96 |
#define threeDFieldSizeInBytes (twoDFieldSizeInBytes * NUM_Z) |
#define threeDFieldSizeInBytes (twoDFieldSizeInBytes * NUM_Z) |
97 |
#define multDFieldSizeInBytes (twoDFieldSizeInBytes * MULTDIM) |
#define multDFieldSizeInBytes (twoDFieldSizeInBytes * MULTDIM) |
100 |
// size (no odd-sized last piece), they all have the same X and Y |
// size (no odd-sized last piece), they all have the same X and Y |
101 |
// characteristics (including ghosting), and are full depth in Z |
// characteristics (including ghosting), and are full depth in Z |
102 |
// (either 1, or NUM_Z, or MULTDIM, as appropriate). |
// (either 1, or NUM_Z, or MULTDIM, as appropriate). |
103 |
// |
#define TILE_X sNx |
104 |
// all these data now sent over from compute ranks |
#define TILE_Y sNy |
105 |
// |
#define XGHOSTS OLx |
106 |
int TILE_X = -1; |
#define YGHOSTS OLy |
107 |
int TILE_Y = -1; |
#define TOTAL_NUM_TILES ((sFacet / sNx) * (sFacet / sNy) * 13) |
108 |
int XGHOSTS = -1; |
#define NUM_WET_TILES (nPx * nPy) |
109 |
int YGHOSTS = -1; |
|
110 |
|
// Size of one Z level of an individual tile. This is the "in memory" |
111 |
// Size of one Z level of a tile (NOT one Z level of a whole field) |
// size, including ghosting. |
112 |
int tileOneZLevelItemCount = -1; |
#define tileOneZLevelItemCount (((long)(TILE_X + 2*XGHOSTS)) * (TILE_Y + 2*YGHOSTS)) |
113 |
int tileOneZLevelSizeInBytes = -1; |
#define tileOneZLevelSizeInBytes (tileOneZLevelItemCount * datumSize) |
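// A worked example of the tile-size arithmetic, with purely illustrative
// numbers (the real tile dimensions and ghost widths come from the model
// configuration, and datumSize is assumed here to be sizeof(double), i.e. 8):
//   with TILE_X = TILE_Y = 30 and XGHOSTS = YGHOSTS = 8,
//   tileOneZLevelItemCount   = (30 + 2*8) * (30 + 2*8) = 46 * 46 = 2116
//   tileOneZLevelSizeInBytes = 2116 * 8 = 16928 bytes per Z level of a tile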
114 |
|
|
115 |
|
|
116 |
|
|
117 |
|
////////////////////////////////////////////////////////////////////// |
118 |
|
// Define the depth (i.e. number of Z-levels) of the various fields. |
119 |
|
|
120 |
typedef struct dataFieldDepth { |
typedef struct dataFieldDepth { |
121 |
char dataFieldID; |
char dataFieldID; |
122 |
int numZ; |
int numZ; |
154 |
#define numAllFields (sizeof(fieldDepths)/sizeof(dataFieldDepth_t)) |
#define numAllFields (sizeof(fieldDepths)/sizeof(dataFieldDepth_t)) |
155 |
|
|
156 |
|
|
|
|
|
157 |
/////////////////////////////////////////////////////////////////////// |
/////////////////////////////////////////////////////////////////////// |
158 |
// Info about the various kinds of i/o epochs |
// Info about the various kinds of i/o epochs |
159 |
typedef struct epochFieldInfo { |
typedef struct epochFieldInfo { |
160 |
char dataFieldID; |
char dataFieldID; |
161 |
MPI_Comm registrationIntercomm; |
MPI_Comm registrationIntercomm; |
162 |
MPI_Comm dataIntercomm; |
MPI_Comm dataIntercomm; |
163 |
MPI_Comm ioRanksIntracomm; |
MPI_Comm ioRanksIntracomm; // i/o ranks for *this* field |
164 |
int tileCount; |
long int tileCount; |
165 |
int zDepth; // duplicates the fieldDepth entry; filled in automatically |
int zDepth; // duplicates the fieldDepth entry; filled in automatically |
166 |
|
int *chunkWriters; // Which rank writes the i'th chunk of z-levels |
167 |
|
int nChunks; // length of chunkWriters array |
168 |
char filenameTemplate[128]; |
char filenameTemplate[128]; |
169 |
long int offset; |
long int offset; |
170 |
int pickup; |
int pickup; |
171 |
} fieldInfoThisEpoch_t; |
} fieldInfoThisEpoch_t; |
172 |
|
|
173 |
// The normal i/o dump |
// The normal i/o dump - style 0 |
174 |
fieldInfoThisEpoch_t fieldsForEpochStyle_0[] = { |
fieldInfoThisEpoch_t fieldsForEpochStyle_0[] = { |
175 |
{ 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "U.%010d.%s", 0, 0 }, |
{ 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "U.%010d.%s", 0, 0 }, |
176 |
{ 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "V.%010d.%s", 0, 0 }, |
{ 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "V.%010d.%s", 0, 0 }, |
177 |
{ 'W', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "W.%010d.%s", 0, 0 }, |
{ 'W', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "W.%010d.%s", 0, 0 }, |
178 |
{ 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "Salt.%010d.%s", 0, 0 }, |
{ 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Salt.%010d.%s", 0, 0 }, |
179 |
{ 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "Theta.%010d.%s", 0,0 }, |
{ 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Theta.%010d.%s", 0,0 }, |
180 |
{ 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "Eta.%010d.%s", 0,0 }, |
{ 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Eta.%010d.%s", 0,0 }, |
181 |
|
|
182 |
//DM { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIarea.%010d.%s", 0,0 }, |
//DM { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIarea.%010d.%s", 0,0 }, |
183 |
//DM { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIheff.%010d.%s", 0,0 }, |
//DM { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIheff.%010d.%s", 0,0 }, |
184 |
//DM { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIhsnow.%010d.%s", 0,0 }, |
//DM { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIhsnow.%010d.%s", 0,0 }, |
185 |
//DM { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIuice.%010d.%s", 0,0 }, |
//DM { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIuice.%010d.%s", 0,0 }, |
186 |
//DM { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIvice.%010d.%s", 0,0 }, |
//DM { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIvice.%010d.%s", 0,0 }, |
187 |
//DM { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "SIhsalt.%010d.%s", 0,0 }, |
//DM { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIhsalt.%010d.%s", 0,0 }, |
188 |
|
|
189 |
//{ 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "EtaHnm1.%010d.%s", 0,0 }, |
//{ 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "EtaHnm1.%010d.%s", 0,0 }, |
190 |
{ 'I', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceTAUX.%010d.%s", 0,0 }, |
{ 'I', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceTAUX.%010d.%s", 0,0 }, |
191 |
{ 'J', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceTAUY.%010d.%s", 0,0 }, |
{ 'J', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceTAUY.%010d.%s", 0,0 }, |
192 |
{ 'K', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "KPPhbl.%010d.%s", 0,0 }, |
{ 'K', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "KPPhbl.%010d.%s", 0,0 }, |
193 |
{ 'L', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceSflux.%010d.%s", 0,0 }, |
{ 'L', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceSflux.%010d.%s", 0,0 }, |
194 |
|
|
195 |
{ 'M', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceFWflx.%010d.%s", 0,0 }, |
{ 'M', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceFWflx.%010d.%s", 0,0 }, |
196 |
{ 'O', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceQnet.%010d.%s", 0,0 }, |
{ 'O', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceQnet.%010d.%s", 0,0 }, |
197 |
{ 'P', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "PhiBot.%010d.%s", 0,0 }, |
{ 'P', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "PhiBot.%010d.%s", 0,0 }, |
198 |
{ 'Q', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "oceQsw.%010d.%s", 0,0 }, |
{ 'Q', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceQsw.%010d.%s", 0,0 }, |
199 |
//{ 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "dEtaHdt.%010d.%s", 0,0 }, |
//{ 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "dEtaHdt.%010d.%s", 0,0 }, |
200 |
|
|
201 |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0,0 }, |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0,0 }, |
202 |
}; |
}; |
203 |
|
|
204 |
|
|
205 |
// pickup file |
// pickup file dump - style 1 |
206 |
|
// NOTE: if this changes, write_pickup_meta will also need to be changed |
207 |
fieldInfoThisEpoch_t fieldsForEpochStyle_1[] = { |
fieldInfoThisEpoch_t fieldsForEpochStyle_1[] = { |
208 |
{ 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 1}, |
{ 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 1}, |
209 |
{ 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 1}, |
{ 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 1}, |
210 |
{ 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 2 + twoDFieldSizeInBytes * 0, 1}, |
{ 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 2 + twoDFieldSizeInBytes * 0, 1}, |
211 |
{ 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 3 + twoDFieldSizeInBytes * 0, 1}, |
{ 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 3 + twoDFieldSizeInBytes * 0, 1}, |
212 |
{ 'X', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 4 + twoDFieldSizeInBytes * 0, 1}, |
{ 'X', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 4 + twoDFieldSizeInBytes * 0, 1}, |
213 |
{ 'Y', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 5 + twoDFieldSizeInBytes * 0, 1}, |
{ 'Y', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 5 + twoDFieldSizeInBytes * 0, 1}, |
214 |
{ 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 0, 1}, |
{ 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 0, 1}, |
215 |
{ 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 1, 1}, |
{ 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 1, 1}, |
216 |
{ 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 2, 1}, |
{ 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 2, 1}, |
217 |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0 ,1}, |
{'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0 ,1}, |
218 |
}; |
}; |
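// To make the offsets above concrete: they correspond to a pickup file
// laid out as six NUM_Z-deep fields (U, V, T, S, X, Y) back to back,
// followed by three single-level fields (N, R, H).  So, for example,
// field 'R' begins at 6*threeDFieldSizeInBytes + 1*twoDFieldSizeInBytes
// from the start of the file, which is exactly its table entry.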
219 |
|
|
220 |
|
|
221 |
// seaice pickup |
// seaice pickup dump - style 2 |
222 |
|
// NOTE: if this changes, write_pickup_meta will also need to be changed |
223 |
//DMfieldInfoThisEpoch_t fieldsForEpochStyle_2[] = { |
//DMfieldInfoThisEpoch_t fieldsForEpochStyle_2[] = { |
224 |
//DM { 'A', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 2}, |
//DM { 'A', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 2}, |
225 |
//DM { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 2}, |
//DM { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 2}, |
226 |
//DM { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 1, 2}, |
//DM { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 1, 2}, |
227 |
//DM { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 2, 2}, |
//DM { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 2, 2}, |
228 |
//DM { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 3, 2}, |
//DM { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 3, 2}, |
229 |
//DM { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 4, 2}, |
//DM { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 4, 2}, |
230 |
//DM { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 5, 2}, |
//DM { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 5, 2}, |
231 |
//DM {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "", 0 }, |
//DM {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0 }, |
232 |
//DM}; |
//DM}; |
233 |
|
|
234 |
|
|
263 |
#define roundUp(_x,_y) (divCeil((_x),(_y)) * (_y)) |
#define roundUp(_x,_y) (divCeil((_x),(_y)) * (_y)) |
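// For example, assuming divCeil(_x,_y) is the usual integer-ceiling
// macro, i.e. (((_x)+(_y)-1)/(_y)):
//   divCeil(50, 8) == 7    and    roundUp(50, 8) == 56
// i.e. splitting 50 Z levels across 8 ranks yields chunks of 7 levels.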
264 |
|
|
265 |
|
|
|
|
|
266 |
typedef enum { |
typedef enum { |
267 |
bufState_illegal, |
bufState_illegal, |
268 |
bufState_Free, |
bufState_Free, |
275 |
bufState_t state; |
bufState_t state; |
276 |
int requestsArraySize; |
int requestsArraySize; |
277 |
MPI_Request *requests; |
MPI_Request *requests; |
278 |
uint64_t ck0; |
char *payload; |
279 |
char *payload; // [tileOneZLevelSizeInBytes * NUM_Z]; // Max payload size |
} bufHdr_t; |
|
uint64_t ck1; // now rec'vd from compute ranks & dynamically allocated in |
|
|
} bufHdr_t; // allocateTileBufs |
|
280 |
|
|
281 |
|
|
282 |
bufHdr_t *freeTileBufs_ptr = NULL; |
bufHdr_t *freeTileBufs_ptr = NULL; |
283 |
bufHdr_t *inUseTileBufs_ptr = NULL; |
bufHdr_t *inUseTileBufs_ptr = NULL; |
284 |
|
|
285 |
int maxTagValue = -1; |
int maxTagValue = -1; |
286 |
int totalNumTiles = -1; |
|
287 |
|
|
288 |
// routine to convert double to float during memcpy |
// routine to convert double to float during memcpy |
289 |
// need to get byteswapping in here as well |
// need to get byteswapping in here as well |
333 |
} |
} |
334 |
} |
} |
335 |
|
|
336 |
|
|
337 |
// Debug routine |
// Debug routine |
338 |
countBufs(int nbufs) |
countBufs(int nbufs) |
339 |
{ |
{ |
355 |
if (nInUse + nFree != target) { |
if (nInUse + nFree != target) { |
356 |
fprintf(stderr, "Rank %d: bad number of buffs: free %d, inUse %d, should be %d\n", |
fprintf(stderr, "Rank %d: bad number of buffs: free %d, inUse %d, should be %d\n", |
357 |
ioIntracommRank, nFree, nInUse, target); |
ioIntracommRank, nFree, nInUse, target); |
|
sleep(5000); |
|
358 |
} |
} |
359 |
|
ASSERT ((nInUse + nFree) == target); |
360 |
} |
} |
361 |
|
|
362 |
int readn(int fd, void *p, int nbytes) |
|
363 |
|
long int readn(int fd, void *p, long int nbytes) |
364 |
{ |
{ |
365 |
|
|
366 |
char *ptr = (char*)(p); |
char *ptr = (char*)(p); |
367 |
|
|
368 |
int nleft, nread; |
long int nleft, nread; |
369 |
|
|
370 |
nleft = nbytes; |
nleft = nbytes; |
371 |
while (nleft > 0){ |
while (nleft > 0){ |
430 |
|
|
431 |
fprintf(fp," nDims = [ %3d ];\n",ndims); |
fprintf(fp," nDims = [ %3d ];\n",ndims); |
432 |
fprintf(fp," dimList = [\n"); |
fprintf(fp," dimList = [\n"); |
433 |
fprintf(fp," %10u,%10d,%10u,\n",NUM_X,1,NUM_X); |
fprintf(fp," %10lu,%10d,%10lu,\n",NUM_X,1,NUM_X); |
434 |
fprintf(fp," %10ld,%10d,%10ld\n",NUM_Y,1,NUM_Y); |
fprintf(fp," %10ld,%10d,%10ld\n",NUM_Y,1,NUM_Y); |
435 |
fprintf(fp," ];\n"); |
fprintf(fp," ];\n"); |
436 |
fprintf(fp," dataprec = [ 'float64' ];\n"); |
fprintf(fp," dataprec = [ 'float64' ];\n"); |
446 |
|
|
447 |
|
|
448 |
|
|
449 |
double *outBuf=NULL;//[NUM_X*NUM_Y*NUM_Z]; // only needs to be myNumZSlabs |
double *outBuf=NULL;//was [NUM_X*NUM_Y*NUM_Z], but only needs to be myNumZSlabs |
450 |
size_t outBufSize=0; |
size_t outBufSize=0; |
451 |
|
|
452 |
|
|
453 |
void |
void |
454 |
do_write(int io_epoch, fieldInfoThisEpoch_t *whichField, int firstZ, int numZ, int gcmIter) |
do_write(int io_epoch, fieldInfoThisEpoch_t *whichField, int myFirstZ, int myNumZ, int gcmIter) |
455 |
{ |
{ |
456 |
if (0==numZ) return; // this is *not* global NUM_Z : change name of parameter to avoid grief! |
if (0==myNumZ) return; |
457 |
|
|
458 |
int pickup = whichField->pickup; |
int pickup = whichField->pickup; |
459 |
|
|
461 |
// swap here, if necessary |
// swap here, if necessary |
462 |
// int i,j; |
// int i,j; |
463 |
|
|
464 |
//i = NUM_X*NUM_Y*numZ; |
//i = NUM_X*NUM_Y*myNumZ; |
465 |
//mds_byteswapr8_(&i,outBuf); |
//mds_byteswapr8_(&i,outBuf); |
466 |
|
|
467 |
// mds_byteswapr8 expects an integer count, which is gonna overflow someday |
// mds_byteswapr8 expects an integer count, which is gonna overflow someday |
468 |
// can't redefine to long without affecting a bunch of other calls |
// can't redefine to long without affecting a bunch of other calls |
469 |
// so do a slab at a time here, to delay the inevitable |
// so do a slab at a time here, to delay the inevitable |
470 |
// i = NUM_X*NUM_Y; |
// i = NUM_X*NUM_Y; |
471 |
//for (j=0;j<numZ;++j) |
//for (j=0;j<myNumZ;++j) |
472 |
// mds_byteswapr8_(&i,&outBuf[i*j]); |
// mds_byteswapr8_(&i,&outBuf[i*j]); |
473 |
|
|
474 |
// gnu builtin evidently honored by intel compilers |
// gnu builtin evidently honored by intel compilers |
476 |
if (pickup) { |
if (pickup) { |
477 |
uint64_t *alias = (uint64_t*)outBuf; |
uint64_t *alias = (uint64_t*)outBuf; |
478 |
size_t i; |
size_t i; |
479 |
for (i=0;i<NUM_X*NUM_Y*numZ;++i) |
for (i=0;i<NUM_X*NUM_Y*myNumZ;++i) |
480 |
alias[i] = __builtin_bswap64(alias[i]); |
alias[i] = __builtin_bswap64(alias[i]); |
481 |
} |
} |
482 |
else { |
else { |
483 |
uint32_t *alias = (uint32_t*)outBuf; |
uint32_t *alias = (uint32_t*)outBuf; |
484 |
size_t i; |
size_t i; |
485 |
for (i=0;i<NUM_X*NUM_Y*numZ;++i) |
for (i=0;i<NUM_X*NUM_Y*myNumZ;++i) |
486 |
alias[i] = __builtin_bswap32(alias[i]); |
alias[i] = __builtin_bswap32(alias[i]); |
487 |
} |
} |
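// (The builtins simply reverse byte order; for example,
//  __builtin_bswap32(0x11223344) == 0x44332211, and __builtin_bswap64
//  does the same for 8-byte values.  Each datum is thus swapped from
//  host byte order to the byte order used in the output files.)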
488 |
|
|
501 |
|
|
502 |
if (pickup) { |
if (pickup) { |
503 |
lseek(fd,whichField->offset,SEEK_SET); |
lseek(fd,whichField->offset,SEEK_SET); |
504 |
lseek(fd,firstZ*NUM_X*NUM_Y*datumSize,SEEK_CUR); |
lseek(fd,myFirstZ*NUM_X*NUM_Y*datumSize,SEEK_CUR); |
505 |
nbytes = NUM_X*NUM_Y*numZ*datumSize; |
nbytes = NUM_X*NUM_Y*myNumZ*datumSize; |
506 |
} |
} |
507 |
else { |
else { |
508 |
lseek(fd,firstZ*NUM_X*NUM_Y*sizeof(float),SEEK_CUR); |
lseek(fd,myFirstZ*NUM_X*NUM_Y*sizeof(float),SEEK_CUR); |
509 |
nbytes = NUM_X*NUM_Y*numZ*sizeof(float); |
nbytes = NUM_X*NUM_Y*myNumZ*sizeof(float); |
510 |
} |
} |
511 |
|
|
512 |
ssize_t bwrit = writen(fd,outBuf,nbytes); |
ssize_t bwrit = writen(fd,outBuf,nbytes); |
513 |
|
|
514 |
if (-1==bwrit) perror("Henze : file write problem : "); |
if (-1==bwrit) perror("Henze : file write problem : "); |
515 |
|
|
516 |
FPRINTF(stderr,"Wrote %d of %d bytes (%d -> %d) \n",bwrit,nbytes,firstZ,numZ); |
FPRINTF(stderr,"Wrote %d of %d bytes (%d -> %d) \n",bwrit,nbytes,myFirstZ,myNumZ); |
517 |
|
|
518 |
// ASSERT(nbytes == bwrit); |
// ASSERT(nbytes == bwrit); |
519 |
|
|
527 |
} |
} |
528 |
|
|
529 |
|
|
|
int NTILES = -1; |
|
530 |
|
|
531 |
typedef struct { |
typedef struct { |
532 |
int off; |
long int off; |
533 |
int skip; |
long int skip; |
534 |
} tile_layout_t; |
} tile_layout_t; |
535 |
|
|
536 |
tile_layout_t *offsetTable; |
tile_layout_t *offsetTable; |
542 |
void *data, |
void *data, |
543 |
int myNumZSlabs) |
int myNumZSlabs) |
544 |
{ |
{ |
|
int intracommSize,intracommRank; |
|
|
// MPI_Comm_size(whichField->ioRanksIntracomm, &intracommSize); |
|
|
MPI_Comm_rank(whichField->ioRanksIntracomm, &intracommRank); |
|
|
//printf("i/o rank %d/%d recv'd %d::%d (%d->%d)\n",intracommRank,intracommSize,whichField,tileID,firstZ,lastZ); |
|
|
// printf("rank %d : tile %d is gonna go at %d and stride with %d, z = %d\n",intracommRank,tileID,offsetTable[tileID].off, |
|
|
// offsetTable[tileID].skip,myNumZSlabs); |
|
|
|
|
545 |
int z; |
int z; |
546 |
|
|
547 |
int pickup = whichField->pickup; |
int pickup = whichField->pickup; |
548 |
|
|
549 |
//ASSERT((tileID > 0) && (tileID < (sizeof(offsetTable)/sizeof(tile_layout_t)))); |
ASSERT ((tileID > 0) && (tileID <= TOTAL_NUM_TILES)); |
|
ASSERT( (tileID > 0) && (tileID <= NTILES) ); // offsetTable now dynamically allocated |
|
550 |
|
|
551 |
if (myNumZSlabs * twoDFieldSizeInBytes > outBufSize){ |
if (myNumZSlabs * twoDFieldSizeInBytes > outBufSize){ |
552 |
|
|
562 |
|
|
563 |
for (z=0;z<myNumZSlabs;++z){ |
for (z=0;z<myNumZSlabs;++z){ |
564 |
|
|
565 |
off_t zoffset = z*TILE_X*TILE_Y*NTILES; //NOT totalNumTiles; |
off_t zoffset = z*TILE_X*TILE_Y*TOTAL_NUM_TILES; |
566 |
off_t hoffset = offsetTable[tileID].off; |
off_t hoffset = offsetTable[tileID].off; |
567 |
off_t skipdst = offsetTable[tileID].skip; |
off_t skipdst = offsetTable[tileID].skip; |
568 |
|
|
579 |
|
|
580 |
off_t skipsrc = TILE_X+2*XGHOSTS; |
off_t skipsrc = TILE_X+2*XGHOSTS; |
581 |
|
|
|
//fprintf(stderr,"rank %d tile %d offset %d skip %d dst %x src %x\n",intracommRank,tileID,hoffset,skipdst,dst,src); |
|
|
|
|
582 |
long long n = TILE_X; |
long long n = TILE_X; |
583 |
|
|
584 |
int y; |
int y; |
599 |
void |
void |
600 |
allocateTileBufs(int numTileBufs, int maxIntracommSize) |
allocateTileBufs(int numTileBufs, int maxIntracommSize) |
601 |
{ |
{ |
|
|
|
|
ASSERT(tileOneZLevelSizeInBytes>0); // be sure we have rec'vd values by now |
|
|
|
|
602 |
int i, j; |
int i, j; |
603 |
for (i = 0; i < numTileBufs; ++i) { |
for (i = 0; i < numTileBufs; ++i) { |
604 |
|
|
616 |
for (j = 0; j < maxIntracommSize; ++j) { |
for (j = 0; j < maxIntracommSize; ++j) { |
617 |
newBuf->requests[j] = MPI_REQUEST_NULL; |
newBuf->requests[j] = MPI_REQUEST_NULL; |
618 |
} |
} |
|
newBuf->ck0 = newBuf->ck1 = 0xdeadbeefdeadbeef; |
|
619 |
|
|
620 |
// Put the buf into the free list |
// Put the buf into the free list |
621 |
newBuf->state = bufState_Free; |
newBuf->state = bufState_Free; |
683 |
dataIntercomm, &mpiStatus); |
dataIntercomm, &mpiStatus); |
684 |
bufHdr->state = bufState_InUse; |
bufHdr->state = bufState_InUse; |
685 |
|
|
|
// Overrun check |
|
|
ASSERT(bufHdr->ck0==0xdeadbeefdeadbeef); |
|
|
ASSERT(bufHdr->ck0==bufHdr->ck1); |
|
|
|
|
686 |
// Return values |
// Return values |
687 |
*tileID = mpiStatus.MPI_TAG >> numCheckBits; |
*tileID = mpiStatus.MPI_TAG >> numCheckBits; |
688 |
|
|
698 |
int |
int |
699 |
tryToReceiveZSlab( |
tryToReceiveZSlab( |
700 |
void *buf, |
void *buf, |
701 |
int expectedMsgSize, |
long int expectedMsgSize, |
702 |
MPI_Comm intracomm) |
MPI_Comm intracomm) |
703 |
{ |
{ |
704 |
MPI_Status mpiStatus; |
MPI_Status mpiStatus; |
721 |
} |
} |
722 |
|
|
723 |
|
|
|
|
|
724 |
void |
void |
725 |
redistributeZSlabs( |
redistributeZSlabs( |
726 |
bufHdr_t *bufHdr, |
bufHdr_t *bufHdr, |
727 |
int tileID, |
int tileID, |
728 |
int zSlabsPer, |
int zSlabsPer, |
729 |
int thisFieldNumZ, |
fieldInfoThisEpoch_t *fieldInfo) |
|
MPI_Comm intracomm) |
|
730 |
{ |
{ |
731 |
int pieceSize = zSlabsPer * tileOneZLevelSizeInBytes; |
int thisFieldNumZ = fieldInfo->zDepth; |
732 |
int tileSizeInBytes = tileOneZLevelSizeInBytes * thisFieldNumZ; |
long int tileSizeInBytes = tileOneZLevelSizeInBytes * thisFieldNumZ; |
733 |
int offset = 0; |
long int fullPieceSize = zSlabsPer * tileOneZLevelSizeInBytes; |
734 |
int recvRank = 0; |
long int offset = 0; |
735 |
|
|
736 |
|
MPI_Comm intracomm = fieldInfo->ioRanksIntracomm; |
737 |
|
|
738 |
|
// Note that the MPI interface definition requires that this arg |
739 |
|
// (i.e. "pieceSize") be of type "int". So check to be sure we |
740 |
|
// haven't overflowed the size. |
741 |
|
int pieceSize = (int)fullPieceSize; |
742 |
|
if (pieceSize != fullPieceSize) { |
743 |
|
fprintf(stderr, "ERROR: pieceSize:%d, fullPieceSize:%ld\n", |
744 |
|
pieceSize, fullPieceSize); |
745 |
|
fprintf(stderr, "ERROR: tileOneZLevelSizeInBytes:%ld, tileSizeInBytes:%ld\n", |
746 |
|
tileOneZLevelSizeInBytes, tileSizeInBytes); |
747 |
|
fprintf(stderr, "ERROR: thisFieldNumZ:%d, zSlabsPer:%d\n", |
748 |
|
thisFieldNumZ , zSlabsPer); |
749 |
|
} |
750 |
|
ASSERT (pieceSize == fullPieceSize); |
751 |
|
ASSERT (pieceSize > 0); |
752 |
|
|
753 |
|
|
754 |
|
// Send each piece (aka chunk) to the rank that is responsible |
755 |
|
// for writing it. |
756 |
|
int curPiece = 0; |
757 |
while ((offset + pieceSize) <= tileSizeInBytes) { |
while ((offset + pieceSize) <= tileSizeInBytes) { |
758 |
|
int recvRank = fieldInfo->chunkWriters[curPiece]; |
759 |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
760 |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
761 |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
762 |
|
|
763 |
offset += pieceSize; |
offset += pieceSize; |
764 |
recvRank += 1; |
curPiece += 1; |
765 |
} |
} |
766 |
|
|
767 |
// There might be one last odd-sized piece |
// There might be one last odd-sized piece (N.B.: odd-sized in Z; |
768 |
|
// the X and Y sizes of a tile are always the same). |
769 |
if (offset < tileSizeInBytes) { |
if (offset < tileSizeInBytes) { |
770 |
|
ASSERT (pieceSize >= tileSizeInBytes - offset); |
771 |
pieceSize = tileSizeInBytes - offset; |
pieceSize = tileSizeInBytes - offset; |
772 |
ASSERT(pieceSize % tileOneZLevelSizeInBytes == 0); |
ASSERT(pieceSize % tileOneZLevelSizeInBytes == 0); |
773 |
|
|
774 |
|
int recvRank = fieldInfo->chunkWriters[curPiece]; |
775 |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank, |
776 |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
tileID, intracomm, &(bufHdr->requests[recvRank])); |
777 |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]); |
778 |
|
curPiece += 1; |
|
offset += pieceSize; |
|
|
recvRank += 1; |
|
779 |
} |
} |
780 |
|
|
781 |
// Sanity check |
ASSERT (curPiece == fieldInfo->nChunks); |
|
ASSERT(recvRank <= bufHdr->requestsArraySize); |
|
|
while (recvRank < bufHdr->requestsArraySize) { |
|
|
ASSERT(MPI_REQUEST_NULL == bufHdr->requests[recvRank++]) |
|
|
} |
|
782 |
} |
} |
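// A worked example of the chunking above (illustrative numbers only):
// if thisFieldNumZ == 50 and zSlabsPer == 7 (i.e. divCeil(50,8) for an
// 8-rank intracomm), the loop sends 7 full pieces of 7 Z levels each
// (49 levels), and the final odd-sized piece carries the remaining single
// level, giving nChunks == 8 pieces, one per entry in chunkWriters.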
783 |
|
|
784 |
|
|
803 |
int zSlabsPer; |
int zSlabsPer; |
804 |
int myNumZSlabs, myFirstZSlab, myLastZSlab, myNumSlabPiecesToRecv; |
int myNumZSlabs, myFirstZSlab, myLastZSlab, myNumSlabPiecesToRecv; |
805 |
|
|
806 |
int numTilesRecvd, numSlabPiecesRecvd; |
int ii, numTilesRecvd, numSlabPiecesRecvd; |
807 |
|
|
808 |
|
|
809 |
// Find the descriptor for my assigned field for this epoch style. |
// Find the descriptor for my assigned field for this epoch style. |
813 |
ASSERT('\0' != fieldInfo->dataFieldID); |
ASSERT('\0' != fieldInfo->dataFieldID); |
814 |
|
|
815 |
|
|
|
|
|
816 |
MPI_Comm_rank(fieldInfo->ioRanksIntracomm, &intracommRank); |
MPI_Comm_rank(fieldInfo->ioRanksIntracomm, &intracommRank); |
817 |
MPI_Comm_size(fieldInfo->ioRanksIntracomm, &intracommSize); |
MPI_Comm_size(fieldInfo->ioRanksIntracomm, &intracommSize); |
818 |
|
|
|
|
|
|
// Compute which z slab(s) we will be reassembling. |
|
819 |
zSlabsPer = divCeil(fieldInfo->zDepth, intracommSize); |
zSlabsPer = divCeil(fieldInfo->zDepth, intracommSize); |
820 |
myNumZSlabs = zSlabsPer; |
|
821 |
|
// Figure out if this rank writes z-slabs, and if so, which ones |
822 |
// Adjust myNumZSlabs in case it didn't divide evenly |
myNumZSlabs = 0; |
823 |
myFirstZSlab = intracommRank * myNumZSlabs; |
myFirstZSlab = 0; |
824 |
if (myFirstZSlab >= fieldInfo->zDepth) { |
myLastZSlab = -1; |
825 |
myNumZSlabs = 0; |
for (ii = 0; ii < fieldInfo->nChunks; ++ii) { |
826 |
} else if ((myFirstZSlab + myNumZSlabs) > fieldInfo->zDepth) { |
if (fieldInfo->chunkWriters[ii] == intracommRank) { |
827 |
myNumZSlabs = fieldInfo->zDepth - myFirstZSlab; |
myFirstZSlab = ii * zSlabsPer; |
828 |
} else { |
// The last chunk might be odd sized |
829 |
myNumZSlabs = zSlabsPer; |
myNumZSlabs = ((ii + 1) < fieldInfo->nChunks) ? |
830 |
|
zSlabsPer : fieldInfo->zDepth - myFirstZSlab; |
831 |
|
myLastZSlab = myFirstZSlab + myNumZSlabs - 1; |
832 |
|
|
833 |
|
break; |
834 |
|
} |
835 |
|
} |
836 |
|
if (myNumZSlabs > 0) { |
837 |
|
ASSERT ((myFirstZSlab >= 0) && (myFirstZSlab < fieldInfo->zDepth)); |
838 |
|
ASSERT ((myLastZSlab >= 0) && (myLastZSlab < fieldInfo->zDepth)); |
839 |
|
ASSERT (myLastZSlab >= myFirstZSlab); |
840 |
} |
} |
841 |
myLastZSlab = myFirstZSlab + myNumZSlabs - 1; |
|
842 |
|
|
843 |
// If we were not assigned any z-slabs, we don't get any redistributed |
// If we were not assigned any z-slabs, we don't get any redistributed |
844 |
// tile-slabs. If we were assigned one or more slabs, we get |
// tile-slabs. If we were assigned one or more slabs, we get |
845 |
// redistributed tile-slabs for every tile. |
// redistributed tile-slabs for every tile. |
846 |
myNumSlabPiecesToRecv = (0 == myNumZSlabs) ? 0 : totalNumTiles; |
myNumSlabPiecesToRecv = (0 == myNumZSlabs) ? 0 : NUM_WET_TILES; |
847 |
|
|
848 |
|
|
849 |
numTilesRecvd = 0; |
numTilesRecvd = 0; |
864 |
if (NULL == bufHdr) break; // No tile was received |
if (NULL == bufHdr) break; // No tile was received |
865 |
|
|
866 |
numTilesRecvd += 1; |
numTilesRecvd += 1; |
867 |
redistributeZSlabs(bufHdr, tileID, zSlabsPer, |
redistributeZSlabs(bufHdr, tileID, zSlabsPer, fieldInfo); |
|
fieldInfo->zDepth, fieldInfo->ioRanksIntracomm); |
|
868 |
|
|
869 |
// Add the bufHdr to the "in use" list |
// Add the bufHdr to the "in use" list |
870 |
bufHdr->next = inUseTileBufs_ptr; |
bufHdr->next = inUseTileBufs_ptr; |
871 |
inUseTileBufs_ptr = bufHdr; |
inUseTileBufs_ptr = bufHdr; |
|
|
|
|
|
|
872 |
} |
} |
873 |
|
|
874 |
|
|
875 |
//////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////// |
876 |
// Check for tile-slabs redistributed from the i/o processes |
// Check for tile-slabs redistributed from the i/o processes |
877 |
while (numSlabPiecesRecvd < myNumSlabPiecesToRecv) { |
while (numSlabPiecesRecvd < myNumSlabPiecesToRecv) { |
878 |
int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs; |
long int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs; |
879 |
char data[msgSize]; |
char data[msgSize]; |
880 |
|
|
881 |
int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm); |
int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm); |
882 |
if (tileID < 0) break; // No slab was received |
if (tileID < 0) break; // No slab was received |
883 |
|
|
|
|
|
884 |
numSlabPiecesRecvd += 1; |
numSlabPiecesRecvd += 1; |
885 |
processSlabSection(fieldInfo, tileID, data, myNumZSlabs); |
processSlabSection(fieldInfo, tileID, data, myNumZSlabs); |
886 |
|
|
|
|
|
887 |
// Can do the write here, or at the end of the epoch. |
// Can do the write here, or at the end of the epoch. |
888 |
// Probably want to make it asynchronous (waiting for |
// Probably want to make it asynchronous (waiting for |
889 |
// completion at the barrier at the start of the next epoch). |
// completion at the barrier at the start of the next epoch). |
895 |
|
|
896 |
|
|
897 |
//////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////// |
898 |
|
// Sanity check for non-writers |
899 |
|
if (0 == myNumSlabPiecesToRecv) { |
900 |
|
|
901 |
|
long int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs; |
902 |
|
char data[msgSize]; |
903 |
|
|
904 |
|
// Check that no one has tried to re-distribute a slab to us. |
905 |
|
int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm); |
906 |
|
ASSERT (tileID < 0); |
907 |
|
} |
908 |
|
|
909 |
|
|
910 |
|
//////////////////////////////////////////////////////////////// |
911 |
// Check if we can release any of the inUse buffers (i.e. the |
// Check if we can release any of the inUse buffers (i.e. the |
912 |
// isends we used to redistribute those z-slabs have completed). |
// isends we used to redistribute those z-slabs have completed). |
913 |
// We do this by detaching the inUse list, then examining each |
// We do this by detaching the inUse list, then examining each |
937 |
FPRINTF(stderr,"Rank %d freed a tile buffer\n", intracommRank); |
FPRINTF(stderr,"Rank %d freed a tile buffer\n", intracommRank); |
938 |
} else { |
} else { |
939 |
// At least one request is still outstanding. |
// At least one request is still outstanding. |
940 |
// Put the buf on the inUse list. |
// Put the buf back on the inUse list. |
941 |
bufHdr->next = inUseTileBufs_ptr; |
bufHdr->next = inUseTileBufs_ptr; |
942 |
inUseTileBufs_ptr = bufHdr; |
inUseTileBufs_ptr = bufHdr; |
943 |
} |
} |
973 |
|
|
974 |
|
|
975 |
|
|
|
|
|
|
|
|
|
|
|
976 |
// Main routine for the i/o tasks. Callers DO NOT RETURN from |
// Main routine for the i/o tasks. Callers DO NOT RETURN from |
977 |
// this routine; they MPI_Finalize and exit. |
// this routine; they MPI_Finalize and exit. |
978 |
void |
void |
979 |
ioRankMain(int totalNumTiles) |
ioRankMain (void) |
980 |
{ |
{ |
981 |
// This is one of a group of i/o processes that will be receiving tiles |
// This is one of a group of i/o processes that will be receiving tiles |
982 |
// from the computational processes. This same process is also |
// from the computational processes. This same process is also |
991 |
// the ghost cells), and put them into the slab we are assembling. Once |
// the ghost cells), and put them into the slab we are assembling. Once |
992 |
// a slab is fully assembled, it is written to disk. |
// a slab is fully assembled, it is written to disk. |
993 |
|
|
994 |
int i, size, count, numTileBufs; |
int i, ierr, count, numTileBufs; |
995 |
MPI_Status status; |
MPI_Status status; |
996 |
int currentEpochID = 0; |
int currentEpochID = 0; |
997 |
int maxTileCount = 0, max3DTileCount = 0, maxIntracommSize = 0; |
int maxTileCount = 0, max3DTileCount = 0, maxIntracommSize = 0; |
1000 |
MPI_Comm_remote_size(globalIntercomm, &numComputeRanks); |
MPI_Comm_remote_size(globalIntercomm, &numComputeRanks); |
1001 |
|
|
1002 |
|
|
1003 |
//////////////////////////////////////////////////////////// |
int *xoff = malloc(TOTAL_NUM_TILES*sizeof(int)); |
|
//// get global tile info via bcast over the global intercomm
|
|
|
|
|
|
|
|
// int ierr = MPI_Bcast(&NTILES,1,MPI_INT,0,globalIntercomm); |
|
|
|
|
|
int data[9]; |
|
|
|
|
|
int ierr = MPI_Bcast(data,9,MPI_INT,0,globalIntercomm); |
|
|
|
|
|
NTILES = data[3]; |
|
|
TILE_X = data[5]; |
|
|
TILE_Y = data[6]; |
|
|
XGHOSTS = data[7]; |
|
|
YGHOSTS = data[8]; |
|
|
|
|
|
tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS)); |
|
|
tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize); // also calculated in compute ranks, urghh |
|
|
|
|
|
int *xoff = malloc(NTILES*sizeof(int)); |
|
1004 |
ASSERT(xoff); |
ASSERT(xoff); |
1005 |
int *yoff = malloc(NTILES*sizeof(int)); |
int *yoff = malloc(TOTAL_NUM_TILES*sizeof(int)); |
1006 |
ASSERT(yoff); |
ASSERT(yoff); |
1007 |
int *xskip = malloc(NTILES*sizeof(int)); |
int *xskip = malloc(TOTAL_NUM_TILES*sizeof(int)); |
1008 |
ASSERT(xskip); |
ASSERT(xskip); |
1009 |
|
|
1010 |
offsetTable = malloc((NTILES+1)*sizeof(tile_layout_t)); |
offsetTable = malloc((TOTAL_NUM_TILES+1)*sizeof(tile_layout_t)); |
1011 |
ASSERT(offsetTable); |
ASSERT(offsetTable); |
1012 |
|
|
1013 |
ierr = MPI_Bcast(xoff,NTILES,MPI_INT,0,globalIntercomm); |
ierr = MPI_Bcast(xoff,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm); |
1014 |
ierr = MPI_Bcast(yoff,NTILES,MPI_INT,0,globalIntercomm); |
ierr = MPI_Bcast(yoff,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm); |
1015 |
ierr = MPI_Bcast(xskip,NTILES,MPI_INT,0,globalIntercomm); |
ierr = MPI_Bcast(xskip,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm); |
1016 |
|
|
1017 |
for (i=0;i<NTILES;++i){ |
for (i=0;i<TOTAL_NUM_TILES;++i){ |
1018 |
offsetTable[i+1].off = NUM_X*(yoff[i]-1) + xoff[i] - 1; |
offsetTable[i+1].off = NUM_X*(yoff[i]-1) + xoff[i] - 1; |
1019 |
offsetTable[i+1].skip = xskip[i]; |
offsetTable[i+1].skip = xskip[i]; |
1020 |
} |
} |
1044 |
} |
} |
1045 |
ASSERT(NULL != p); |
ASSERT(NULL != p); |
1046 |
ASSERT('\0' != p->dataFieldID); |
ASSERT('\0' != p->dataFieldID); |
1047 |
MPI_Comm_size(p->ioRanksIntracomm, &size); |
MPI_Comm_size(p->ioRanksIntracomm, &thisIntracommSize); |
1048 |
if (size > maxIntracommSize) maxIntracommSize = size; |
if (thisIntracommSize > maxIntracommSize) { |
1049 |
|
maxIntracommSize = thisIntracommSize; |
1050 |
|
} |
1051 |
|
|
1052 |
|
|
1053 |
// Receive a message from *each* computational rank telling us |
// Receive a message from *each* computational rank telling us |
1066 |
// Sanity check |
// Sanity check |
1067 |
MPI_Allreduce (&(p->tileCount), &count, 1, |
MPI_Allreduce (&(p->tileCount), &count, 1, |
1068 |
MPI_INT, MPI_SUM, p->ioRanksIntracomm); |
MPI_INT, MPI_SUM, p->ioRanksIntracomm); |
1069 |
ASSERT(count == totalNumTiles); |
ASSERT(count == NUM_WET_TILES); |
1070 |
} |
} |
1071 |
|
|
1072 |
|
|
1083 |
// Hack - for now, just pick a value for numTileBufs |
// Hack - for now, just pick a value for numTileBufs |
1084 |
numTileBufs = (max3DTileCount > 0) ? max3DTileCount : maxTileCount; |
numTileBufs = (max3DTileCount > 0) ? max3DTileCount : maxTileCount; |
1085 |
if (numTileBufs < 4) numTileBufs = 4; |
if (numTileBufs < 4) numTileBufs = 4; |
1086 |
if (numTileBufs > 15) numTileBufs = 15; |
if (numTileBufs > 25) numTileBufs = 25; |
|
// if (numTileBufs > 25) numTileBufs = 25; |
|
1087 |
|
|
1088 |
allocateTileBufs(numTileBufs, maxIntracommSize); |
allocateTileBufs(numTileBufs, maxIntracommSize); |
1089 |
countBufs(numTileBufs); |
countBufs(numTileBufs); |
1101 |
MPI_Barrier(ioIntracomm); |
MPI_Barrier(ioIntracomm); |
1102 |
|
|
1103 |
if (0 == ioIntracommRank) { |
if (0 == ioIntracommRank) { |
1104 |
fprintf(stderr, "I/O ranks waiting for new epoch\n"); |
fprintf(stderr, "I/O ranks waiting for new epoch at time %f\n",MPI_Wtime()); |
1105 |
MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm); |
MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm); |
1106 |
|
|
1107 |
MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE); |
MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE); |
1108 |
fprintf(stderr, "I/O ranks begining new epoch: %d, gcmIter = %d\n", cmd[1],cmd[3]); |
fprintf(stderr, "I/O ranks begining new epoch: %d, gcmIter = %d, at time %f\n", |
1109 |
|
cmd[1], cmd[3], MPI_Wtime()); |
1110 |
|
|
1111 |
// before we start a new epoch, have i/o rank 0: |
// before we start a new epoch, have i/o rank 0: |
1112 |
// determine output filenames for this epoch |
// determine output filenames for this epoch |
1163 |
} |
} |
1164 |
MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm); |
MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm); |
1165 |
|
|
1166 |
|
if (0 == ioIntracommRank) |
1167 |
|
fprintf(stderr,"i/o handshake completed %d %d %f\n",cmd[1],cmd[3],MPI_Wtime()); |
1168 |
|
|
1169 |
switch (cmd[0]) { |
switch (cmd[0]) { |
1170 |
|
|
1171 |
case cmd_exit: |
case cmd_exit: |
1224 |
|
|
1225 |
// Given a number of ranks and a set of fields we will want to output, |
// Given a number of ranks and a set of fields we will want to output, |
1226 |
// figure out how to distribute the ranks among the fields. |
// figure out how to distribute the ranks among the fields. |
1227 |
|
// We attempt to distribute according to three separate criteria: |
1228 |
|
// 1) Try to make the number of ranks assigned to a field be |
1229 |
|
// proportional to the number of bytes in that field. |
1230 |
|
// 2) Try to even out the total number of MPI messages that are |
1231 |
|
// sent to any given i/o node. |
1232 |
|
// 3) Try to even out the amount of output being written from |
1233 |
|
// any given i/o node. |
1234 |
|
|
1235 |
void |
void |
1236 |
computeIORankAssigments( |
computeIORankAssigments( |
1237 |
int numComputeRanks, |
int numComputeRanks, |
1238 |
int numIORanks, |
int numIORanks, |
1239 |
int numFields, |
int numFields, |
1240 |
fieldInfoThisEpoch_t *fields, |
fieldInfoThisEpoch_t *fields, |
1241 |
int assignments[]) |
int assignments[], |
1242 |
|
int nToWrite[]) |
1243 |
{ |
{ |
1244 |
int i,j,k, sum, count; |
int i,j,k, iteration, sum, count; |
1245 |
|
|
1246 |
int numIONodes = numIORanks / numRanksPerNode; |
int numIONodes = numIORanks / numRanksPerNode; |
1247 |
int numIORanksThisField[numFields]; |
int numIORanksThisField[numFields], nRanks[numFields]; |
1248 |
long int bytesPerIORankThisField[numFields]; |
long int bytesPerIORankThisField[numFields]; |
1249 |
int expectedMessagesPerRankThisField[numFields]; |
int expectedMessagesPerRankThisField[numFields]; |
1250 |
|
int isWriter[numIORanks]; |
1251 |
long int fieldSizes[numFields]; |
long int fieldSizes[numFields]; |
1252 |
|
|
1253 |
struct ioNodeInfo_t { |
struct ioNodeInfo_t { |
1265 |
|
|
1266 |
ASSERT((numIONodes*numRanksPerNode) == numIORanks); |
ASSERT((numIONodes*numRanksPerNode) == numIORanks); |
1267 |
|
|
1268 |
|
memset (assignments, 0, numIORanks*sizeof(int)); |
1269 |
|
memset (isWriter, 0, numIORanks*sizeof(int)); |
1270 |
|
|
1271 |
|
|
1272 |
////////////////////////////////////////////////////////////// |
////////////////////////////////////////////////////////////// |
1273 |
// Compute the size for each field in this epoch style |
// Compute the size for each field in this epoch style |
1277 |
} |
} |
1278 |
|
|
1279 |
|
|
1280 |
///////////////////////////////////////////////////////// |
///////////////////////////////////////////////////////////////// |
1281 |
// Distribute the available i/o ranks among the fields, |
// Phase 1: Distribute the number of available i/o ranks among |
1282 |
// proportionally based on field size. |
// the fields, proportionally based on field size. |
1283 |
|
|
1284 |
// First, assign one rank per field |
// First, assign one rank per field |
1285 |
ASSERT(numIORanks >= numFields); |
ASSERT(numIORanks >= numFields); |
1304 |
bytesPerIORankThisField[k] = fieldSizes[k] / numIORanksThisField[k]; |
bytesPerIORankThisField[k] = fieldSizes[k] / numIORanksThisField[k]; |
1305 |
} |
} |
1306 |
|
|
1307 |
//////////////////////////////////////////////////////////// |
///////////////////////////////////////////////////////////////// |
1308 |
// At this point, all the i/o ranks should be apportioned |
// At this point, the number of i/o ranks should be apportioned |
1309 |
// among the fields. Check we didn't screw up the count. |
// among the fields. Check we didn't screw up the count. |
1310 |
for (sum = 0, i = 0; i < numFields; ++i) { |
for (sum = 0, i = 0; i < numFields; ++i) { |
1311 |
sum += numIORanksThisField[i]; |
sum += numIORanksThisField[i]; |
1312 |
} |
} |
1313 |
ASSERT(sum == numIORanks); |
ASSERT(sum == numIORanks); |
1314 |
|
|
1315 |
|
// Make a copy of numIORanksThisField |
1316 |
|
memcpy (nRanks, numIORanksThisField, numFields*sizeof(int)); |
1317 |
|
|
1318 |
|
|
1319 |
|
|
1320 |
////////////////////////////////////////////////////////////////// |
////////////////////////////////////////////////////////////////// |
1321 |
|
// Phase 2: try to even out the number of messages per node. |
1322 |
// The *number* of i/o ranks assigned to a field is based on the |
// The *number* of i/o ranks assigned to a field is based on the |
1323 |
// field size. But the *placement* of those ranks is based on |
// field size. But the *placement* of those ranks is based on |
1324 |
// the expected number of messages the process will receive. |
// the expected number of messages the process will receive. |
1371 |
|
|
1372 |
// Make an initial choice of field |
// Make an initial choice of field |
1373 |
for (i = 0; i < numFields; ++i) { |
for (i = 0; i < numFields; ++i) { |
1374 |
if (numIORanksThisField[i] > 0) break; |
if (nRanks[i] > 0) break; |
1375 |
} |
} |
1376 |
k = i; |
k = i; |
1377 |
ASSERT(k < numFields); |
ASSERT(k < numFields); |
1378 |
|
|
1379 |
// Search for a better choice |
// Search for a better choice |
1380 |
for (++i; i < numFields; ++i) { |
for (++i; i < numFields; ++i) { |
1381 |
if (numIORanksThisField[i] <= 0) continue; |
if (nRanks[i] <= 0) continue; |
1382 |
if (expectedMessagesPerRankThisField[i] > |
if (expectedMessagesPerRankThisField[i] > |
1383 |
expectedMessagesPerRankThisField[k]) |
expectedMessagesPerRankThisField[k]) |
1384 |
{ |
{ |
1390 |
ioNodeInfo[j].expectedNumMessagesThisNode += expectedMessagesPerRankThisField[k]; |
ioNodeInfo[j].expectedNumMessagesThisNode += expectedMessagesPerRankThisField[k]; |
1391 |
ioNodeInfo[j].assignedFieldThisCore[ioNodeInfo[j].numCoresAssigned] = k; |
ioNodeInfo[j].assignedFieldThisCore[ioNodeInfo[j].numCoresAssigned] = k; |
1392 |
ioNodeInfo[j].numCoresAssigned += 1; |
ioNodeInfo[j].numCoresAssigned += 1; |
1393 |
numIORanksThisField[k] -= 1; |
nRanks[k] -= 1; |
1394 |
} |
} |
1395 |
|
|
1396 |
// Sanity check - all ranks were assigned to a core |
// Sanity check - all ranks were assigned to a core |
1397 |
for (i = 1; i < numFields; ++i) { |
for (i = 1; i < numFields; ++i) { |
1398 |
ASSERT(0 == numIORanksThisField[i]); |
ASSERT(0 == nRanks[i]); |
1399 |
} |
} |
1400 |
// Sanity check - all cores were assigned a rank |
// Sanity check - all cores were assigned a rank |
1401 |
for (i = 1; i < numIONodes; ++i) { |
for (i = 1; i < numIONodes; ++i) { |
1403 |
} |
} |
1404 |
|
|
1405 |
|
|
1406 |
///////////////////////////////////// |
////////////////////////////////////////////////////////////////// |
1407 |
|
// Phase 3: Select which ranks will be assigned to write out the |
1408 |
|
// fields. We attempt to balance the amount that each node is |
1409 |
|
// assigned to write out. Since Phase 1 and Phase 2 have already |
1410 |
|
// fixed a number of things, our options for balancing things |
1411 |
|
// are somewhat restricted.
1412 |
|
|
1413 |
|
|
1414 |
|
// Count how many nodes each field is distributed across |
1415 |
|
int numIONodesThisField[numFields]; |
1416 |
|
for (i = 0; i < numFields; ++i) { |
1417 |
|
numIONodesThisField[i] = 0; |
1418 |
|
for (j = 0; j < numIONodes; ++j) { |
1419 |
|
for (k = 0; k < numRanksPerNode; ++k) { |
1420 |
|
if (ioNodeInfo[j].assignedFieldThisCore[k] == i) { |
1421 |
|
numIONodesThisField[i] += 1; |
1422 |
|
break; |
1423 |
|
} |
1424 |
|
} |
1425 |
|
} |
1426 |
|
ASSERT (numIONodesThisField[i] > 0); |
1427 |
|
ASSERT (numIONodesThisField[i] <= numIONodes); |
1428 |
|
} |
1429 |
|
|
1430 |
|
|
1431 |
|
// Init a per-node running count of z-levels-to-write |
1432 |
|
memset (nToWrite, 0, numIONodes*sizeof(int)); |
1433 |
|
|
1434 |
|
|
1435 |
|
// Iterate through all the fields, although we will select |
1436 |
|
// which field to operate on (i.e. curField) non-sequentially. |
1437 |
|
for (iteration = 0; iteration < numFields; ++iteration) { |
1438 |
|
|
1439 |
|
// Pick the field distributed across the fewest number of nodes, on |
1440 |
|
// the theory that this field will have the least flexibility. |
1441 |
|
int curField = 0; |
1442 |
|
for (j = 1; j < numFields; ++j) { |
1443 |
|
if (numIONodesThisField[j] < numIONodesThisField[curField]) { |
1444 |
|
curField = j; |
1445 |
|
} |
1446 |
|
} |
1447 |
|
ASSERT (numIONodesThisField[curField] <= numIONodes); |
1448 |
|
|
1449 |
|
// Now that we have chosen a field, identify any i/o nodes |
1450 |
|
// that have rank(s) assigned to that field. |
1451 |
|
int nAssigned[numIONodes]; |
1452 |
|
for (i = 0; i < numIONodes; ++i) { |
1453 |
|
nAssigned[i] = 0; |
1454 |
|
for (j = 0; j < numRanksPerNode; ++j) { |
1455 |
|
if (ioNodeInfo[i].assignedFieldThisCore[j] == curField) { |
1456 |
|
nAssigned[i] += 1; |
1457 |
|
} |
1458 |
|
} |
1459 |
|
} |
1460 |
|
|
1461 |
|
|
1462 |
|
// We do the writes in units of entire z-levels. If a rank is a |
1463 |
|
// writer, it is assigned to write a chunk of one or more z-levels. |
1464 |
|
int chunkSize = divCeil (fields[curField].zDepth, numIORanksThisField[curField]); |
1465 |
|
int nChunks = divCeil (fields[curField].zDepth, chunkSize); |
1466 |
|
int curChunk; |
1467 |
|
|
1468 |
|
// Designate the same number of ranks to be writers as there are chunks |
1469 |
|
fields[curField].chunkWriters = malloc (nChunks*sizeof(int)); |
1470 |
|
ASSERT (fields[curField].chunkWriters != NULL); |
1471 |
|
fields[curField].nChunks = nChunks; |
1472 |
|
int nAvailable[numIONodes]; |
1473 |
|
memcpy (nAvailable, nAssigned, numIONodes*sizeof(int)); |
1474 |
|
for (curChunk = 0; curChunk < nChunks; ++curChunk) { |
1475 |
|
|
1476 |
|
// Note that the last chunk might be an odd size (if chunkSize |
1477 |
|
// does not evenly divide zDepth). |
1478 |
|
if ((curChunk + 1) == nChunks) { |
1479 |
|
chunkSize = fields[curField].zDepth - curChunk*chunkSize; |
1480 |
|
} |
1481 |
|
|
1482 |
|
// Pick a node that has at least one rank assigned to curField |
1483 |
|
// that has not already been designated as a writer. |
1484 |
|
// There must still be at least one, or we would have exited |
1485 |
|
// the loop already. |
1486 |
|
int curNode; |
1487 |
|
for (curNode = 0; curNode < numIONodes; ++curNode) { |
1488 |
|
if (nAvailable[curNode] > 0) break; |
1489 |
|
} |
1490 |
|
ASSERT (curNode < numIONodes); |
1491 |
|
|
1492 |
|
// curNode is a candidate node; try to find an acceptable |
1493 |
|
// candidate node with a smaller nToWrite |
1494 |
|
for (i = curNode + 1; i < numIONodes; ++i) { |
1495 |
|
if ((nAvailable[i] > 0) && (nToWrite[i] < nToWrite[curNode])) { |
1496 |
|
curNode = i; |
1497 |
|
} |
1498 |
|
} |
1499 |
|
|
1500 |
|
// Find an appropriate rank on the chosen node |
1501 |
|
for (j = 0; j < numRanksPerNode; ++j) { |
1502 |
|
if ( (ioNodeInfo[curNode].assignedFieldThisCore[j] == curField) && |
1503 |
|
(!isWriter[curNode*numRanksPerNode + j]) ) |
1504 |
|
{ |
1505 |
|
break; |
1506 |
|
} |
1507 |
|
} |
1508 |
|
// Double-check that we found an appropriate rank. |
1509 |
|
ASSERT (j < numRanksPerNode); |
1510 |
|
|
1511 |
|
|
1512 |
|
// We've picked a rank to be the writer of the current chunk. |
1513 |
|
// Now we need to figure out which rank this will wind up being |
1514 |
|
// in the "ioRanksIntracomm" for this field (when that intracomm |
1515 |
|
// eventually gets created), so we can set "chunkWriters". |
1516 |
|
int eventualCommRank = 0; |
1517 |
|
for (i = 0; i < curNode; ++i) eventualCommRank += nAssigned[i]; |
1518 |
|
for (i = 0; i < j; ++i) { |
1519 |
|
if (ioNodeInfo[curNode].assignedFieldThisCore[i] == curField) { |
1520 |
|
eventualCommRank += 1; |
1521 |
|
} |
1522 |
|
} |
1523 |
|
|
1524 |
|
|
1525 |
|
// Update the info for this choice of node+rank |
1526 |
|
fields[curField].chunkWriters[curChunk] = eventualCommRank; |
1527 |
|
isWriter[curNode*numRanksPerNode + j] = 1; |
1528 |
|
nAvailable[curNode] -= 1; |
1529 |
|
nToWrite[curNode] += chunkSize; |
1530 |
|
|
1531 |
|
} |
1532 |
|
|
1533 |
|
// We've finished with this curField; mark it so that we don't |
1534 |
|
// re-select this same curField the next time through the loop. |
1535 |
|
numIONodesThisField[curField] = numIONodes + 1; |
1536 |
|
} |
1537 |
|
|
1538 |
|
|
1539 |
|
////////////////////////////////////////////////////////////////// |
1540 |
// Return the computed assignments |
// Return the computed assignments |
1541 |
|
// N.B. We are also returning info in "fields[*].chunkWriters" |
1542 |
for (i = 0; i < numIONodes; ++i) { |
for (i = 0; i < numIONodes; ++i) { |
1543 |
for (j = 0; j < numRanksPerNode; ++j) { |
for (j = 0; j < numRanksPerNode; ++j) { |
1544 |
assignments[i*numRanksPerNode + j] = |
assignments[i*numRanksPerNode + j] = |
1546 |
} |
} |
1547 |
} |
} |
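// Usage sketch for the returned "assignments" array: entry
// assignments[i*numRanksPerNode + j] is the field index served by core j
// of i/o node i.  The helper below is purely illustrative (it assumes
// only <stdio.h>, which is already included above).

static void
printAssignmentsSketch (const int *assignments, int numIONodes, int ranksPerNode)
{
    int node, core;
    for (node = 0; node < numIONodes; ++node) {
        for (core = 0; core < ranksPerNode; ++core) {
            printf ("i/o node %d, core %d -> field %d\n",
                    node, core, assignments[node*ranksPerNode + core]);
        }
    }
}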
1548 |
|
|
1549 |
|
|
1550 |
// Clean up |
// Clean up |
1551 |
for (i = 0; i < numIONodes; ++i) { |
for (i = 0; i < numIONodes; ++i) { |
1552 |
free(ioNodeInfo[i].assignedFieldThisCore); |
free(ioNodeInfo[i].assignedFieldThisCore); |
1557 |
////////////////////////////////////////////////////////////////////////////////// |
////////////////////////////////////////////////////////////////////////////////// |
1558 |
|
|
1559 |
|
|
|
|
|
|
|
|
1560 |
int |
int |
1561 |
isIORank(int commRank, int totalNumNodes, int numIONodes) |
isIORank(int commRank, int totalNumNodes, int numIONodes) |
1562 |
{ |
{ |
1563 |
// Figure out if this rank is on a node that will do i/o. |
// Figure out if this rank is on a node that will do i/o. |
1564 |
// Note that the i/o nodes are distributed throughout the |
// Note that the i/o nodes are distributed throughout the |
1565 |
// task, not clustered together. |
// task, not clustered together. |
|
int ioNodeStride = totalNumNodes / numIONodes; |
|
1566 |
int thisRankNode = commRank / numRanksPerNode; |
int thisRankNode = commRank / numRanksPerNode; |
|
int n = thisRankNode / ioNodeStride; |
|
|
return (((n * ioNodeStride) == thisRankNode) && (n < numIONodes)) ? 1 : 0; |
|
|
} |
|
1567 |
|
|
1568 |
|
int ioNodeStride = totalNumNodes / numIONodes; |
1569 |
|
int extra = totalNumNodes % numIONodes; |
1570 |
|
|
1571 |
// Find the number of *physical cores* on the current node |
int inflectionPoint = (ioNodeStride+1)*extra; |
1572 |
// (ignore hyperthreading). This should work for pretty much |
if (thisRankNode <= inflectionPoint) { |
1573 |
// any Linux based system (and fail horribly for anything else). |
return (thisRankNode % (ioNodeStride+1)) == 0; |
1574 |
int |
} else { |
1575 |
getNumCores(void) |
return ((thisRankNode - inflectionPoint) % ioNodeStride) == 0; |
1576 |
{ |
} |
|
return 20; // until we figure out why this cratered |
|
|
|
|
|
char *magic = "cat /proc/cpuinfo | egrep \"core id|physical id\" | " |
|
|
"tr -d \"\\n\" | sed s/physical/\\\\nphysical/g | " |
|
|
"grep -v ^$ | sort -u | wc -l"; |
|
1577 |
|
|
1578 |
FILE *fp = popen(magic,"r"); |
} |
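// Worked example of the spacing rule above (all values hypothetical):
// with 10 nodes and 3 i/o nodes, ioNodeStride = 3, extra = 1, and
// inflectionPoint = 4, so nodes 0, 4, and 7 do i/o -- the i/o nodes are
// spread through the job rather than clustered at the front.  The helper
// below just restates the same rule per node index.

static int isIONodeSketch (int node, int totalNumNodes, int numIONodes)
{
    int stride = totalNumNodes / numIONodes;
    int extra  = totalNumNodes % numIONodes;
    int inflectionPoint = (stride + 1)*extra;

    if (node <= inflectionPoint) return (node % (stride + 1)) == 0;
    return ((node - inflectionPoint) % stride) == 0;
}
// e.g. isIONodeSketch(n, 10, 3) is 1 exactly for n = 0, 4, 7.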
|
ASSERT(fp); |
|
1579 |
|
|
|
int rtnValue = -1; |
|
|
|
|
|
int res = fscanf(fp,"%d",&rtnValue); |
|
1580 |
|
|
|
ASSERT(1==res); |
|
1581 |
|
|
1582 |
pclose(fp); |
// Get the number of MPI ranks that are running on one node. |
1583 |
|
// We assume that the MPI ranks are launched in sequence, filling one |
1584 |
|
// node before going to the next, and that each node has the same |
1585 |
|
// number of ranks (except possibly the last node, which is allowed |
1586 |
|
// to be short). |
1587 |
|
int |
1588 |
|
getNumRanksPerNode (MPI_Comm comm) |
1589 |
|
{ |
1590 |
|
char myHostname[HOST_NAME_MAX+1]; |
1591 |
|
int status, count; |
1592 |
|
int commSize, commRank; |
1593 |
|
|
1594 |
|
MPI_Comm_size (comm, &commSize); |
1595 |
|
MPI_Comm_rank (comm, &commRank); |
1596 |
|
|
1597 |
|
status = gethostname (myHostname, HOST_NAME_MAX); |
1598 |
|
myHostname[HOST_NAME_MAX] = '\0'; |
1599 |
|
ASSERT (0 == status); |
1600 |
|
|
1601 |
|
if (0 == commRank) { |
1602 |
|
int nBlocks, ii, jj; |
1603 |
|
assert (allHostnames != NULL); |
1604 |
|
|
1605 |
|
// We assume these are already in-order and so don't |
1606 |
|
// need to be sorted. |
1607 |
|
|
1608 |
|
// Count how many ranks have the same hostname as rank 0 |
1609 |
|
for (count = 0; |
1610 |
|
(count < commSize) && |
1611 |
|
(strcmp(&(allHostnames[count][0]), myHostname) == 0); |
1612 |
|
++count); |
1613 |
|
ASSERT (count > 0); |
1614 |
|
|
1615 |
|
// Check that the count is consistent for each block of hostnames |
1616 |
|
// (except possibly the last block, which might not be full). |
1617 |
|
nBlocks = commSize / count; |
1618 |
|
|
1619 |
|
for (jj = 1; jj < nBlocks; ++jj) { |
1620 |
|
char *p = &(allHostnames[jj*count][0]); |
1621 |
|
for (ii = 0; ii < count; ++ii) { |
1622 |
|
char *q = &(allHostnames[jj*count + ii][0]); |
1623 |
|
ASSERT (strcmp (p, q) == 0); |
1624 |
|
} |
1625 |
|
} |
1626 |
|
} |
1627 |
|
|
1628 |
ASSERT(rtnValue > 0); |
MPI_Bcast (&count, 1, MPI_INT, 0, comm); |
1629 |
return rtnValue; |
return count; |
1630 |
} |
} |
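// An alternative sketch, not what this file does: on MPI-3 and later the
// same per-node count can be obtained without the hostname gather by
// splitting the communicator into shared-memory nodes.  Unlike the
// routine above, this version does not verify that every node carries
// the same number of ranks.

static int getNumRanksPerNodeMPI3 (MPI_Comm comm)
{
    MPI_Comm nodeComm;
    int ranksOnThisNode;

    MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &nodeComm);
    MPI_Comm_size (nodeComm, &ranksOnThisNode);
    MPI_Comm_free (&nodeComm);

    return ranksOnThisNode;
}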
1631 |
|
|
1632 |
|
|
|
|
|
1633 |
//////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////// |
1634 |
//////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////// |
1635 |
// Routines called by the mitGCM code |
// Routines called by the MITgcm code |
1644 |
int numTiles, |
int numTiles, |
1645 |
MPI_Comm *rtnComputeComm) |
MPI_Comm *rtnComputeComm) |
1646 |
{ |
{ |
1647 |
|
// Note that we ignore the argument "numTiles". This value used to |
1648 |
|
// be needed, but now we get the information directly from "SIZE.h" |
1649 |
|
|
1650 |
MPI_Comm newIntracomm, newIntercomm, dupParentComm; |
MPI_Comm newIntracomm, newIntercomm, dupParentComm; |
1651 |
int newIntracommRank, thisIsIORank; |
int newIntracommRank, thisIsIORank; |
1652 |
int parentSize, parentRank; |
int parentSize, parentRank; |
1655 |
int i, j, nF, compRoot, fieldIndex, epochStyleIndex; |
int i, j, nF, compRoot, fieldIndex, epochStyleIndex; |
1656 |
int totalNumNodes, numComputeNodes, newIntracommSize; |
int totalNumNodes, numComputeNodes, newIntracommSize; |
1657 |
|
|
1658 |
|
|
1659 |
// Init globals |
// Init globals |
1660 |
totalNumTiles = numTiles; |
|
1661 |
|
// Info about the parent communicator |
1662 |
|
MPI_Initialized(&mpiIsInitialized); |
1663 |
|
ASSERT(mpiIsInitialized); |
1664 |
|
MPI_Comm_size(parentComm, &parentSize); |
1665 |
|
MPI_Comm_rank(parentComm, &parentRank); |
1666 |
|
|
1667 |
|
// Find the max MPI "tag" value |
1668 |
|
MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, &intPtr, &tagUBexists); |
1669 |
|
ASSERT(tagUBexists); |
1670 |
|
maxTagValue = *intPtr; |
1671 |
|
|
1672 |
|
// Gather all the hostnames to rank zero |
1673 |
|
char myHostname[HOST_NAME_MAX+1]; |
1674 |
|
int status; |
1675 |
|
status = gethostname (myHostname, HOST_NAME_MAX); |
1676 |
|
myHostname[HOST_NAME_MAX] = '\0'; |
1677 |
|
ASSERT (0 == status); |
1678 |
|
if (parentRank != 0) { |
1679 |
|
// send my hostname to rank 0 |
1680 |
|
MPI_Gather (myHostname, HOST_NAME_MAX+1, MPI_CHAR, |
1681 |
|
NULL, 0, MPI_CHAR, 0, parentComm); |
1682 |
|
} |
1683 |
|
else { |
1684 |
|
allHostnames = malloc (parentSize*(HOST_NAME_MAX+1)); |
1685 |
|
assert (allHostnames != NULL); |
1686 |
|
|
1687 |
|
// Collect the hostnames from all the ranks |
1688 |
|
MPI_Gather (myHostname, HOST_NAME_MAX+1, MPI_CHAR, |
1689 |
|
allHostnames, HOST_NAME_MAX+1, MPI_CHAR, |
1690 |
|
0, parentComm); |
1691 |
|
} |
1692 |
|
|
1693 |
if (numRanksPerNode <= 0) { |
if (numRanksPerNode <= 0) { |
1694 |
// Might also want to check for an env var (or something) |
// Might also want to check for an env var (or something) |
1695 |
numRanksPerNode = getNumCores(); |
// N.B.: getNumRanksPerNode uses an MPI collective on the communicator |
1696 |
|
// and needs the global "allHostnames" to already be set in rank 0. |
1697 |
|
numRanksPerNode = getNumRanksPerNode(parentComm); |
1698 |
} |
} |
1699 |
|
|
1700 |
// Fill in the zDepth field of the fieldInfoThisEpoch_t descriptors |
// Fill in the zDepth field of the fieldInfoThisEpoch_t descriptors |
1716 |
} |
} |
1717 |
|
|
1718 |
|
|
|
// Find the max MPI "tag" value |
|
|
MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, &intPtr, &tagUBexists); |
|
|
ASSERT(tagUBexists); |
|
|
maxTagValue = *intPtr; |
|
|
|
|
|
|
|
|
// Info about the parent communicator |
|
|
MPI_Initialized(&mpiIsInitialized); |
|
|
ASSERT(mpiIsInitialized); |
|
|
MPI_Comm_size(parentComm, &parentSize); |
|
|
MPI_Comm_rank(parentComm, &parentRank); |
|
|
|
|
1719 |
|
|
1720 |
// Figure out how many nodes we can use for i/o. |
// Figure out how many nodes we can use for i/o. |
1721 |
// To make things (e.g. memory calculations) simpler, we want |
// To make things (e.g. memory calculations) simpler, we want |
1731 |
numIORanks = numIONodes * numRanksPerNode; |
numIORanks = numIONodes * numRanksPerNode; |
1732 |
|
|
1733 |
|
|
1734 |
|
// Print out the hostnames, identifying which ones are i/o nodes |
1735 |
|
|
1736 |
|
if (0 == parentRank) { |
1737 |
|
printf ("\n%d total nodes, %d i/o, %d compute\n", |
1738 |
|
totalNumNodes, numIONodes, numComputeNodes); |
1739 |
|
for (i = 0; i < parentSize; i += numRanksPerNode) { |
1740 |
|
if (isIORank (i, totalNumNodes, numIONodes)) { |
1741 |
|
printf ("\n(%s)", &(allHostnames[i][0])); |
1742 |
|
} else { |
1743 |
|
printf (" %s", &(allHostnames[i][0])); |
1744 |
|
} |
1745 |
|
} |
1746 |
|
printf ("\n\n"); |
1747 |
|
} |
1748 |
|
fflush (stdout); |
1749 |
|
|
1750 |
|
|
1751 |
|
|
1752 |
// It is surprisingly easy to launch the job with the wrong number |
// It is surprisingly easy to launch the job with the wrong number |
1753 |
// of ranks, particularly if the number of compute ranks is not a |
// of ranks, particularly if the number of compute ranks is not a |
1754 |
// multiple of numRanksPerNode (the tendency is to just launch one |
// multiple of numRanksPerNode (the tendency is to just launch one |
1773 |
MPI_Comm_rank(newIntracomm, &newIntracommRank); |
MPI_Comm_rank(newIntracomm, &newIntracommRank); |
1774 |
|
|
1775 |
// Excess ranks disappear |
// Excess ranks disappear |
1776 |
// N.B. "parentSize" still includes these ranks. |
// N.B. "parentComm" (and parentSize) still includes these ranks, so |
1777 |
|
// we cannot do MPI *collectives* using parentComm after this point, |
1778 |
|
// although the communicator can still be used for other things. |
1779 |
if (isExcess == thisRanksType) { |
if (isExcess == thisRanksType) { |
1780 |
MPI_Finalize(); |
MPI_Finalize(); |
1781 |
exit(0); |
exit(0); |
1819 |
// the fields, and create an inter-communicator for each split. |
// the fields, and create an inter-communicator for each split. |
1820 |
|
|
1821 |
for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) { |
for (epochStyleIndex = 0; epochStyleIndex < numEpochStyles; ++epochStyleIndex) { |
1822 |
|
if (0 == parentRank) { |
1823 |
|
printf ("Set up epoch style %d\n", epochStyleIndex); |
1824 |
|
} |
1825 |
|
|
1826 |
fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex]; |
fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex]; |
1827 |
int fieldAssignments[numIORanks]; |
int fieldAssignments[numIORanks]; |
1828 |
int rankAssignments[numComputeRanks + numIORanks]; |
int rankAssignments[numComputeRanks + numIORanks]; |
1830 |
// Count the number of fields in this epoch style |
// Count the number of fields in this epoch style |
1831 |
for (nF = 0; thisEpochStyle[nF].dataFieldID != '\0'; ++nF) ; |
for (nF = 0; thisEpochStyle[nF].dataFieldID != '\0'; ++nF) ; |
1832 |
|
|
1833 |
// Decide how to apportion the i/o ranks among the fields |
// Decide how to apportion the i/o ranks among the fields. |
1834 |
|
// (Currently, this call also sets the "chunkWriters" array.) |
1835 |
for (i=0; i < numIORanks; ++i) fieldAssignments[i] = -1; |
for (i=0; i < numIORanks; ++i) fieldAssignments[i] = -1; |
1836 |
|
int nToWrite[numIORanks/numRanksPerNode]; |
1837 |
computeIORankAssigments(numComputeRanks, numIORanks, nF, |
computeIORankAssigments(numComputeRanks, numIORanks, nF, |
1838 |
thisEpochStyle, fieldAssignments); |
thisEpochStyle, fieldAssignments, nToWrite); |
1839 |
// Sanity check |
// Sanity check |
1840 |
for (i=0; i < numIORanks; ++i) { |
for (i=0; i < numIORanks; ++i) { |
1841 |
ASSERT((fieldAssignments[i] >= 0) && (fieldAssignments[i] < nF)); |
ASSERT((fieldAssignments[i] >= 0) && (fieldAssignments[i] < nF)); |
1857 |
} |
} |
1858 |
ASSERT(j == numIORanks); |
ASSERT(j == numIORanks); |
1859 |
|
|
1860 |
|
|
1861 |
if (0 == parentRank) { |
if (0 == parentRank) { |
1862 |
printf("\ncpu assignments, epoch style %d\n", epochStyleIndex); |
printf ("Create communicators for epoch style %d\n", epochStyleIndex); |
|
for (i = 0; i < numComputeNodes + numIONodes ; ++i) { |
|
|
if (rankAssignments[i*numRanksPerNode] >= 0) { |
|
|
// i/o node |
|
|
for (j = 0; j < numRanksPerNode; ++j) { |
|
|
printf(" %1d", rankAssignments[i*numRanksPerNode + j]); |
|
|
} |
|
|
} else { |
|
|
// compute node |
|
|
for (j = 0; j < numRanksPerNode; ++j) { |
|
|
if ((i*numRanksPerNode + j) >= (numComputeRanks + numIORanks)) break; |
|
|
ASSERT(-1 == rankAssignments[i*numRanksPerNode + j]); |
|
|
} |
|
|
printf(" #"); |
|
|
for (; j < numRanksPerNode; ++j) { // "excess" ranks (if any) |
|
|
printf("X"); |
|
|
} |
|
|
} |
|
|
printf(" "); |
|
|
} |
|
|
printf("\n\n"); |
|
1863 |
} |
} |
1864 |
|
fflush (stdout); |
1865 |
|
|
1866 |
|
|
1867 |
// Find the lowest rank assigned to computation; use it as |
// Find the lowest rank assigned to computation; use it as |
1868 |
// the "root" for the upcoming intercomm creates. |
// the "root" for the upcoming intercomm creates. |
1870 |
// Sanity check |
// Sanity check |
1871 |
if ((isCompute == thisRanksType) && (0 == newIntracommRank)) ASSERT(compRoot == parentRank); |
if ((isCompute == thisRanksType) && (0 == newIntracommRank)) ASSERT(compRoot == parentRank); |
1872 |
|
|
|
|
|
1873 |
// If this is an I/O rank, split the newIntracomm (which, for |
// If this is an I/O rank, split the newIntracomm (which, for |
1874 |
// the i/o ranks, is a communicator among all the i/o ranks) |
// the i/o ranks, is a communicator among all the i/o ranks) |
1875 |
// into the pieces as assigned. |
// into the pieces as assigned. |
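// A hedged sketch of the split-and-connect pattern this comment is
// describing; the variable names follow this routine, but the exact
// calls in the surrounding (elided) code may differ.  Each i/o rank
// splits the i/o intracomm by its assigned field, and the resulting
// field group is then joined to the compute group through an
// intercommunicator, with "compRoot" as the remote leader.

static void
fieldIntercommSketch (MPI_Comm newIntracomm, MPI_Comm parentComm,
                      int myField, int compRoot, MPI_Comm *dataIntercomm)
{
    MPI_Comm fieldIntracomm;

    MPI_Comm_split (newIntracomm, myField, 0, &fieldIntracomm);
    MPI_Intercomm_create (fieldIntracomm, 0, parentComm, compRoot,
                          myField /* tag */, dataIntercomm);
}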
1898 |
// ... and dup a separate one for tile registrations |
// ... and dup a separate one for tile registrations |
1899 |
MPI_Comm_dup(thisEpochStyle[myField].dataIntercomm, |
MPI_Comm_dup(thisEpochStyle[myField].dataIntercomm, |
1900 |
&(thisEpochStyle[myField].registrationIntercomm)); |
&(thisEpochStyle[myField].registrationIntercomm)); |
1901 |
|
|
1902 |
|
|
1903 |
|
// Sanity check: make sure the chunkWriters array looks ok. |
1904 |
|
{ |
1905 |
|
int ii, jj, commSize, nChunks = thisEpochStyle[myField].nChunks; |
1906 |
|
ASSERT (nChunks > 0); |
1907 |
|
ASSERT (thisEpochStyle[myField].chunkWriters != NULL); |
1908 |
|
MPI_Comm_size (thisEpochStyle[myField].dataIntercomm, &commSize); |
1909 |
|
for (ii = 0; ii < nChunks; ++ii) { |
1910 |
|
ASSERT (thisEpochStyle[myField].chunkWriters[ii] >= 0); |
1911 |
|
ASSERT (thisEpochStyle[myField].chunkWriters[ii] < commSize); |
1912 |
|
for (jj = ii+1; jj < nChunks; ++jj) { |
1913 |
|
ASSERT (thisEpochStyle[myField].chunkWriters[ii] != |
1914 |
|
thisEpochStyle[myField].chunkWriters[jj]); |
1915 |
|
} |
1916 |
|
} |
1917 |
|
} |
1918 |
} |
} |
1919 |
} |
} |
1920 |
else { |
else { |
1937 |
} |
} |
1938 |
} |
} |
1939 |
|
|
1940 |
|
|
1941 |
|
// Print a "map" of the core assignments. Compute nodes are indicated |
1942 |
|
// by a "#" for the whole node, i/o nodes have the individual cores |
1943 |
|
// within the node broken out and their field assignment printed. |
1944 |
|
// If the core is writing to the disk, the field assignment is printed |
1945 |
|
// in parentheses. |
1946 |
|
if (0 == parentRank) { |
1947 |
|
int fieldIOCounts[nF]; |
1948 |
|
memset (fieldIOCounts, 0, nF*sizeof(int)); |
1949 |
|
|
1950 |
|
printf ("Writer assignments, epoch style %d\n", epochStyleIndex); |
1951 |
|
for (i = 0; i < nF; ++i) { |
1952 |
|
fieldInfoThisEpoch_t *p = &(thisEpochStyle[i]); |
1953 |
|
int chunkSize = divCeil(p->zDepth,p->nChunks); |
1954 |
|
printf ("\n"); |
1955 |
|
printf ( "field %2d ('%c'): %1d levels, %1d writers, %1d" |
1956 |
|
" levels per writer (last writer = %d levels)\n", |
1957 |
|
i, p->dataFieldID, p->zDepth, p->nChunks, |
1958 |
|
chunkSize, p->zDepth - ((p->nChunks - 1)*chunkSize)); |
1959 |
|
for (j = 0; j < thisEpochStyle[i].nChunks; ++j) { |
1960 |
|
printf (" %1d", thisEpochStyle[i].chunkWriters[j]); |
1961 |
|
} |
1962 |
|
printf ("\n"); |
1963 |
|
} |
1964 |
|
printf ("\n"); |
1965 |
|
|
1966 |
|
printf("\ncpu assignments, epoch style %d\n", epochStyleIndex); |
1967 |
|
int whichIONode = -1; |
1968 |
|
for (i = 0; i < numComputeNodes + numIONodes ; ++i) { |
1969 |
|
|
1970 |
|
if (rankAssignments[i*numRanksPerNode] >= 0) { |
1971 |
|
|
1972 |
|
// i/o node |
1973 |
|
++whichIONode; |
1974 |
|
printf("\n%s (%d Z, %ld bytes):", |
1975 |
|
&(allHostnames[i*numRanksPerNode][0]), |
1976 |
|
nToWrite[whichIONode], |
1977 |
|
nToWrite[whichIONode]*twoDFieldSizeInBytes); |
1978 |
|
|
1979 |
|
for (j = 0; j < numRanksPerNode; ++j) { |
1980 |
|
|
1981 |
|
int assignedField = rankAssignments[i*numRanksPerNode + j]; |
1982 |
|
int k, commRank, nChunks, isWriter; |
1983 |
|
nChunks = thisEpochStyle[assignedField].nChunks; |
1984 |
|
commRank = fieldIOCounts[assignedField]; |
1985 |
|
|
1986 |
|
isWriter = 0; |
1987 |
|
for (k = 0; k < nChunks; ++k) { |
1988 |
|
if (thisEpochStyle[assignedField].chunkWriters[k] == commRank) { |
1989 |
|
isWriter = 1; |
1990 |
|
break; |
1991 |
|
} |
1992 |
|
} |
1993 |
|
printf(isWriter ? " (%1d)" : " %1d ", assignedField); |
1994 |
|
|
1995 |
|
fieldIOCounts[assignedField] += 1; |
1996 |
|
} |
1997 |
|
printf("\n"); |
1998 |
|
|
1999 |
|
} else { |
2000 |
|
|
2001 |
|
// compute node |
2002 |
|
for (j = 0; j < numRanksPerNode; ++j) { |
2003 |
|
if ((i*numRanksPerNode + j) >= (numComputeRanks + numIORanks)) break; |
2004 |
|
ASSERT(-1 == rankAssignments[i*numRanksPerNode + j]); |
2005 |
|
} |
2006 |
|
printf(" #"); |
2007 |
|
for (; j < numRanksPerNode; ++j) { // "excess" ranks (if any) |
2008 |
|
printf("X"); |
2009 |
|
} |
2010 |
|
} |
2011 |
|
printf(" "); |
2012 |
|
} |
2013 |
|
printf("\n\n"); |
2014 |
|
} |
2015 |
|
|
2016 |
|
|
2017 |
|
|
2018 |
} // epoch style loop |
} // epoch style loop |
2019 |
|
|
2020 |
|
// Note: allHostnames is only non-NULL in rank 0, but it's ok to "free" NULL |
2021 |
|
free(allHostnames); |
2022 |
|
|
2023 |
|
|
2024 |
// I/O processes split off and start receiving data |
// I/O processes split off and start receiving data |
2025 |
// NOTE: the I/O processes DO NOT RETURN from this call |
// NOTE: the I/O processes DO NOT RETURN from this call |
2026 |
if (isIO == thisRanksType) ioRankMain(totalNumTiles); |
if (isIO == thisRanksType) ioRankMain(); |
2027 |
|
|
2028 |
|
|
2029 |
// The "compute" processes now return with their new intra-communicator. |
// The "compute" processes now return with their new intra-communicator. |
2032 |
// but first, call mpiio initialization routine! |
// but first, call the MPI-IO initialization routine! |
2033 |
initSizesAndTypes(); |
initSizesAndTypes(); |
2034 |
|
|
2035 |
|
fflush (stdout); |
2036 |
return; |
return; |
2037 |
} |
} |
2038 |
|
|
2135 |
abort(); |
abort(); |
2136 |
} |
} |
2137 |
|
|
2138 |
|
|
2139 |
|
|
2140 |
|
// not necessary for correctness, but for better timings |
2141 |
|
MPI_Barrier(computeIntracomm); |
2142 |
|
if (0 == computeIntracommRank) |
2143 |
|
fprintf(stderr,"compute ready to emit %d %d %f\n",newEpochID,gcmIter,MPI_Wtime()); |
2144 |
|
|
2145 |
|
|
2146 |
|
|
2147 |
//////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////// |
2148 |
// Need to be sure the i/o tasks are done processing the previous epoch |
// Need to be sure the i/o tasks are done processing the previous epoch |
2149 |
// before any compute tasks start sending tiles from a new epoch. |
// before any compute tasks start sending tiles from a new epoch. |
2163 |
// Compute ranks wait here until rank 0 gets the ack from the i/o ranks |
// Compute ranks wait here until rank 0 gets the ack from the i/o ranks |
2164 |
MPI_Barrier(computeIntracomm); |
MPI_Barrier(computeIntracomm); |
2165 |
|
|
2166 |
|
if (0 == computeIntracommRank) |
2167 |
|
fprintf(stderr,"compute handshake completed %d %d %f\n",newEpochID,gcmIter,MPI_Wtime()); |
2168 |
|
|
2169 |
|
|
2170 |
currentEpochID = newEpochID; |
currentEpochID = newEpochID; |
2171 |
currentEpochStyle = epochStyle; |
currentEpochStyle = epochStyle; |
2206 |
if (p->dataFieldID == dataFieldID) break; |
if (p->dataFieldID == dataFieldID) break; |
2207 |
} |
} |
2208 |
// Make sure we found a valid field |
// Make sure we found a valid field |
|
|
|
2209 |
ASSERT(p->dataIntercomm != MPI_COMM_NULL); |
ASSERT(p->dataIntercomm != MPI_COMM_NULL); |
2210 |
|
|
2211 |
MPI_Comm_remote_size(p->dataIntercomm, &remoteCommSize); |
MPI_Comm_remote_size(p->dataIntercomm, &remoteCommSize); |
2217 |
ASSERT(p->dataFieldID == dataFieldID); |
ASSERT(p->dataFieldID == dataFieldID); |
2218 |
|
|
2219 |
receiver = (computeIntracommRank + p->tileCount) % remoteCommSize; |
receiver = (computeIntracommRank + p->tileCount) % remoteCommSize; |
|
|
|
2220 |
tag = (tileID << numCheckBits) | (epochID & bitMask); |
tag = (tileID << numCheckBits) | (epochID & bitMask); |
2221 |
|
|
|
//fprintf(stderr,"%d %d\n",flag,tileID); |
|
|
|
|
|
/* |
|
|
if (tileID==189){ |
|
|
int i,j; |
|
|
for (i=0;i<TILE_Y;++i){ |
|
|
for (j=0;j<TILE_X;++j) |
|
|
fprintf(stderr,"%f ",((double*)data)[872+i*108+j]); |
|
|
fprintf(stderr,"\n"); |
|
|
} |
|
|
} |
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
2222 |
FPRINTF(stderr,"Rank %d sends field '%c', tile %d, to i/o task %d with tag %d(%d)\n", |
FPRINTF(stderr,"Rank %d sends field '%c', tile %d, to i/o task %d with tag %d(%d)\n", |
2223 |
computeIntracommRank, dataFieldID, tileID, receiver, tag, tag >> numCheckBits); |
computeIntracommRank, dataFieldID, tileID, receiver, tag, tag >> numCheckBits); |
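// Worked example of the tag encoding above, assuming numCheckBits == 2
// (so bitMask == 0x3); the real values are defined elsewhere in this
// file and may differ.  Tile 57 in epoch 6 encodes to tag 230, and the
// receiver recovers both pieces.

static void tagEncodingSketch (void)
{
    const int checkBits = 2;                 // assumed value of numCheckBits
    const int mask = (1 << checkBits) - 1;   // 0x3
    int tileID = 57, epochID = 6;

    int tag = (tileID << checkBits) | (epochID & mask);   // (57<<2)|2 == 230

    assert (tag == 230);
    assert ((tag >> checkBits) == tileID);                // tile ID recovered
    assert ((tag & mask) == (epochID & mask));            // epoch low bits as a check
}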
2224 |
|
|
2254 |
|
|
2255 |
|
|
2256 |
|
|
|
void myasyncio_set_global_sizes_(int *nx, int *ny, int *nz, |
|
|
int *nt, int *nb, |
|
|
int *tx, int *ty, |
|
|
int *ox, int *oy) |
|
|
{ |
|
|
|
|
|
|
|
|
int data[] = {*nx,*ny,*nz,*nt,*nb,*tx,*ty,*ox,*oy}; |
|
|
|
|
|
int items = sizeof(data)/sizeof(int); |
|
|
|
|
|
|
|
|
NTILES = *nt; // total number of tiles |
|
|
|
|
|
|
|
|
TILE_X = *tx; |
|
|
TILE_Y = *ty; |
|
|
XGHOSTS = *ox; |
|
|
YGHOSTS = *oy; |
|
|
tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS)); |
|
|
tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize); // needed by compute ranks in f3 |
|
|
|
|
|
|
|
|
int rank,ierr; |
|
|
MPI_Comm_rank(globalIntercomm,&rank); |
|
|
|
|
|
|
|
|
if (0==rank) |
|
|
printf("%d %d %d\n%d %d\n%d %d\n%d %d\n",*nx,*ny,*nz,*nt,*nb,*tx,*ty,*ox,*oy); |
|
|
|
|
|
|
|
|
/* |
|
|
if (0==rank) |
|
|
ierr=MPI_Bcast(&NTILES,1,MPI_INT,MPI_ROOT,globalIntercomm); |
|
|
else |
|
|
ierr=MPI_Bcast(&NTILES,1,MPI_INT,MPI_PROC_NULL,globalIntercomm); |
|
|
*/ |
|
|
|
|
|
if (0==rank) |
|
|
ierr=MPI_Bcast(data,items,MPI_INT,MPI_ROOT,globalIntercomm); |
|
|
else |
|
|
ierr=MPI_Bcast(data,items,MPI_INT,MPI_PROC_NULL,globalIntercomm); |
|
|
|
|
|
} |
|
|
|
|
2257 |
void asyncio_tile_arrays_(int *xoff, int *yoff, int *xskip) |
void asyncio_tile_arrays_(int *xoff, int *yoff, int *xskip) |
2258 |
{ |
{ |
2259 |
int rank,ierr; |
int rank, ierr, rootProc; |
|
MPI_Comm_rank(globalIntercomm,&rank); |
|
|
|
|
|
if (0==rank) |
|
|
ierr=MPI_Bcast(xoff,NTILES,MPI_INT,MPI_ROOT,globalIntercomm); |
|
|
else |
|
|
ierr=MPI_Bcast(xoff,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm); |
|
|
|
|
|
if (0==rank) |
|
|
ierr=MPI_Bcast(yoff,NTILES,MPI_INT,MPI_ROOT,globalIntercomm); |
|
|
else |
|
|
ierr=MPI_Bcast(yoff,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm); |
|
2260 |
|
|
2261 |
if (0==rank) |
MPI_Comm_rank(globalIntercomm,&rank); |
2262 |
ierr=MPI_Bcast(xskip,NTILES,MPI_INT,MPI_ROOT,globalIntercomm); |
rootProc = (0 == rank) ? MPI_ROOT : MPI_PROC_NULL; |
|
else |
|
|
ierr=MPI_Bcast(xskip,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm); |
|
2263 |
|
|
2264 |
|
ierr = MPI_Bcast (xoff, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm); |
2265 |
|
ierr = MPI_Bcast (yoff, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm); |
2266 |
|
ierr = MPI_Bcast (xskip, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm); |
2267 |
} |
} |
2268 |
|
|
2269 |
|
|
2277 |
bron_f1( |
bron_f1( |
2278 |
MPI_Fint *ptr_parentComm, |
MPI_Fint *ptr_parentComm, |
2279 |
int *ptr_numComputeRanks, |
int *ptr_numComputeRanks, |
2280 |
int *ptr_totalNumTiles, |
int *ptr_numTiles, |
2281 |
MPI_Fint *rtnComputeComm) |
MPI_Fint *rtnComputeComm) |
2282 |
{ |
{ |
2283 |
// Convert the Fortran handle into a C handle |
// Convert the Fortran handle into a C handle |
2284 |
MPI_Comm newComm, parentComm = MPI_Comm_f2c(*ptr_parentComm); |
MPI_Comm newComm, parentComm = MPI_Comm_f2c(*ptr_parentComm); |
2285 |
|
|
2286 |
f1(parentComm, *ptr_numComputeRanks, *ptr_totalNumTiles, &newComm); |
f1(parentComm, *ptr_numComputeRanks, *ptr_numTiles, &newComm); |
2287 |
|
|
2288 |
// Convert the C handle into a Fortran handle |
// Convert the C handle into a Fortran handle |
2289 |
*rtnComputeComm = MPI_Comm_c2f(newComm); |
*rtnComputeComm = MPI_Comm_c2f(newComm); |
2319 |
f4(*ptr_epochID); |
f4(*ptr_epochID); |
2320 |
} |
} |
2321 |
|
|
2322 |
|
|