
Diff of /MITgcm_contrib/llc_hires/llc_1080/code-async/recvTask.c


revision 1.1 by dimitri, Sun Sep 29 22:43:01 2013 UTC  vs.  revision 1.3 by dimitri, Sat Mar 2 19:31:13 2019 UTC
# Line 1  Line 1 
1    
2  // Code to do the m-on-n fan-in, recompositing, and output, of data  // Code to do the m-on-n fan-in, recompositing, and output, of data
3  // tiles for mitGCM.  // tiles for MITgcm.
4  //  //
5  // There are aspects of this code that would be simpler in C++, but  // There are aspects of this code that would be simpler in C++, but
6  // we deliberately wrote the code in ansi-C to make linking it with  // we deliberately wrote the code in ansi-C to make linking it with
# Line 9  Line 9 
9    
10    
11  #include "PACKAGES_CONFIG.h"  #include "PACKAGES_CONFIG.h"
12    #include "SIZE.h"
13    
14  #include <stdio.h>  #include <stdio.h>
15  #include <unistd.h>  #include <unistd.h>
# Line 17  Line 18 
18  #include <sys/types.h>  #include <sys/types.h>
19  #include <sys/stat.h>  #include <sys/stat.h>
20  #include <fcntl.h>  #include <fcntl.h>
21    #include <assert.h>
22    
23  #include <mpi.h>  #include <mpi.h>
24  #include <alloca.h>  #include <alloca.h>
25    
26    
27  #include <stdint.h>  #include <stdint.h>
28    #include <limits.h>
29  #include <errno.h>  #include <errno.h>
30    
31  #define DEBUG 1  #define DEBUG 1
32    
33  #if (DEBUG >= 2)  #if (DEBUG >= 3)
34  #define FPRINTF fprintf  #define FPRINTF fprintf
35  #else  #else
36  #include <stdarg.h>  #include <stdarg.h>
# Line 35  void FPRINTF(FILE *fp,...){return;} Line 38  void FPRINTF(FILE *fp,...){return;}
38  #endif  #endif
39    
40    
41    #if (DEBUG >= 2)
42  // Define our own version of "assert", where we sleep rather than abort.  // Define our own version of "assert", where we sleep rather than abort.
43  // This makes it easier to attach the debugger.  // This makes it easier to attach the debugger.
 #if (DEBUG >= 1)  
44  #define ASSERT(_expr)  \  #define ASSERT(_expr)  \
45      if (!(_expr))  { \      if (!(_expr))  { \
46          fprintf(stderr, "ASSERT failed for pid %d : `%s': %s: %d: %s\n", \          fprintf(stderr, "ASSERT failed for pid %d : `%s': %s: %d: %s\n", \
# Line 45  void FPRINTF(FILE *fp,...){return;} Line 48  void FPRINTF(FILE *fp,...){return;}
48          sleep(9999); \          sleep(9999); \
49      }      }
50  #else  #else
51    // Use the standard version of "assert"
52  #define ASSERT assert  #define ASSERT assert
53  #endif  #endif
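// Usage note (illustrative, not from either revision): with the sleep()-style
// ASSERT above, a debugger can be attached to the stuck rank, e.g. with
// "gdb -p <pid>", using the pid printed in the ASSERT message.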
54    
55    
56    // The hostname that each rank is running on.  Currently, only rank 0
57    // uses this.  It is allocated and deallocated in routine "f1"
58    char (*allHostnames)[HOST_NAME_MAX+1]  = NULL;
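// A minimal sketch (an illustration, not the file's routine "f1", whose body
// lies outside the hunks shown here) of how allHostnames could be filled:
// each rank looks up its own name with gethostname() from <unistd.h> and
// rank 0 gathers the fixed-size strings over MPI.
static void gatherHostnamesSketch(MPI_Comm comm)
{
    int myRank, numRanks;
    MPI_Comm_rank(comm, &myRank);
    MPI_Comm_size(comm, &numRanks);

    char myHost[HOST_NAME_MAX+1] = {0};
    gethostname(myHost, sizeof(myHost));

    if (0 == myRank) {
        allHostnames = malloc(numRanks * sizeof(*allHostnames));
        ASSERT(allHostnames != NULL);
    }

    // Fixed-size strings, so a plain MPI_Gather suffices; only rank 0
    // needs a valid receive buffer.
    MPI_Gather(myHost, HOST_NAME_MAX+1, MPI_CHAR,
               allHostnames, HOST_NAME_MAX+1, MPI_CHAR, 0, comm);
}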
59    
60    
61  // If numRanksPerNode is set to be > 0, we just use that setting.  // If numRanksPerNode is set to be > 0, we just use that setting.
62  // If it is <= 0, we dynamically determine the number of cores on  // If it is <= 0, we dynamically determine the number of cores on
63  // a node, and use that.  (N.B.: we assume *all* nodes being used in  // a node, and use that.  (N.B.: we assume *all* nodes being used in
64  // the run have the *same* number of cores.)  // the run have the *same* number of MPI processes on each.)
65  int numRanksPerNode = 0;  int numRanksPerNode = 0;
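// A minimal sketch (an assumption about how the dynamic case could be done,
// not code from this file) of counting the MPI ranks that share a node,
// using the MPI-3 call MPI_Comm_split_type.  The result would stand in for
// numRanksPerNode when the setting above is <= 0.
static int countRanksOnThisNode(MPI_Comm comm)
{
    MPI_Comm nodeComm;
    int ranksOnNode;

    // One sub-communicator per shared-memory domain, i.e. per node.
    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &nodeComm);
    MPI_Comm_size(nodeComm, &ranksOnNode);
    MPI_Comm_free(&nodeComm);

    return ranksOnNode;
}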
66    
67  // Just an error check; can be zero (if you're confident it's all correct).  // Just an error check; can be zero (if you're confident it's all correct).
# Line 63  int numRanksPerNode = 0; Line 71  int numRanksPerNode = 0;
71    
72    
73  ///////////////////////////////////////////////////////////////////////  ///////////////////////////////////////////////////////////////////////
74  // Info about the data fields  // Fundamental info about the data fields.  Most (but not all) of
75    // this is copied or derived from the values in "SIZE.h"
76    
77  // double for now.  Might also be float someday  // double for now.  Might also be float someday
78  #define datumType  double  #define datumType  double
# Line 72  int numRanksPerNode = 0; Line 81  int numRanksPerNode = 0;
81    
82  // Info about the data fields.  We assume that all the fields have the  // Info about the data fields.  We assume that all the fields have the
83  // same X and Y characteristics, but may have either 1 or NUM_Z levels.  // same X and Y characteristics, but may have either 1 or NUM_Z levels.
84  //  //   bcn: or MULTDIM levels (evidently used by "sea ice").
85  // the following 3 values are required here. Could be sent over or read in  
86  // with some rearrangement in init routines  // N.B. Please leave the seemingly unneeded "(long int)" casts in
87  //  // place.  These individual values may not need them, but they get
88  #define NUM_X   1080  // used in calculations where the extra length might be important.
89  #define NUM_Y   14040L                     // get rid of this someday  #define NUM_X   ((long int) sFacet)
90  #define NUM_Z   90  #define NUM_Y   (((long int) sFacet) * 13)
91  #define MULTDIM  7  #define NUM_Z   ((long int) Nr)
92    #define MULTDIM ((long int) 7)
93    
94    // Some values derived from the above constants
95  #define twoDFieldSizeInBytes  (NUM_X * NUM_Y * 1 * datumSize)  #define twoDFieldSizeInBytes  (NUM_X * NUM_Y * 1 * datumSize)
96  #define threeDFieldSizeInBytes  (twoDFieldSizeInBytes * NUM_Z)  #define threeDFieldSizeInBytes  (twoDFieldSizeInBytes * NUM_Z)
97  #define multDFieldSizeInBytes  (twoDFieldSizeInBytes * MULTDIM)  #define multDFieldSizeInBytes  (twoDFieldSizeInBytes * MULTDIM)
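// Worked example (illustrative, using the llc_1080 values from the older
// revision of this file: NUM_X = 1080, NUM_Y = 1080*13 = 14040, NUM_Z = 90,
// datumSize = sizeof(double) = 8):
//   twoDFieldSizeInBytes   = 1080 * 14040 * 8 =    121,305,600 bytes
//   threeDFieldSizeInBytes = 121,305,600 * 90 = 10,917,504,000 bytes
//   multDFieldSizeInBytes  = 121,305,600 *  7 =    849,139,200 bytes
// The 3-D size already exceeds INT_MAX (2,147,483,647), which is why the
// "(long int)" casts above are not as redundant as they look.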
# Line 87  int numRanksPerNode = 0; Line 99  int numRanksPerNode = 0;
99  // Info about the data tiles.  We assume that all the tiles are the same  // Info about the data tiles.  We assume that all the tiles are the same
100  // size (no odd-sized last piece), they all have the same X and Y  // size (no odd-sized last piece), they all have the same X and Y
101  // characteristics (including ghosting), and are full depth in Z  // characteristics (including ghosting), and are full depth in Z
102  // (either 1 or NUM_Z as appropriate).  // (either 1, or NUM_Z, or MULTDIM, as appropriate).
103  //  #define TILE_X  sNx
104  // all these data now sent over from compute ranks  #define TILE_Y  sNy
105  //  #define XGHOSTS OLx
106  int TILE_X = -1;  #define YGHOSTS OLy
107  int TILE_Y =  -1;  #define TOTAL_NUM_TILES  ((sFacet / sNx) * (sFacet / sNy) * 13)
108  int XGHOSTS  = -1;  #define NUM_WET_TILES  (nPx * nPy)
109  int YGHOSTS =  -1;  
110    // Size of one Z level of an individual tile.  This is the "in memory"
111  // Size of one Z level of a tile (NOT one Z level of a whole field)  // size, including ghosting.
112  int tileOneZLevelItemCount = -1;  #define tileOneZLevelItemCount  (((long)(TILE_X + 2*XGHOSTS)) * (TILE_Y + 2*YGHOSTS))
113  int tileOneZLevelSizeInBytes = -1;  #define tileOneZLevelSizeInBytes  (tileOneZLevelItemCount * datumSize)
114    
115    
116    
117    //////////////////////////////////////////////////////////////////////
118    // Define the depth (i.e. number of Z-levels) of the various fields.
119    
120  typedef struct dataFieldDepth {  typedef struct dataFieldDepth {
121      char dataFieldID;      char dataFieldID;
122      int numZ;      int numZ;
123  } dataFieldDepth_t;  } dataFieldDepth_t;
124    
125  dataFieldDepth_t fieldDepths[] = {  dataFieldDepth_t fieldDepths[] = {
126     { 'u', NUM_Z },     { 'A', MULTDIM },    // seaice, 7 == MULTDIM in SEAICE_SIZE.h
127     { 'v', NUM_Z },  
128     { 'w', NUM_Z },     { 'B', 1 },
129     { 't', NUM_Z },     { 'C', 1 },
130     { 's', NUM_Z },     { 'D', 1 },
131     { 'x', NUM_Z },     { 'E', 1 },
132     { 'y', NUM_Z },     { 'F', 1 },
133     { 'n',     1 },     { 'G', 1 },
134     { 'd',     1 },     { 'H', 1 },
135     { 'h',     1 },     { 'I', 1 },
136     { 'a', MULTDIM },    // seaice, 7 == MULTDIM in SEAICE_SIZE.h     { 'J', 1 },
137     { 'b', 1 },     { 'K', 1 },
138     { 'c', 1 },     { 'L', 1 },
139     { 'd', 1 },     { 'M', 1 },
140     { 'e', 1 },     { 'N', 1 },
141     { 'f', 1 },     { 'O', 1 },
142     { 'g', 1 },     { 'P', 1 },
143       { 'Q', 1 },
144       { 'R', 1 },
145    
146       { 'S', NUM_Z },
147       { 'T', NUM_Z },
148       { 'U', NUM_Z },
149       { 'V', NUM_Z },
150       { 'W', NUM_Z },
151       { 'X', NUM_Z },
152       { 'Y', NUM_Z },
153  };  };
154  #define numAllFields  (sizeof(fieldDepths)/sizeof(dataFieldDepth_t))  #define numAllFields  (sizeof(fieldDepths)/sizeof(dataFieldDepth_t))
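// A small illustrative helper (a sketch only; the file's own lookup,
// findField(), appears further down and is not shown in this hunk): map a
// dataFieldID character to its depth by scanning the table above.
static int fieldDepthForID(char id)
{
    size_t i;
    for (i = 0;  i < numAllFields;  ++i) {
        if (fieldDepths[i].dataFieldID == id) {
            return fieldDepths[i].numZ;
        }
    }
    return -1;   // unknown field id
}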
155    
156    
   
157  ///////////////////////////////////////////////////////////////////////  ///////////////////////////////////////////////////////////////////////
158  // Info about the various kinds of i/o epochs  // Info about the various kinds of i/o epochs
159  typedef struct epochFieldInfo {  typedef struct epochFieldInfo {
160      char dataFieldID;      char dataFieldID;
161      MPI_Comm registrationIntercomm;      MPI_Comm registrationIntercomm;
162      MPI_Comm dataIntercomm;      MPI_Comm dataIntercomm;
163      MPI_Comm ioRanksIntracomm;      MPI_Comm ioRanksIntracomm;  // i/o ranks for *this* field
164      int tileCount;      long int tileCount;
165      int zDepth;  // duplicates the fieldDepth entry; filled in automatically      int zDepth;  // duplicates the fieldDepth entry; filled in automatically
166        int *chunkWriters;  // Which rank writes the i'th chunk of z-levels
167        int nChunks;  // length of chunkWriters array
168      char filenameTemplate[128];      char filenameTemplate[128];
169      long int offset;      long int offset;
170      int pickup;      int pickup;
171  } fieldInfoThisEpoch_t;  } fieldInfoThisEpoch_t;
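// Illustrative only: how a filenameTemplate such as "U.%010d.%s" expands.
// The actual snprintf call sites are outside the hunks shown here, and the
// "data" suffix is an assumption based on the usual MITgcm ".data"/".meta"
// file pair:
//
//   char fname[200];
//   snprintf(fname, sizeof(fname), "U.%010d.%s", gcmIter, "data");
//   // gcmIter == 1234  ->  "U.0000001234.data"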
172    
173  // The normal i/o dump  // The normal i/o dump - style 0
174  fieldInfoThisEpoch_t fieldsForEpochStyle_0[] = {  fieldInfoThisEpoch_t fieldsForEpochStyle_0[] = {
175    { 'u', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "U.%010d.%s", 0, 0 },    { 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "U.%010d.%s", 0, 0 },
176    { 'v', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "V.%010d.%s", 0, 0 },    { 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "V.%010d.%s", 0, 0 },
177    { 't', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "T.%010d.%s", 0,0 },    { 'W', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "W.%010d.%s", 0, 0 },
178    { 'n', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "Eta.%010d.%s", 0,0 },    { 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Salt.%010d.%s", 0, 0 },
179    {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1,          "", 0,0 },    { 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Theta.%010d.%s", 0,0 },
180      { 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "Eta.%010d.%s", 0,0 },
181    
182      { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIarea.%010d.%s", 0,0 },
183      { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIheff.%010d.%s", 0,0 },
184      { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIhsnow.%010d.%s", 0,0 },
185      { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIuice.%010d.%s", 0,0 },
186      { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIvice.%010d.%s", 0,0 },
187      { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "SIhsalt.%010d.%s", 0,0 },
188    
189    //{ 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "EtaHnm1.%010d.%s", 0,0 },
190      { 'I', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceTAUX.%010d.%s", 0,0 },
191      { 'J', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceTAUY.%010d.%s", 0,0 },
192      { 'K', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "KPPhbl.%010d.%s", 0,0 },
193      { 'L', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceSflux.%010d.%s", 0,0 },
194    
195      { 'M', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceFWflx.%010d.%s", 0,0 },
196      { 'O', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceQnet.%010d.%s", 0,0 },
197      { 'P', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "PhiBot.%010d.%s", 0,0 },
198      { 'Q', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "oceQsw.%010d.%s", 0,0 },
199    //{ 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "dEtaHdt.%010d.%s", 0,0 },
200    
201      {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0,0 },
202  };  };
203    
204    
205  // pickup file  // pickup file dump - style 1
206    // NOTE: if this changes, write_pickup_meta will also need to be changed
207  fieldInfoThisEpoch_t fieldsForEpochStyle_1[] = {  fieldInfoThisEpoch_t fieldsForEpochStyle_1[] = {
208    { 'u', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 1},    { 'U', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 1},
209    { 'v', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 1},    { 'V', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 1},
210    { 't', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 2 + twoDFieldSizeInBytes * 0, 1},    { 'T', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 2 + twoDFieldSizeInBytes * 0, 1},
211    { 's', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 3 + twoDFieldSizeInBytes * 0, 1},    { 'S', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 3 + twoDFieldSizeInBytes * 0, 1},
212    { 'x', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 4 + twoDFieldSizeInBytes * 0, 1},    { 'X', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 4 + twoDFieldSizeInBytes * 0, 1},
213    { 'y', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 5 + twoDFieldSizeInBytes * 0, 1},    { 'Y', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 5 + twoDFieldSizeInBytes * 0, 1},
214    { 'n', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 0, 1},    { 'N', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 0, 1},
215    { 'd', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 1, 1},    { 'R', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 1, 1},
216    { 'h', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 2, 1},    { 'H', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_%010d.%s", threeDFieldSizeInBytes * 6 + twoDFieldSizeInBytes * 2, 1},
217    {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1,                    "", 0 ,1},    {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0 ,1},
218  };  };
219    
220    
221  // seaice pickup  // seaice pickup dump - style 2
222    // NOTE: if this changes, write_pickup_meta will also need to be changed
223  fieldInfoThisEpoch_t fieldsForEpochStyle_2[] = {  fieldInfoThisEpoch_t fieldsForEpochStyle_2[] = {
224    { 'a', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 2},    { 'A', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 0 + twoDFieldSizeInBytes * 0, 2},
225    { 'b', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 2},    { 'B', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 0, 2},
226    { 'c', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 1, 2},    { 'C', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 1, 2},
227    { 'd', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 2, 2},    { 'D', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 2, 2},
228    { 'g', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 3, 2},    { 'G', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 3, 2},
229    { 'e', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 4, 2},    { 'E', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 4, 2},
230    { 'f', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 5, 2},    { 'F', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "pickup_seaice_%010d.%s", multDFieldSizeInBytes * 1 + twoDFieldSizeInBytes * 5, 2},
231    {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1,          "",                          0 },    {'\0', MPI_COMM_NULL, MPI_COMM_NULL, MPI_COMM_NULL, 0, -1, NULL, 0, "", 0 },
232  };  };
233    
234    
# Line 212  MPI_Comm globalIntercomm = MPI_COMM_NULL Line 263  MPI_Comm globalIntercomm = MPI_COMM_NULL
263  #define roundUp(_x,_y)  (divCeil((_x),(_y)) * (_y))  #define roundUp(_x,_y)  (divCeil((_x),(_y)) * (_y))
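// Worked example (illustrative, assuming the usual ceiling-division meaning
// of divCeil; its definition is outside the hunks shown here):
//   divCeil(90, 7) = 13   (e.g. z-levels per rank when NUM_Z = 90 is split 7 ways)
//   roundUp(90, 7) = 91   (the same split padded up to a whole multiple of 7)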
264    
265    
   
266  typedef enum {  typedef enum {
267      bufState_illegal,      bufState_illegal,
268      bufState_Free,      bufState_Free,
# Line 225  typedef struct buf_header{ Line 275  typedef struct buf_header{
275      bufState_t state;      bufState_t state;
276      int requestsArraySize;      int requestsArraySize;
277      MPI_Request *requests;      MPI_Request *requests;
278      uint64_t ck0;      char *payload;
279      char *payload;   // [tileOneZLevelSizeInBytes * NUM_Z];  // Max payload size  } bufHdr_t;
   uint64_t ck1;      // now rec'vd from compute ranks & dynamically allocated in    
 } bufHdr_t;          // allocateTileBufs  
280    
281    
282  bufHdr_t *freeTileBufs_ptr = NULL;  bufHdr_t *freeTileBufs_ptr = NULL;
283  bufHdr_t *inUseTileBufs_ptr = NULL;  bufHdr_t *inUseTileBufs_ptr = NULL;
284    
285  int maxTagValue = -1;  int maxTagValue = -1;
286  int totalNumTiles = -1;  
287    
288  // routine to convert double to float during memcpy  // routine to convert double to float during memcpy
289  // need to get byteswapping in here as well  // need to get byteswapping in here as well
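// A minimal sketch (an assumption, not the file's own conversion routine,
// whose body falls outside the hunks shown here) of combining the two steps:
// narrow each double to a float and byte-swap the 4-byte result, mirroring
// the __builtin_bswap32 pattern used in do_write() below.
static void copyR8toR4Swapped(float *dst, const double *src, long long n)
{
    long long i;
    for (i = 0;  i < n;  ++i) {
        union { float f; uint32_t u; } v;
        v.f = (float) src[i];             // narrow double -> float
        v.u = __builtin_bswap32(v.u);     // swap bytes for the output file
        dst[i] = v.f;                     // store the swapped bit pattern
    }
}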
# Line 285  long long i, rem; Line 333  long long i, rem;
333          }          }
334  }  }
335    
336    
337  // Debug routine  // Debug routine
338  countBufs(int nbufs)  countBufs(int nbufs)
339  {  {
# Line 306  countBufs(int nbufs) Line 355  countBufs(int nbufs)
355      if (nInUse + nFree != target) {      if (nInUse + nFree != target) {
356          fprintf(stderr, "Rank %d: bad number of buffs: free %d, inUse %d, should be %d\n",          fprintf(stderr, "Rank %d: bad number of buffs: free %d, inUse %d, should be %d\n",
357                          ioIntracommRank, nFree, nInUse, target);                          ioIntracommRank, nFree, nInUse, target);
         sleep(5000);  
358      }      }
359        ASSERT ((nInUse + nFree) == target);
360  }  }
361    
362  int readn(int fd, void *p, int nbytes)  
363    long int readn(int fd, void *p, long int nbytes)
364  {  {
365      
366    char *ptr = (char*)(p);    char *ptr = (char*)(p);
367    
368      int nleft, nread;      long int nleft, nread;
369        
370      nleft = nbytes;      nleft = nbytes;
371      while (nleft > 0){      while (nleft > 0){
# Line 380  void write_pickup_meta(FILE *fp, int gcm Line 430  void write_pickup_meta(FILE *fp, int gcm
430        
431    fprintf(fp," nDims = [ %3d ];\n",ndims);    fprintf(fp," nDims = [ %3d ];\n",ndims);
432    fprintf(fp," dimList = [\n");    fprintf(fp," dimList = [\n");
433    fprintf(fp," %10u,%10d,%10u,\n",NUM_X,1,NUM_X);    fprintf(fp," %10lu,%10d,%10lu,\n",NUM_X,1,NUM_X);
434    fprintf(fp," %10ld,%10d,%10ld\n",NUM_Y,1,NUM_Y);    fprintf(fp," %10ld,%10d,%10ld\n",NUM_Y,1,NUM_Y);
435    fprintf(fp," ];\n");    fprintf(fp," ];\n");
436    fprintf(fp," dataprec = [ 'float64' ];\n");    fprintf(fp," dataprec = [ 'float64' ];\n");
# Line 396  void write_pickup_meta(FILE *fp, int gcm Line 446  void write_pickup_meta(FILE *fp, int gcm
446    
447    
448    
449  double *outBuf=NULL;//[NUM_X*NUM_Y*NUM_Z];  // only needs to be myNumZSlabs  double *outBuf=NULL;//was [NUM_X*NUM_Y*NUM_Z], but only needs to be myNumZSlabs
450  size_t outBufSize=0;  size_t outBufSize=0;
451    
452    
453  void  void
454  do_write(int io_epoch, fieldInfoThisEpoch_t *whichField, int firstZ, int numZ, int gcmIter)  do_write(int io_epoch, fieldInfoThisEpoch_t *whichField, int myFirstZ, int myNumZ, int gcmIter)
455  {  {
456    if (0==numZ) return;  // this is *not* global NUM_Z : change name of parameter to avoid grief!    if (0==myNumZ) return;
457    
458    int pickup = whichField->pickup;    int pickup = whichField->pickup;
459    
# Line 411  do_write(int io_epoch, fieldInfoThisEpoc Line 461  do_write(int io_epoch, fieldInfoThisEpoc
461    // swap here, if necessary    // swap here, if necessary
462    //  int i,j;    //  int i,j;
463    
464    //i = NUM_X*NUM_Y*numZ;    //i = NUM_X*NUM_Y*myNumZ;
465    //mds_byteswapr8_(&i,outBuf);    //mds_byteswapr8_(&i,outBuf);
466    
467    // mds_byteswapr8 expects an integer count, which is gonna overflow someday    // mds_byteswapr8 expects an integer count, which is gonna overflow someday
468    // can't redefine to long without affecting a bunch of other calls    // can't redefine to long without affecting a bunch of other calls
469    // so do a slab at a time here, to delay the inevitable    // so do a slab at a time here, to delay the inevitable
470    //  i = NUM_X*NUM_Y;    //  i = NUM_X*NUM_Y;
471    //for (j=0;j<numZ;++j)    //for (j=0;j<myNumZ;++j)
472    //  mds_byteswapr8_(&i,&outBuf[i*j]);    //  mds_byteswapr8_(&i,&outBuf[i*j]);
473    
474    // gnu builtin evidently honored by intel compilers    // gnu builtin evidently honored by intel compilers
# Line 426  do_write(int io_epoch, fieldInfoThisEpoc Line 476  do_write(int io_epoch, fieldInfoThisEpoc
476    if (pickup) {    if (pickup) {
477      uint64_t *alias = (uint64_t*)outBuf;      uint64_t *alias = (uint64_t*)outBuf;
478      size_t i;      size_t i;
479      for (i=0;i<NUM_X*NUM_Y*numZ;++i)      for (i=0;i<NUM_X*NUM_Y*myNumZ;++i)
480        alias[i] = __builtin_bswap64(alias[i]);        alias[i] = __builtin_bswap64(alias[i]);
481    }    }
482    else {    else {
483      uint32_t *alias = (uint32_t*)outBuf;      uint32_t *alias = (uint32_t*)outBuf;
484      size_t i;      size_t i;
485      for (i=0;i<NUM_X*NUM_Y*numZ;++i)      for (i=0;i<NUM_X*NUM_Y*myNumZ;++i)
486        alias[i] = __builtin_bswap32(alias[i]);        alias[i] = __builtin_bswap32(alias[i]);
487    }    }
488    
# Line 451  do_write(int io_epoch, fieldInfoThisEpoc Line 501  do_write(int io_epoch, fieldInfoThisEpoc
501    
502    if (pickup) {    if (pickup) {
503      lseek(fd,whichField->offset,SEEK_SET);      lseek(fd,whichField->offset,SEEK_SET);
504      lseek(fd,firstZ*NUM_X*NUM_Y*datumSize,SEEK_CUR);      lseek(fd,myFirstZ*NUM_X*NUM_Y*datumSize,SEEK_CUR);
505      nbytes = NUM_X*NUM_Y*numZ*datumSize;      nbytes = NUM_X*NUM_Y*myNumZ*datumSize;
506    }    }
507    else {    else {
508      lseek(fd,firstZ*NUM_X*NUM_Y*sizeof(float),SEEK_CUR);      lseek(fd,myFirstZ*NUM_X*NUM_Y*sizeof(float),SEEK_CUR);
509      nbytes = NUM_X*NUM_Y*numZ*sizeof(float);      nbytes = NUM_X*NUM_Y*myNumZ*sizeof(float);
510    }    }
511    
512    ssize_t bwrit = writen(fd,outBuf,nbytes);      ssize_t bwrit = writen(fd,outBuf,nbytes);  
513    
514    if (-1==bwrit) perror("Henze : file write problem : ");    if (-1==bwrit) perror("Henze : file write problem : ");
515    
516    FPRINTF(stderr,"Wrote %d of %d bytes (%d -> %d) \n",bwrit,nbytes,firstZ,numZ);    FPRINTF(stderr,"Wrote %d of %d bytes (%d -> %d) \n",bwrit,nbytes,myFirstZ,myNumZ);
517    
518    //  ASSERT(nbytes == bwrit);    //  ASSERT(nbytes == bwrit);
519    
# Line 477  do_write(int io_epoch, fieldInfoThisEpoc Line 527  do_write(int io_epoch, fieldInfoThisEpoc
527  }  }
528    
529    
 int NTILES = -1;  
530    
531  typedef struct {  typedef struct {
532    int off;    long int off;
533    int skip;    long int skip;
534  } tile_layout_t;  } tile_layout_t;
535    
536  tile_layout_t *offsetTable;  tile_layout_t *offsetTable;
# Line 493  processSlabSection( Line 542  processSlabSection(
542    void *data,    void *data,
543    int myNumZSlabs)    int myNumZSlabs)
544  {  {
   int intracommSize,intracommRank;  
   // MPI_Comm_size(whichField->ioRanksIntracomm, &intracommSize);  
   MPI_Comm_rank(whichField->ioRanksIntracomm, &intracommRank);  
    //printf("i/o rank %d/%d recv'd %d::%d (%d->%d)\n",intracommRank,intracommSize,whichField,tileID,firstZ,lastZ);  
   // printf("rank %d : tile %d is gonna go at %d and stride with %d, z = %d\n",intracommRank,tileID,offsetTable[tileID].off,  
   //       offsetTable[tileID].skip,myNumZSlabs);  
   
545    int z;    int z;
546    
547    int pickup = whichField->pickup;    int pickup = whichField->pickup;
548    
549    //ASSERT((tileID > 0) && (tileID < (sizeof(offsetTable)/sizeof(tile_layout_t))));    ASSERT ((tileID > 0) && (tileID <= TOTAL_NUM_TILES));
   ASSERT( (tileID > 0) && (tileID <= NTILES) ); // offsetTable now dynamically allocated  
550    
551    if (myNumZSlabs * twoDFieldSizeInBytes > outBufSize){    if (myNumZSlabs * twoDFieldSizeInBytes > outBufSize){
552    
# Line 521  processSlabSection( Line 562  processSlabSection(
562    
563    for (z=0;z<myNumZSlabs;++z){    for (z=0;z<myNumZSlabs;++z){
564    
565      off_t zoffset = z*TILE_X*TILE_Y*NTILES; //NOT totalNumTiles;      off_t zoffset = z*TILE_X*TILE_Y*TOTAL_NUM_TILES;
566      off_t hoffset = offsetTable[tileID].off;      off_t hoffset = offsetTable[tileID].off;
567      off_t skipdst = offsetTable[tileID].skip;      off_t skipdst = offsetTable[tileID].skip;
568    
# Line 538  processSlabSection( Line 579  processSlabSection(
579    
580      off_t skipsrc = TILE_X+2*XGHOSTS;      off_t skipsrc = TILE_X+2*XGHOSTS;
581    
     //fprintf(stderr,"rank %d   tile %d   offset %d   skip %d    dst %x     src %x\n",intracommRank,tileID,hoffset,skipdst,dst,src);  
   
582      long long n = TILE_X;      long long n = TILE_X;
583    
584      int y;      int y;
# Line 560  processSlabSection( Line 599  processSlabSection(
599  void  void
600  allocateTileBufs(int numTileBufs, int maxIntracommSize)  allocateTileBufs(int numTileBufs, int maxIntracommSize)
601  {  {
   
   ASSERT(tileOneZLevelSizeInBytes>0);  // be sure we have rec'vd values by now  
   
602      int i, j;      int i, j;
603      for (i = 0;  i < numTileBufs;  ++i) {      for (i = 0;  i < numTileBufs;  ++i) {
604    
# Line 580  allocateTileBufs(int numTileBufs, int ma Line 616  allocateTileBufs(int numTileBufs, int ma
616          for (j = 0;  j < maxIntracommSize;  ++j) {          for (j = 0;  j < maxIntracommSize;  ++j) {
617              newBuf->requests[j] = MPI_REQUEST_NULL;              newBuf->requests[j] = MPI_REQUEST_NULL;
618          }          }
         newBuf->ck0 = newBuf->ck1 = 0xdeadbeefdeadbeef;  
619    
620          // Put the buf into the free list          // Put the buf into the free list
621          newBuf->state = bufState_Free;          newBuf->state = bufState_Free;
# Line 648  tryToReceiveDataTile( Line 683  tryToReceiveDataTile(
683               dataIntercomm, &mpiStatus);               dataIntercomm, &mpiStatus);
684      bufHdr->state = bufState_InUse;      bufHdr->state = bufState_InUse;
685    
     // Overrun check  
     ASSERT(bufHdr->ck0==0xdeadbeefdeadbeef);  
     ASSERT(bufHdr->ck0==bufHdr->ck1);  
   
686      // Return values      // Return values
687      *tileID = mpiStatus.MPI_TAG >> numCheckBits;      *tileID = mpiStatus.MPI_TAG >> numCheckBits;
688    
# Line 667  tryToReceiveDataTile( Line 698  tryToReceiveDataTile(
698  int  int
699  tryToReceiveZSlab(  tryToReceiveZSlab(
700    void *buf,    void *buf,
701    int expectedMsgSize,    long int expectedMsgSize,
702    MPI_Comm intracomm)    MPI_Comm intracomm)
703  {  {
704      MPI_Status mpiStatus;      MPI_Status mpiStatus;
# Line 690  tryToReceiveZSlab( Line 721  tryToReceiveZSlab(
721  }  }
722    
723    
   
724  void  void
725  redistributeZSlabs(  redistributeZSlabs(
726    bufHdr_t *bufHdr,    bufHdr_t *bufHdr,
727    int tileID,    int tileID,
728    int zSlabsPer,    int zSlabsPer,
729    int thisFieldNumZ,    fieldInfoThisEpoch_t *fieldInfo)
   MPI_Comm intracomm)  
730  {  {
731      int pieceSize = zSlabsPer * tileOneZLevelSizeInBytes;      int thisFieldNumZ = fieldInfo->zDepth;
732      int tileSizeInBytes = tileOneZLevelSizeInBytes * thisFieldNumZ;      long int tileSizeInBytes = tileOneZLevelSizeInBytes * thisFieldNumZ;
733      int offset = 0;      long int fullPieceSize = zSlabsPer * tileOneZLevelSizeInBytes;
734      int recvRank = 0;      long int offset = 0;
735    
736        MPI_Comm intracomm = fieldInfo->ioRanksIntracomm;
737    
738        // Note that the MPI interface definition requires that this arg
739        // (i.e. "pieceSize") be of type "int".  So check to be sure we
740        // haven't overflowed the size.
741        int pieceSize = (int)fullPieceSize;
742        if (pieceSize != fullPieceSize) {
743            fprintf(stderr, "ERROR: pieceSize:%d, fullPieceSize:%ld\n",
744                            pieceSize,    fullPieceSize);
745            fprintf(stderr, "ERROR: tileOneZLevelSizeInBytes:%ld, tileSizeInBytes:%ld\n",
746                            tileOneZLevelSizeInBytes,     tileSizeInBytes);
747            fprintf(stderr, "ERROR: thisFieldNumZ:%d, zSlabsPer:%d\n",
748                              thisFieldNumZ    , zSlabsPer);
749        }
750        ASSERT (pieceSize == fullPieceSize);
751        ASSERT (pieceSize > 0);
752    
753    
754        // Send each piece (aka chunk) to the rank that is responsible
755        // for writing it.
756        int curPiece = 0;
757      while ((offset + pieceSize) <= tileSizeInBytes) {      while ((offset + pieceSize) <= tileSizeInBytes) {
758            int recvRank = fieldInfo->chunkWriters[curPiece];
759          MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank,          MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank,
760                    tileID, intracomm, &(bufHdr->requests[recvRank]));                    tileID, intracomm, &(bufHdr->requests[recvRank]));
761          ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]);          ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]);
762    
763          offset += pieceSize;          offset += pieceSize;
764          recvRank += 1;          curPiece += 1;
765      }      }
766    
767      // There might be one last odd-sized piece      // There might be one last odd-sized piece (N.B.: odd-sized in Z;
768        // the X and Y sizes of a tile are always the same).
769      if (offset < tileSizeInBytes) {      if (offset < tileSizeInBytes) {
770            ASSERT (pieceSize >= tileSizeInBytes - offset);
771          pieceSize = tileSizeInBytes - offset;          pieceSize = tileSizeInBytes - offset;
772          ASSERT(pieceSize % tileOneZLevelSizeInBytes == 0);          ASSERT(pieceSize % tileOneZLevelSizeInBytes == 0);
773    
774            int recvRank = fieldInfo->chunkWriters[curPiece];
775          MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank,          MPI_Isend(bufHdr->payload + offset, pieceSize, MPI_BYTE, recvRank,
776                    tileID, intracomm, &(bufHdr->requests[recvRank]));                    tileID, intracomm, &(bufHdr->requests[recvRank]));
777          ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]);          ASSERT(MPI_REQUEST_NULL != bufHdr->requests[recvRank]);
778            curPiece += 1;
         offset += pieceSize;  
         recvRank += 1;  
779      }      }
780    
781      // Sanity check      ASSERT (curPiece == fieldInfo->nChunks);
     ASSERT(recvRank <= bufHdr->requestsArraySize);  
     while (recvRank < bufHdr->requestsArraySize) {  
         ASSERT(MPI_REQUEST_NULL == bufHdr->requests[recvRank++])  
     }  
782  }  }
783    
784    
# Line 756  doNewEpoch(int epochID, int epochStyleIn Line 803  doNewEpoch(int epochID, int epochStyleIn
803      int zSlabsPer;      int zSlabsPer;
804      int myNumZSlabs, myFirstZSlab, myLastZSlab, myNumSlabPiecesToRecv;      int myNumZSlabs, myFirstZSlab, myLastZSlab, myNumSlabPiecesToRecv;
805    
806      int numTilesRecvd, numSlabPiecesRecvd;      int ii, numTilesRecvd, numSlabPiecesRecvd;
807    
808    
809      // Find the descriptor for my assigned field for this epoch style.      // Find the descriptor for my assigned field for this epoch style.
# Line 766  doNewEpoch(int epochID, int epochStyleIn Line 813  doNewEpoch(int epochID, int epochStyleIn
813      ASSERT('\0' != fieldInfo->dataFieldID);      ASSERT('\0' != fieldInfo->dataFieldID);
814    
815    
   
816      MPI_Comm_rank(fieldInfo->ioRanksIntracomm, &intracommRank);      MPI_Comm_rank(fieldInfo->ioRanksIntracomm, &intracommRank);
817      MPI_Comm_size(fieldInfo->ioRanksIntracomm, &intracommSize);      MPI_Comm_size(fieldInfo->ioRanksIntracomm, &intracommSize);
818    
   
     // Compute which z slab(s) we will be reassembling.  
819      zSlabsPer = divCeil(fieldInfo->zDepth, intracommSize);      zSlabsPer = divCeil(fieldInfo->zDepth, intracommSize);
820      myNumZSlabs = zSlabsPer;  
821        // Figure out if this rank writes z-slabs, and if so, which ones
822      // Adjust myNumZSlabs in case it didn't divide evenly      myNumZSlabs = 0;
823      myFirstZSlab = intracommRank * myNumZSlabs;      myFirstZSlab = 0;
824      if (myFirstZSlab >= fieldInfo->zDepth) {      myLastZSlab = -1;
825          myNumZSlabs = 0;      for (ii = 0;  ii < fieldInfo->nChunks;  ++ii) {
826      } else if ((myFirstZSlab + myNumZSlabs) > fieldInfo->zDepth) {          if (fieldInfo->chunkWriters[ii] == intracommRank) {
827          myNumZSlabs = fieldInfo->zDepth - myFirstZSlab;              myFirstZSlab = ii * zSlabsPer;
828      } else {              // The last chunk might be odd sized
829          myNumZSlabs = zSlabsPer;              myNumZSlabs = ((ii + 1) < fieldInfo->nChunks) ?
830                                   zSlabsPer : fieldInfo->zDepth - myFirstZSlab;
831                myLastZSlab = myFirstZSlab + myNumZSlabs - 1;
832    
833                break;
834            }
835        }
836        if (myNumZSlabs > 0) {
837            ASSERT ((myFirstZSlab >= 0) && (myFirstZSlab < fieldInfo->zDepth));
838            ASSERT ((myLastZSlab >= 0) && (myLastZSlab < fieldInfo->zDepth));
839            ASSERT (myLastZSlab >= myFirstZSlab);
840      }      }
841      myLastZSlab = myFirstZSlab + myNumZSlabs - 1;  
842    
843      // If we were not assigned any z-slabs, we don't get any redistributed      // If we were not assigned any z-slabs, we don't get any redistributed
844      // tile-slabs.  If we were assigned one or more slabs, we get      // tile-slabs.  If we were assigned one or more slabs, we get
845      // redistributed tile-slabs for every tile.      // redistributed tile-slabs for every tile.
846      myNumSlabPiecesToRecv = (0 == myNumZSlabs) ? 0 : totalNumTiles;      myNumSlabPiecesToRecv = (0 == myNumZSlabs) ? 0 : NUM_WET_TILES;
847    
848    
849      numTilesRecvd = 0;      numTilesRecvd = 0;
# Line 810  doNewEpoch(int epochID, int epochStyleIn Line 864  doNewEpoch(int epochID, int epochStyleIn
864              if (NULL == bufHdr) break; // No tile was received              if (NULL == bufHdr) break; // No tile was received
865    
866              numTilesRecvd += 1;              numTilesRecvd += 1;
867              redistributeZSlabs(bufHdr, tileID, zSlabsPer,              redistributeZSlabs(bufHdr, tileID, zSlabsPer, fieldInfo);
                                fieldInfo->zDepth, fieldInfo->ioRanksIntracomm);  
868    
869              // Add the bufHdr to the "in use" list              // Add the bufHdr to the "in use" list
870              bufHdr->next = inUseTileBufs_ptr;              bufHdr->next = inUseTileBufs_ptr;
871              inUseTileBufs_ptr = bufHdr;              inUseTileBufs_ptr = bufHdr;
   
   
872          }          }
873    
874    
875          ////////////////////////////////////////////////////////////////          ////////////////////////////////////////////////////////////////
876          // Check for tile-slabs redistributed from the i/o processes          // Check for tile-slabs redistributed from the i/o processes
877          while (numSlabPiecesRecvd < myNumSlabPiecesToRecv) {          while (numSlabPiecesRecvd < myNumSlabPiecesToRecv) {
878              int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs;              long int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs;
879              char data[msgSize];              char data[msgSize];
880    
881              int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm);              int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm);
882              if (tileID < 0) break;  // No slab was received              if (tileID < 0) break;  // No slab was received
883    
   
884              numSlabPiecesRecvd += 1;              numSlabPiecesRecvd += 1;
885              processSlabSection(fieldInfo, tileID, data, myNumZSlabs);              processSlabSection(fieldInfo, tileID, data, myNumZSlabs);
886    
   
887              // Can do the write here, or at the end of the epoch.              // Can do the write here, or at the end of the epoch.
888              // Probably want to make it asynchronous (waiting for              // Probably want to make it asynchronous (waiting for
889              // completion at the barrier at the start of the next epoch).              // completion at the barrier at the start of the next epoch).
# Line 846  doNewEpoch(int epochID, int epochStyleIn Line 895  doNewEpoch(int epochID, int epochStyleIn
895    
896    
897          ////////////////////////////////////////////////////////////////          ////////////////////////////////////////////////////////////////
898            // Sanity check for non-writers
899            if (0 == myNumSlabPiecesToRecv) {  
900    
901                long int msgSize = tileOneZLevelSizeInBytes * myNumZSlabs;
902                char data[msgSize];
903    
904                // Check that no one has tried to re-distribute a slab to us.
905                int tileID = tryToReceiveZSlab(data, msgSize, fieldInfo->ioRanksIntracomm);
906                ASSERT (tileID < 0);
907            }
908    
909    
910            ////////////////////////////////////////////////////////////////
911          // Check if we can release any of the inUse buffers (i.e. the          // Check if we can release any of the inUse buffers (i.e. the
912          // isends we used to redistribute those z-slabs have completed).          // isends we used to redistribute those z-slabs have completed).
913          // We do this by detaching the inUse list, then examining each          // We do this by detaching the inUse list, then examining each
# Line 875  doNewEpoch(int epochID, int epochStyleIn Line 937  doNewEpoch(int epochID, int epochStyleIn
937                  FPRINTF(stderr,"Rank %d freed a tile buffer\n", intracommRank);                  FPRINTF(stderr,"Rank %d freed a tile buffer\n", intracommRank);
938              } else {              } else {
939                  // At least one request is still outstanding.                  // At least one request is still outstanding.
940                  // Put the buf on the inUse list.                  // Put the buf back on the inUse list.
941                  bufHdr->next = inUseTileBufs_ptr;                  bufHdr->next = inUseTileBufs_ptr;
942                  inUseTileBufs_ptr = bufHdr;                  inUseTileBufs_ptr = bufHdr;
943              }              }
# Line 911  doNewEpoch(int epochID, int epochStyleIn Line 973  doNewEpoch(int epochID, int epochStyleIn
973    
974    
975    
   
   
   
976  // Main routine for the i/o tasks.  Callers DO NOT RETURN from  // Main routine for the i/o tasks.  Callers DO NOT RETURN from
977  // this routine; they MPI_Finalize and exit.  // this routine; they MPI_Finalize and exit.
978  void  void
979  ioRankMain(int totalNumTiles)  ioRankMain (void)
980  {  {
981      // This is one of a group of i/o processes that will be receiving tiles      // This is one of a group of i/o processes that will be receiving tiles
982      // from the computational processes.  This same process is also      // from the computational processes.  This same process is also
# Line 932  ioRankMain(int totalNumTiles) Line 991  ioRankMain(int totalNumTiles)
991      // the ghost cells), and put them into the slab we are assembling.  Once      // the ghost cells), and put them into the slab we are assembling.  Once
992      // a slab is fully assembled, it is written to disk.      // a slab is fully assembled, it is written to disk.
993    
994      int i, size, count, numTileBufs;      int i, ierr, count, numTileBufs;
995      MPI_Status status;      MPI_Status status;
996      int currentEpochID = 0;      int currentEpochID = 0;
997      int maxTileCount = 0, max3DTileCount = 0, maxIntracommSize = 0;      int maxTileCount = 0, max3DTileCount = 0, maxIntracommSize = 0;
# Line 941  ioRankMain(int totalNumTiles) Line 1000  ioRankMain(int totalNumTiles)
1000      MPI_Comm_remote_size(globalIntercomm, &numComputeRanks);      MPI_Comm_remote_size(globalIntercomm, &numComputeRanks);
1001    
1002    
1003      ////////////////////////////////////////////////////////////      int *xoff = malloc(TOTAL_NUM_TILES*sizeof(int));
     //// get global tile info via bcast over the global intercom  
   
   
     //    int ierr = MPI_Bcast(&NTILES,1,MPI_INT,0,globalIntercomm);  
   
     int data[9];  
   
     int ierr = MPI_Bcast(data,9,MPI_INT,0,globalIntercomm);  
   
     NTILES = data[3];  
     TILE_X = data[5];  
     TILE_Y = data[6];  
     XGHOSTS = data[7];  
     YGHOSTS = data[8];  
   
     tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS));    
     tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize);        // also calculated in compute ranks, urghh  
   
     int *xoff = malloc(NTILES*sizeof(int));  
1004      ASSERT(xoff);      ASSERT(xoff);
1005      int *yoff = malloc(NTILES*sizeof(int));      int *yoff = malloc(TOTAL_NUM_TILES*sizeof(int));
1006      ASSERT(yoff);      ASSERT(yoff);
1007      int *xskip = malloc(NTILES*sizeof(int));      int *xskip = malloc(TOTAL_NUM_TILES*sizeof(int));
1008      ASSERT(xskip);      ASSERT(xskip);
1009    
1010      offsetTable = malloc((NTILES+1)*sizeof(tile_layout_t));      offsetTable = malloc((TOTAL_NUM_TILES+1)*sizeof(tile_layout_t));
1011      ASSERT(offsetTable);      ASSERT(offsetTable);
1012    
1013      ierr = MPI_Bcast(xoff,NTILES,MPI_INT,0,globalIntercomm);      ierr = MPI_Bcast(xoff,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm);
1014      ierr = MPI_Bcast(yoff,NTILES,MPI_INT,0,globalIntercomm);      ierr = MPI_Bcast(yoff,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm);
1015      ierr = MPI_Bcast(xskip,NTILES,MPI_INT,0,globalIntercomm);      ierr = MPI_Bcast(xskip,TOTAL_NUM_TILES,MPI_INT,0,globalIntercomm);
1016    
1017      for (i=0;i<NTILES;++i){      for (i=0;i<TOTAL_NUM_TILES;++i){
1018        offsetTable[i+1].off = NUM_X*(yoff[i]-1) + xoff[i] - 1;        offsetTable[i+1].off = NUM_X*(yoff[i]-1) + xoff[i] - 1;
1019        offsetTable[i+1].skip = xskip[i];        offsetTable[i+1].skip = xskip[i];
1020      }      }
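    // Worked example (illustrative): xoff/yoff appear to be 1-based,
    // Fortran-style global coordinates of each tile's origin (hence the
    // "-1" adjustments), so a tile at xoff = 1, yoff = 1 maps to linear
    // offset 0, while one at xoff = 1, yoff = 2 starts NUM_X values later.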
# Line 1004  ioRankMain(int totalNumTiles) Line 1044  ioRankMain(int totalNumTiles)
1044          }          }
1045          ASSERT(NULL != p);          ASSERT(NULL != p);
1046          ASSERT('\0' != p->dataFieldID);          ASSERT('\0' != p->dataFieldID);
1047          MPI_Comm_size(p->ioRanksIntracomm, &size);          MPI_Comm_size(p->ioRanksIntracomm, &thisIntracommSize);
1048          if (size > maxIntracommSize) maxIntracommSize = size;          if (thisIntracommSize > maxIntracommSize) {
1049                maxIntracommSize = thisIntracommSize;
1050            }
1051    
1052    
1053          // Receive a message from *each* computational rank telling us          // Receive a message from *each* computational rank telling us
# Line 1024  ioRankMain(int totalNumTiles) Line 1066  ioRankMain(int totalNumTiles)
1066          // Sanity check          // Sanity check
1067          MPI_Allreduce (&(p->tileCount), &count, 1,          MPI_Allreduce (&(p->tileCount), &count, 1,
1068                         MPI_INT, MPI_SUM, p->ioRanksIntracomm);                         MPI_INT, MPI_SUM, p->ioRanksIntracomm);
1069          ASSERT(count == totalNumTiles);          ASSERT(count == NUM_WET_TILES);
1070      }      }
1071    
1072    
# Line 1041  ioRankMain(int totalNumTiles) Line 1083  ioRankMain(int totalNumTiles)
1083      // Hack - for now, just pick a value for numTileBufs      // Hack - for now, just pick a value for numTileBufs
1084      numTileBufs = (max3DTileCount > 0) ? max3DTileCount : maxTileCount;      numTileBufs = (max3DTileCount > 0) ? max3DTileCount : maxTileCount;
1085      if (numTileBufs < 4) numTileBufs = 4;      if (numTileBufs < 4) numTileBufs = 4;
1086      if (numTileBufs > 15) numTileBufs = 15;      if (numTileBufs > 25) numTileBufs = 25;
     //    if (numTileBufs > 25) numTileBufs = 25;  
1087    
1088      allocateTileBufs(numTileBufs, maxIntracommSize);      allocateTileBufs(numTileBufs, maxIntracommSize);
1089      countBufs(numTileBufs);      countBufs(numTileBufs);
# Line 1060  ioRankMain(int totalNumTiles) Line 1101  ioRankMain(int totalNumTiles)
1101          MPI_Barrier(ioIntracomm);          MPI_Barrier(ioIntracomm);
1102    
1103          if (0 == ioIntracommRank) {          if (0 == ioIntracommRank) {
1104              fprintf(stderr, "I/O ranks waiting for new epoch\n");              fprintf(stderr, "I/O ranks waiting for new epoch at time %f\n",MPI_Wtime());
1105              MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm);              MPI_Send(NULL, 0, MPI_BYTE, 0, cmd_epochComplete, globalIntercomm);
1106    
1107              MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE);              MPI_Recv(cmd, 4, MPI_INT, 0, 0, globalIntercomm, MPI_STATUS_IGNORE);
1108              fprintf(stderr, "I/O ranks beginning new epoch: %d, gcmIter = %d\n", cmd[1],cmd[3]);              fprintf(stderr, "I/O ranks beginning new epoch: %d, gcmIter = %d, at time %f\n",
1109                                cmd[1], cmd[3], MPI_Wtime());
1110    
1111              // before we start a new epoch, have i/o rank 0:              // before we start a new epoch, have i/o rank 0:
1112              // determine output filenames for this epoch              // determine output filenames for this epoch
# Line 1121  ioRankMain(int totalNumTiles) Line 1163  ioRankMain(int totalNumTiles)
1163          }          }
1164          MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm);            MPI_Bcast(cmd, 4, MPI_INT, 0, ioIntracomm);  
1165    
1166            if (0 == ioIntracommRank)
1167              fprintf(stderr,"i/o handshake completed %d %d %f\n",cmd[1],cmd[3],MPI_Wtime());
1168    
1169          switch (cmd[0]) {          switch (cmd[0]) {
1170    
1171            case cmd_exit:            case cmd_exit:
# Line 1179  findField(const char c) Line 1224  findField(const char c)
1224    
1225  // Given a number of ranks and a set of fields we will want to output,  // Given a number of ranks and a set of fields we will want to output,
1226  // figure out how to distribute the ranks among the fields.  // figure out how to distribute the ranks among the fields.
1227    // We attempt to distribute according to three separate criteria:
1228    //  1) Try to make the number of ranks assigned to a field be
1229    //     proportional to the number of bytes in that field.
1230    //  2) Try to even out the total number of MPI messages that are
1231    //     sent to any given i/o node.
1232    //  3) Try to even out the amount of output being written from
1233    //     any given i/o node.
1234    
1235  void  void
1236  computeIORankAssigments(  computeIORankAssigments(
1237    int numComputeRanks,    int numComputeRanks,
1238    int numIORanks,    int numIORanks,
1239    int numFields,    int numFields,
1240    fieldInfoThisEpoch_t *fields,    fieldInfoThisEpoch_t *fields,
1241    int assignments[])    int assignments[],
1242      int nToWrite[])
1243  {  {
1244      int i,j,k, sum, count;      int i,j,k, iteration, sum, count;
1245    
1246      int numIONodes = numIORanks / numRanksPerNode;      int numIONodes = numIORanks / numRanksPerNode;
1247      int numIORanksThisField[numFields];      int numIORanksThisField[numFields], nRanks[numFields];
1248      long int bytesPerIORankThisField[numFields];      long int bytesPerIORankThisField[numFields];
1249      int expectedMessagesPerRankThisField[numFields];      int expectedMessagesPerRankThisField[numFields];
1250        int isWriter[numIORanks];
1251      long int fieldSizes[numFields];      long int fieldSizes[numFields];
1252    
1253      struct ioNodeInfo_t {      struct ioNodeInfo_t {
# Line 1210  computeIORankAssigments( Line 1265  computeIORankAssigments(
1265    
1266      ASSERT((numIONodes*numRanksPerNode) == numIORanks);      ASSERT((numIONodes*numRanksPerNode) == numIORanks);
1267    
1268        memset (assignments, 0, numIORanks*sizeof(int));
1269        memset (isWriter, 0, numIORanks*sizeof(int));
1270    
1271    
1272      //////////////////////////////////////////////////////////////      //////////////////////////////////////////////////////////////
1273      // Compute the size for each field in this epoch style      // Compute the size for each field in this epoch style
# Line 1219  computeIORankAssigments( Line 1277  computeIORankAssigments(
1277      }      }
1278    
1279    
1280      /////////////////////////////////////////////////////////      /////////////////////////////////////////////////////////////////
1281      // Distribute the available i/o ranks among the fields,      // Phase 1: Distribute the number of available i/o ranks among
1282      // proportionally based on field size.      // the fields, proportionally based on field size.
1283    
1284      // First, assign one rank per field      // First, assign one rank per field
1285      ASSERT(numIORanks >= numFields);      ASSERT(numIORanks >= numFields);
# Line 1246  computeIORankAssigments( Line 1304  computeIORankAssigments(
1304          bytesPerIORankThisField[k] = fieldSizes[k] / numIORanksThisField[k];          bytesPerIORankThisField[k] = fieldSizes[k] / numIORanksThisField[k];
1305      }      }
1306    
1307      ////////////////////////////////////////////////////////////      /////////////////////////////////////////////////////////////////
1308      // At this point, all the i/o ranks should be apportioned      // At this point, the number of i/o ranks should be apportioned
1309      // among the fields.  Check we didn't screw up the count.      // among the fields.  Check we didn't screw up the count.
1310      for (sum = 0, i = 0;  i < numFields;  ++i) {      for (sum = 0, i = 0;  i < numFields;  ++i) {
1311          sum += numIORanksThisField[i];          sum += numIORanksThisField[i];
1312      }      }
1313      ASSERT(sum == numIORanks);      ASSERT(sum == numIORanks);
1314    
1315        // Make a copy of numIORanksThisField
1316        memcpy (nRanks, numIORanksThisField, numFields*sizeof(int));
1317    
1318    
1319    
1320      //////////////////////////////////////////////////////////////////      //////////////////////////////////////////////////////////////////
1321        // Phase 2: try to even out the number of messages per node.
1322      // The *number* of i/o ranks assigned to a field is based on the      // The *number* of i/o ranks assigned to a field is based on the
1323      // field size.  But the *placement* of those ranks is based on      // field size.  But the *placement* of those ranks is based on
1324      // the expected number of messages the process will receive.      // the expected number of messages the process will receive.
# Line 1309  computeIORankAssigments( Line 1371  computeIORankAssigments(
1371    
1372          // Make an initial choice of field          // Make an initial choice of field
1373          for (i = 0;  i < numFields;  ++i) {          for (i = 0;  i < numFields;  ++i) {
1374              if (numIORanksThisField[i] > 0) break;              if (nRanks[i] > 0) break;
1375          }          }
1376          k = i;          k = i;
1377          ASSERT(k < numFields);          ASSERT(k < numFields);
1378    
1379          // Search for a better choice          // Search for a better choice
1380          for (++i;  i < numFields;  ++i) {          for (++i;  i < numFields;  ++i) {
1381              if (numIORanksThisField[i] <= 0) continue;              if (nRanks[i] <= 0) continue;
1382              if (expectedMessagesPerRankThisField[i] >              if (expectedMessagesPerRankThisField[i] >
1383                  expectedMessagesPerRankThisField[k])                  expectedMessagesPerRankThisField[k])
1384              {              {
# Line 1328  computeIORankAssigments( Line 1390  computeIORankAssigments(
1390          ioNodeInfo[j].expectedNumMessagesThisNode += expectedMessagesPerRankThisField[k];          ioNodeInfo[j].expectedNumMessagesThisNode += expectedMessagesPerRankThisField[k];
1391          ioNodeInfo[j].assignedFieldThisCore[ioNodeInfo[j].numCoresAssigned] = k;          ioNodeInfo[j].assignedFieldThisCore[ioNodeInfo[j].numCoresAssigned] = k;
1392          ioNodeInfo[j].numCoresAssigned += 1;          ioNodeInfo[j].numCoresAssigned += 1;
1393          numIORanksThisField[k] -= 1;          nRanks[k] -= 1;
1394      }      }
1395    
1396      // Sanity check - all ranks were assigned to a core      // Sanity check - all ranks were assigned to a core
1397      for (i = 1;  i < numFields;  ++i) {      for (i = 1;  i < numFields;  ++i) {
1398          ASSERT(0 == numIORanksThisField[i]);          ASSERT(0 == nRanks[i]);
1399      }      }
1400      // Sanity check - all cores were assigned a rank      // Sanity check - all cores were assigned a rank
1401      for (i = 1;  i < numIONodes;  ++i) {      for (i = 1;  i < numIONodes;  ++i) {
# Line 1341  computeIORankAssigments( Line 1403  computeIORankAssigments(
1403      }      }
1404    
1405    
1406      /////////////////////////////////////      //////////////////////////////////////////////////////////////////
1407        // Phase 3: Select which ranks will be assigned to write out the
1408        // fields.  We attempt to balance the amount that each node is
1409        // assigned to write out.  Since Phase 1 and Phase 2 have already
1410        // fixed a number of things, our options for balancing things
1411      // are somewhat restricted.
1412    
1413    
1414        // Count how many nodes each field is distributed across
1415        int numIONodesThisField[numFields];
1416        for (i = 0;  i < numFields;  ++i) {
1417            numIONodesThisField[i] = 0;
1418            for (j = 0;  j < numIONodes;  ++j) {
1419                for (k = 0;  k < numRanksPerNode;  ++k) {
1420                    if (ioNodeInfo[j].assignedFieldThisCore[k] == i) {
1421                        numIONodesThisField[i] += 1;
1422                        break;
1423                    }
1424                }
1425            }
1426            ASSERT (numIONodesThisField[i] > 0);
1427            ASSERT (numIONodesThisField[i] <= numIONodes);
1428        }
1429    
1430    
1431        // Init a per-node running count of z-levels-to-write
1432        memset (nToWrite, 0, numIONodes*sizeof(int));
1433    
1434    
1435        // Iterate through all the fields, although we will select
1436        // which field to operate on (i.e. curField) non-sequentially.
1437        for (iteration = 0;  iteration < numFields;  ++iteration) {
1438    
1439            // Pick the field distributed across the fewest nodes, on the
1440            // theory that this field will have the least flexibility.
1441            int curField = 0;
1442            for (j = 1;  j < numFields;  ++j) {
1443                if (numIONodesThisField[j] < numIONodesThisField[curField]) {
1444                    curField = j;
1445                }
1446            }
1447            ASSERT (numIONodesThisField[curField] <= numIONodes);
1448    
1449            // Now that we have chosen a field, identify any i/o nodes
1450            // that have rank(s) assigned to that field.
1451            int nAssigned[numIONodes];
1452            for (i = 0;  i < numIONodes;  ++i) {
1453                nAssigned[i] = 0;
1454                for (j = 0;  j < numRanksPerNode;  ++j) {
1455                    if (ioNodeInfo[i].assignedFieldThisCore[j] == curField) {
1456                        nAssigned[i] += 1;
1457                    }
1458                }
1459            }
1460    
1461    
1462            // We do the writes in units of entire z-levels.  If a rank is a
1463            // writer, it is assigned to write a chunk of one or more z-levels.
1464            int chunkSize = divCeil (fields[curField].zDepth, numIORanksThisField[curField]);
1465            int nChunks = divCeil (fields[curField].zDepth, chunkSize);
1466            int curChunk;
1467    
1468            // Designate the same number of ranks to be writers as there are chunks
1469            fields[curField].chunkWriters = malloc (nChunks*sizeof(int));
1470            ASSERT (fields[curField].chunkWriters != NULL);
1471            fields[curField].nChunks = nChunks;
1472            int nAvailable[numIONodes];
1473            memcpy (nAvailable, nAssigned, numIONodes*sizeof(int));
1474            for (curChunk = 0;  curChunk < nChunks;  ++curChunk) {
1475    
1476                // Note that the last chunk might be an odd size (if chunkSize
1477                // does not evenly divide zDepth).
1478                if ((curChunk + 1) == nChunks) {
1479                  chunkSize = fields[curField].zDepth - curChunk*chunkSize;
1480                }
1481    
1482                // Pick a node that has at least one rank assigned to curField
1483                // that has not already been designated as a writer.
1484                // There must still be at least one, or we would have exited
1485                // the loop already.
1486                int curNode;
1487                for (curNode = 0;  curNode < numIONodes;  ++curNode) {
1488                    if (nAvailable[curNode] > 0) break;
1489                }
1490                ASSERT (curNode < numIONodes);
1491    
1492                // curNode is a candidate node; try to find an acceptable
1493                // candidate node with a smaller nToWrite
1494                for (i = curNode + 1;  i < numIONodes;  ++i) {
1495                    if ((nAvailable[i] > 0) && (nToWrite[i] < nToWrite[curNode])) {
1496                      curNode = i;
1497                    }
1498                }
1499    
1500                // Find an appropriate rank on the chosen node
1501                for (j = 0;  j < numRanksPerNode;  ++j) {
1502                    if ( (ioNodeInfo[curNode].assignedFieldThisCore[j] == curField) &&
1503                         (!isWriter[curNode*numRanksPerNode + j]) )
1504                    {
1505                        break;
1506                    }
1507                }
1508                // Double-check that we found an appropriate rank.
1509                ASSERT (j < numRanksPerNode);
1510    
1511    
1512                // We've picked a rank to be the writer of the current chunk.
1513                // Now we need to figure out which rank this will wind up being
1514                // in the "ioRanksIntracomm" for this field (when that intracomm
1515                // eventually gets created), so we can set "chunkWriters".
1516                int eventualCommRank = 0;
1517                for (i = 0;  i < curNode;  ++i) eventualCommRank += nAssigned[i];
1518                for (i = 0;  i < j;  ++i) {
1519                    if (ioNodeInfo[curNode].assignedFieldThisCore[i] == curField) {
1520                        eventualCommRank += 1;
1521                    }
1522                }
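            // Editor's worked example (not in the original source): if the
            // per-node counts for curField are nAssigned = {2, 1, 3} and we
            // picked curNode = 2, with one lower-numbered core on that node
            // also assigned to curField, then
            //     eventualCommRank = (2 + 1) + 1 = 4,
            // i.e. this rank will be the fifth rank of curField's eventual
            // "ioRanksIntracomm".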
1523    
1524    
1525                // Update the info for this choice of node+rank
1526                fields[curField].chunkWriters[curChunk] = eventualCommRank;
1527                isWriter[curNode*numRanksPerNode + j] = 1;
1528                nAvailable[curNode] -= 1;
1529                nToWrite[curNode] += chunkSize;
1530    
1531            }
1532    
1533            // We've finished with this curField; mark it so that we don't
1534            // re-select this same curField the next time through the loop.
1535            numIONodesThisField[curField] =  numIONodes + 1;
1536        }
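    // Editor's worked example (not in the original source), assuming divCeil is
    // the usual ceiling division (a + b - 1)/b defined elsewhere in this file:
    // for a field with zDepth = 90 spread over 4 i/o ranks,
    //     chunkSize = divCeil(90, 4) = 23,   nChunks = divCeil(90, 23) = 4,
    // so three writers each handle 23 z-levels and the last writer gets the
    // odd-sized chunk of 90 - 3*23 = 21 levels.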
1537    
1538    
1539        //////////////////////////////////////////////////////////////////
1540      // Return the computed assignments      // Return the computed assignments
1541        // N.B. We are also returning info in "fields[*].chunkWriters"
1542      for (i = 0;  i < numIONodes;  ++i) {      for (i = 0;  i < numIONodes;  ++i) {
1543          for (j = 0;  j < numRanksPerNode;  ++j) {          for (j = 0;  j < numRanksPerNode;  ++j) {
1544              assignments[i*numRanksPerNode + j] =              assignments[i*numRanksPerNode + j] =
# Line 1350  computeIORankAssigments( Line 1546  computeIORankAssigments(
1546          }          }
1547      }      }
1548    
1549    
1550      // Clean up      // Clean up
1551      for (i = 0;  i < numIONodes;  ++i) {      for (i = 0;  i < numIONodes;  ++i) {
1552          free(ioNodeInfo[i].assignedFieldThisCore);          free(ioNodeInfo[i].assignedFieldThisCore);
# Line 1360  computeIORankAssigments( Line 1557  computeIORankAssigments(
1557  //////////////////////////////////////////////////////////////////////////////////  //////////////////////////////////////////////////////////////////////////////////
1558    
1559    
   
   
1560  int  int
1561  isIORank(int commRank, int totalNumNodes, int numIONodes)  isIORank(int commRank, int totalNumNodes, int numIONodes)
1562  {  {
1563      // Figure out if this rank is on a node that will do i/o.      // Figure out if this rank is on a node that will do i/o.
1564      // Note that the i/o nodes are distributed throughout the      // Note that the i/o nodes are distributed throughout the
1565      // task, not clustered together.      // task, not clustered together.
     int ioNodeStride = totalNumNodes / numIONodes;  
1566      int thisRankNode = commRank / numRanksPerNode;      int thisRankNode = commRank / numRanksPerNode;
     int n = thisRankNode / ioNodeStride;  
     return (((n * ioNodeStride) == thisRankNode) && (n < numIONodes)) ? 1 : 0;  
 }  
   
1567    
1568  // Find the number of *physical cores* on the current node      int ioNodeStride = totalNumNodes / numIONodes;
1569  // (ignore hyperthreading).  This should work for pretty much      int extra = totalNumNodes % numIONodes;
 // any Linux based system (and fail horribly for anything else).  
 int  
 getNumCores(void)  
 {  
   return 20;  // until we figure out why this cratered  
1570    
1571      char *magic = "cat /proc/cpuinfo | egrep \"core id|physical id\" | "      int inflectionPoint = (ioNodeStride+1)*extra;
1572                    "tr -d \"\\n\" | sed s/physical/\\\\nphysical/g | "      if (thisRankNode <= inflectionPoint) {
1573                    "grep -v ^$ | sort -u | wc -l";          return (thisRankNode % (ioNodeStride+1)) == 0;
1574        } else {
1575            return ((thisRankNode - inflectionPoint) % ioNodeStride) == 0;
1576        }
1577    
1578      FILE *fp = popen(magic,"r");  }
     ASSERT(fp);  
1579    
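// Editor's illustrative sketch (not part of the original source): a
// self-contained restatement of the stride/"inflection point" arithmetic used
// by isIORank above, with numRanksPerNode factored out so it can be tested in
// isolation.  The helper name is editorial; the usage example assumes <stdio.h>.
static int isIONodeSketch (int node, int totalNumNodes, int numIONodes)
{
    int stride = totalNumNodes / numIONodes;
    int extra  = totalNumNodes % numIONodes;    // strides that are one node longer
    int inflectionPoint = (stride + 1) * extra;
    if (node <= inflectionPoint)
        return (node % (stride + 1)) == 0;
    else
        return ((node - inflectionPoint) % stride) == 0;
}

// e.g. totalNumNodes = 10, numIONodes = 3 picks nodes 0, 4, and 7 as i/o nodes:
//   for (int n = 0;  n < 10;  ++n)
//       if (isIONodeSketch (n, 10, 3)) printf ("node %d does i/o\n", n);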
     int rtnValue = -1;  
       
     int res = fscanf(fp,"%d",&rtnValue);  
1580    
     ASSERT(1==res);  
1581    
1582      pclose(fp);  // Get the number of MPI ranks that are running on one node.
1583    // We assume that the MPI ranks are launched in sequence, filling one
1584    // node before going to the next, and that each node has the same
1585    // number of ranks (except possibly the last node, which is allowed
1586    // to be short).
1587    int
1588    getNumRanksPerNode (MPI_Comm comm)
1589    {
1590        char myHostname[HOST_NAME_MAX+1];
1591        int status, count;
1592        int commSize, commRank;
1593    
1594        MPI_Comm_size (comm, &commSize);
1595        MPI_Comm_rank (comm, &commRank);
1596    
1597        status = gethostname (myHostname, HOST_NAME_MAX);
1598        myHostname[HOST_NAME_MAX] = '\0';
1599        ASSERT (0 == status);
1600    
1601        if (0 == commRank) {
1602            int nBlocks, ii, jj;
1603            assert (allHostnames != NULL);
1604    
1605            // We assume these are already in-order and so don't
1606            // need to be sorted.
1607    
1608            // Count how many ranks have the same hostname as rank 0
1609            for (count = 0;
1610                   (count < commSize) &&
1611                   (strcmp(&(allHostnames[count][0]), myHostname) == 0);
1612                 ++count);
1613            ASSERT (count > 0);
1614    
1615            // Check that the count is consistent for each block of hostnames
1616            // (except possibly the last block, which might not be full).
1617            nBlocks = commSize / count;
1618    
1619            for (jj = 1;  jj < nBlocks;  ++jj) {
1620                char *p = &(allHostnames[jj*count][0]);
1621                for (ii = 0;  ii < count;  ++ii) {
1622                    char *q = &(allHostnames[jj*count + ii][0]);
1623                    ASSERT (strcmp (p, q) == 0);
1624                }
1625            }
1626        }
1627    
1628      ASSERT(rtnValue > 0);      MPI_Bcast (&count, 1, MPI_INT, 0, comm);
1629      return rtnValue;      return count;
1630  }  }
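// Editor's illustrative sketch (not part of the original source): the counting
// step above, reduced to plain C string handling.  Given hostnames gathered in
// launch order, the ranks-per-node is just the length of the leading run of
// entries equal to the first one.  The helper name is editorial.
static int countLeadingRunSketch (char (*names)[HOST_NAME_MAX+1], int n)
{
    int count = 0;
    while ((count < n) && (strcmp (names[count], names[0]) == 0))
        ++count;
    return count;
}

// e.g. names = {"n01","n01","n01","n02","n02","n02"}  ->  3 ranks per node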
1631    
1632    
   
1633  ////////////////////////////////////////////////////////////////////////  ////////////////////////////////////////////////////////////////////////
1634  ////////////////////////////////////////////////////////////////////////  ////////////////////////////////////////////////////////////////////////
1635  // Routines called by the MITgcm code  // Routines called by the MITgcm code
# Line 1418  f1( Line 1644  f1(
1644    int numTiles,    int numTiles,
1645    MPI_Comm *rtnComputeComm)    MPI_Comm *rtnComputeComm)
1646  {  {
1647        // Note that we ignore the argument "numTiles".  This value used to
1648        // be needed, but now we get the information directly from "SIZE.h"
1649    
1650      MPI_Comm newIntracomm, newIntercomm, dupParentComm;      MPI_Comm newIntracomm, newIntercomm, dupParentComm;
1651      int newIntracommRank, thisIsIORank;      int newIntracommRank, thisIsIORank;
1652      int parentSize, parentRank;      int parentSize, parentRank;
# Line 1426  f1( Line 1655  f1(
1655      int i, j, nF, compRoot, fieldIndex, epochStyleIndex;      int i, j, nF, compRoot, fieldIndex, epochStyleIndex;
1656      int totalNumNodes, numComputeNodes, newIntracommSize;      int totalNumNodes, numComputeNodes, newIntracommSize;
1657    
1658    
1659      // Init globals      // Init globals
1660      totalNumTiles = numTiles;  
1661        // Info about the parent communicator
1662        MPI_Initialized(&mpiIsInitialized);
1663        ASSERT(mpiIsInitialized);
1664        MPI_Comm_size(parentComm, &parentSize);
1665        MPI_Comm_rank(parentComm, &parentRank);
1666    
1667        // Find the max MPI "tag" value
1668        MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, &intPtr, &tagUBexists);
1669        ASSERT(tagUBexists);
1670        maxTagValue = *intPtr;
1671    
1672        // Gather all the hostnames to rank zero
1673        char myHostname[HOST_NAME_MAX+1];
1674        int status;
1675        status = gethostname (myHostname, HOST_NAME_MAX);
1676        myHostname[HOST_NAME_MAX] = '\0';
1677        ASSERT (0 == status);
1678        if (parentRank != 0) {
1679            // send my hostname to rank 0
1680            MPI_Gather (myHostname, HOST_NAME_MAX+1, MPI_CHAR,
1681                        NULL, 0, MPI_CHAR, 0, parentComm);
1682        }
1683        else {
1684            allHostnames = malloc (parentSize*(HOST_NAME_MAX+1));
1685            assert (allHostnames != NULL);
1686    
1687            // Collect the hostnames from all the ranks
1688            MPI_Gather (myHostname, HOST_NAME_MAX+1, MPI_CHAR,
1689                        allHostnames, HOST_NAME_MAX+1, MPI_CHAR,
1690                        0, parentComm);
1691        }
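    // Editor's note (illustrative, not part of the original source): MPI_Gather
    // only examines the receive arguments on the root rank, so the two branches
    // above could equivalently be collapsed into a single call in which the
    // non-root ranks simply pass allHostnames == NULL:
    //
    //     MPI_Gather (myHostname, HOST_NAME_MAX+1, MPI_CHAR,
    //                 allHostnames, HOST_NAME_MAX+1, MPI_CHAR,
    //                 0, parentComm);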
1692    
1693      if (numRanksPerNode <= 0) {      if (numRanksPerNode <= 0) {
1694          // Might also want to check for an env var (or something)          // Might also want to check for an env var (or something)
1695          numRanksPerNode = getNumCores();          // N.B.: getNumRanksPerNode uses an MPI collective on the communicator
1696            // and needs the global "allHostnames" to already be set in rank 0.
1697            numRanksPerNode = getNumRanksPerNode(parentComm);
1698      }      }
1699    
1700      // Fill in the zDepth field of the fieldInfoThisEpoch_t descriptors      // Fill in the zDepth field of the fieldInfoThisEpoch_t descriptors
# Line 1452  f1( Line 1716  f1(
1716      }      }
1717    
1718    
     // Find the max MPI "tag" value  
     MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, &intPtr, &tagUBexists);  
     ASSERT(tagUBexists);  
     maxTagValue = *intPtr;  
   
   
     // Info about the parent communicator  
     MPI_Initialized(&mpiIsInitialized);  
     ASSERT(mpiIsInitialized);  
     MPI_Comm_size(parentComm, &parentSize);  
     MPI_Comm_rank(parentComm, &parentRank);  
   
1719    
1720      // Figure out how many nodes we can use for i/o.      // Figure out how many nodes we can use for i/o.
1721      // To make things (e.g. memory calculations) simpler, we want      // To make things (e.g. memory calculations) simpler, we want
# Line 1479  f1( Line 1731  f1(
1731      numIORanks = numIONodes * numRanksPerNode;      numIORanks = numIONodes * numRanksPerNode;
1732    
1733    
1734        // Print out the hostnames, identifying which ones are i/o nodes
1735    
1736        if (0 == parentRank) {
1737            printf ("\n%d total nodes,  %d i/o,  %d compute\n",
1738                    totalNumNodes, numIONodes, numComputeNodes);
1739            for (i = 0;  i < parentSize;  i += numRanksPerNode) {
1740                if (isIORank (i, totalNumNodes, numIONodes)) {
1741                    printf ("\n(%s)", &(allHostnames[i][0]));
1742                } else {
1743                    printf (" %s", &(allHostnames[i][0]));
1744                }
1745            }
1746            printf ("\n\n");
1747        }
1748        fflush (stdout);
1749    
1750    
1751    
1752      // It is surprisingly easy to launch the job with the wrong number      // It is surprisingly easy to launch the job with the wrong number
1753      // of ranks, particularly if the number of compute ranks is not a      // of ranks, particularly if the number of compute ranks is not a
1754      // multiple of numRanksPerNode (the tendency is to just launch one      // multiple of numRanksPerNode (the tendency is to just launch one
# Line 1503  f1( Line 1773  f1(
1773      MPI_Comm_rank(newIntracomm, &newIntracommRank);      MPI_Comm_rank(newIntracomm, &newIntracommRank);
1774    
1775      // Excess ranks disappear      // Excess ranks disappear
1776      // N.B. "parentSize" still includes these ranks.      // N.B. "parentComm" (and parentSize) still includes these ranks, so
1777        // we cannot do MPI *collectives* using parentComm after this point,
1778        // although the communicator can still be used for other things.
1779      if (isExcess == thisRanksType) {      if (isExcess == thisRanksType) {
1780          MPI_Finalize();          MPI_Finalize();
1781          exit(0);          exit(0);
# Line 1547  f1( Line 1819  f1(
1819      // the fields, and create an inter-communicator for each split.      // the fields, and create an inter-communicator for each split.
1820    
1821      for (epochStyleIndex = 0;  epochStyleIndex < numEpochStyles;  ++epochStyleIndex) {      for (epochStyleIndex = 0;  epochStyleIndex < numEpochStyles;  ++epochStyleIndex) {
1822            if (0 == parentRank) {
1823                printf ("Set up epoch style %d\n", epochStyleIndex);
1824            }
1825    
1826          fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex];          fieldInfoThisEpoch_t *thisEpochStyle = epochStyles[epochStyleIndex];
1827          int fieldAssignments[numIORanks];          int fieldAssignments[numIORanks];
1828          int rankAssignments[numComputeRanks + numIORanks];          int rankAssignments[numComputeRanks + numIORanks];
# Line 1554  f1( Line 1830  f1(
1830          // Count the number of fields in this epoch style          // Count the number of fields in this epoch style
1831          for (nF = 0;  thisEpochStyle[nF].dataFieldID != '\0';  ++nF) ;          for (nF = 0;  thisEpochStyle[nF].dataFieldID != '\0';  ++nF) ;
1832    
1833          // Decide how to apportion the i/o ranks among the fields          // Decide how to apportion the i/o ranks among the fields.
1834            // (Currently, this call also sets the "chunkWriters" array.)
1835          for (i=0; i < numIORanks; ++i) fieldAssignments[i] = -1;          for (i=0; i < numIORanks; ++i) fieldAssignments[i] = -1;
1836            int nToWrite[numIORanks/numRanksPerNode];
1837          computeIORankAssigments(numComputeRanks, numIORanks, nF,          computeIORankAssigments(numComputeRanks, numIORanks, nF,
1838                                thisEpochStyle, fieldAssignments);                                thisEpochStyle, fieldAssignments, nToWrite);
1839          // Sanity check          // Sanity check
1840          for (i=0; i < numIORanks; ++i) {          for (i=0; i < numIORanks; ++i) {
1841              ASSERT((fieldAssignments[i] >= 0) && (fieldAssignments[i] < nF));              ASSERT((fieldAssignments[i] >= 0) && (fieldAssignments[i] < nF));
# Line 1579  f1( Line 1857  f1(
1857          }          }
1858          ASSERT(j == numIORanks);          ASSERT(j == numIORanks);
1859    
1860    
1861          if (0 == parentRank) {          if (0 == parentRank) {
1862              printf("\ncpu assignments, epoch style %d\n", epochStyleIndex);              printf ("Create communicators for epoch style %d\n", epochStyleIndex);
             for (i = 0; i < numComputeNodes + numIONodes ; ++i) {  
                 if (rankAssignments[i*numRanksPerNode] >= 0) {  
                     // i/o node  
                     for (j = 0; j < numRanksPerNode; ++j) {  
                         printf(" %1d", rankAssignments[i*numRanksPerNode + j]);  
                     }  
                 } else {  
                     // compute node  
                     for (j = 0; j < numRanksPerNode; ++j) {  
                         if ((i*numRanksPerNode + j) >= (numComputeRanks + numIORanks)) break;  
                         ASSERT(-1 == rankAssignments[i*numRanksPerNode + j]);  
                     }  
                     printf(" #");  
                     for (; j < numRanksPerNode; ++j) {  // "excess" ranks (if any)  
                         printf("X");  
                     }  
                 }  
                 printf(" ");  
             }  
             printf("\n\n");  
1863          }          }
1864            fflush (stdout);
1865    
1866    
1867          // Find the lowest rank assigned to computation; use it as          // Find the lowest rank assigned to computation; use it as
1868          // the "root" for the upcoming intercomm creates.          // the "root" for the upcoming intercomm creates.
# Line 1609  f1( Line 1870  f1(
1870          // Sanity check          // Sanity check
1871          if ((isCompute == thisRanksType) && (0 == newIntracommRank)) ASSERT(compRoot == parentRank);          if ((isCompute == thisRanksType) && (0 == newIntracommRank)) ASSERT(compRoot == parentRank);
1872    
   
1873          // If this is an I/O rank, split the newIntracomm (which, for          // If this is an I/O rank, split the newIntracomm (which, for
1874          // the i/o ranks, is a communicator among all the i/o ranks)          // the i/o ranks, is a communicator among all the i/o ranks)
1875          // into the pieces as assigned.          // into the pieces as assigned.
# Line 1638  f1( Line 1898  f1(
1898                  // ... and dup a separate one for tile registrations                  // ... and dup a separate one for tile registrations
1899                  MPI_Comm_dup(thisEpochStyle[myField].dataIntercomm,                  MPI_Comm_dup(thisEpochStyle[myField].dataIntercomm,
1900                           &(thisEpochStyle[myField].registrationIntercomm));                           &(thisEpochStyle[myField].registrationIntercomm));
1901    
1902    
1903                    // Sanity check: make sure the chunkWriters array looks ok.
1904                    {
1905                        int ii, jj, commSize, nChunks = thisEpochStyle[myField].nChunks;
1906                        ASSERT (nChunks > 0);
1907                        ASSERT (thisEpochStyle[myField].chunkWriters != NULL);
1908                        MPI_Comm_size (thisEpochStyle[myField].dataIntercomm, &commSize);
1909                        for (ii = 0;  ii < nChunks;  ++ii) {
1910                            ASSERT (thisEpochStyle[myField].chunkWriters[ii] >=  0);
1911                            ASSERT (thisEpochStyle[myField].chunkWriters[ii] < commSize);
1912                            for (jj = ii+1;  jj < nChunks;  ++jj) {
1913                                ASSERT (thisEpochStyle[myField].chunkWriters[ii] !=
1914                                        thisEpochStyle[myField].chunkWriters[jj]);
1915                            }
1916                        }
1917                    }
1918              }              }
1919          }          }
1920          else {          else {
# Line 1660  f1( Line 1937  f1(
1937              }              }
1938          }          }
1939    
1940    
1941            // Print a "map" of the core assignments.  Compute nodes are shown
1942            // as a single "#" for the whole node; i/o nodes have the individual
1943            // cores within the node broken out, with each core's field
1944            // assignment printed.  If the core writes to disk, its field
1945            // assignment is printed in parentheses.
1946            if (0 == parentRank) {
1947                int fieldIOCounts[nF];
1948                memset (fieldIOCounts, 0, nF*sizeof(int));
1949    
1950                printf ("Writer assignments, epoch style %d\n", epochStyleIndex);
1951                for (i = 0;  i < nF;  ++i) {
1952                    fieldInfoThisEpoch_t *p = &(thisEpochStyle[i]);
1953                    int chunkSize = divCeil(p->zDepth,p->nChunks);
1954                    printf ("\n");
1955                    printf ( "field %2d ('%c'): %1d levels, %1d writers, %1d"
1956                             " levels per writer (last writer = %d levels)\n",
1957                            i, p->dataFieldID, p->zDepth, p->nChunks,
1958                            chunkSize, p->zDepth - ((p->nChunks - 1)*chunkSize));
1959                    for (j = 0;  j < thisEpochStyle[i].nChunks;  ++j) {
1960                        printf (" %1d", thisEpochStyle[i].chunkWriters[j]);
1961                    }
1962                    printf ("\n");
1963                }
1964                printf ("\n");
1965    
1966                printf("\ncpu assignments, epoch style %d\n", epochStyleIndex);
1967                int whichIONode = -1;
1968                for (i = 0; i < numComputeNodes + numIONodes ; ++i) {
1969    
1970                    if (rankAssignments[i*numRanksPerNode] >= 0) {
1971    
1972                        // i/o node
1973                        ++whichIONode;
1974                        printf("\n%s (%d Z, %ld bytes):",
1975                               &(allHostnames[i*numRanksPerNode][0]),
1976                               nToWrite[whichIONode],
1977                               nToWrite[whichIONode]*twoDFieldSizeInBytes);
1978    
1979                        for (j = 0; j < numRanksPerNode; ++j) {
1980    
1981                            int assignedField = rankAssignments[i*numRanksPerNode + j];
1982                            int k, commRank, nChunks, isWriter;
1983                            nChunks = thisEpochStyle[assignedField].nChunks;
1984                            commRank = fieldIOCounts[assignedField];
1985    
1986                            isWriter = 0;
1987                            for (k = 0;  k < nChunks;  ++k) {
1988                                if (thisEpochStyle[assignedField].chunkWriters[k] == commRank) {
1989                                    isWriter = 1;
1990                                    break;
1991                                }
1992                            }
1993                            printf(isWriter ? " (%1d)" : "  %1d ", assignedField);
1994    
1995                            fieldIOCounts[assignedField] += 1;
1996                        }
1997                        printf("\n");
1998    
1999                    } else {
2000    
2001                        // compute node
2002                        for (j = 0; j < numRanksPerNode; ++j) {
2003                            if ((i*numRanksPerNode + j) >= (numComputeRanks + numIORanks)) break;
2004                            ASSERT(-1 == rankAssignments[i*numRanksPerNode + j]);
2005                        }
2006                        printf(" #");
2007                        for (; j < numRanksPerNode; ++j) {  // "excess" ranks (if any)
2008                            printf("X");
2009                        }
2010                    }
2011                    printf(" ");
2012                }
2013                printf("\n\n");
2014            }
2015    
2016    
2017    
2018      } // epoch style loop      } // epoch style loop
2019    
2020        // Note: only non-null in rank 0, but it's ok to "free" NULL
2021        free(allHostnames);
2022    
2023    
2024      // I/O processes split off and start receiving data      // I/O processes split off and start receiving data
2025      // NOTE: the I/O processes DO NOT RETURN from this call      // NOTE: the I/O processes DO NOT RETURN from this call
2026      if (isIO == thisRanksType) ioRankMain(totalNumTiles);      if (isIO == thisRanksType) ioRankMain();
2027    
2028    
2029      // The "compute" processes now return with their new intra-communicator.      // The "compute" processes now return with their new intra-communicator.
# Line 1674  f1( Line 2032  f1(
2032      // but first, call mpiio initialization routine!      // but first, call mpiio initialization routine!
2033      initSizesAndTypes();      initSizesAndTypes();
2034    
2035        fflush (stdout);
2036      return;      return;
2037  }  }
2038    
# Line 1776  beginNewEpoch(int newEpochID, int gcmIte Line 2135  beginNewEpoch(int newEpochID, int gcmIte
2135          abort();          abort();
2136      }      }
2137    
2138    
2139    
2140        // not necessary for correctness, but for better timings
2141        MPI_Barrier(computeIntracomm);
2142        if (0 == computeIntracommRank)
2143          fprintf(stderr,"compute ready to emit %d %d %f\n",newEpochID,gcmIter,MPI_Wtime());
2144    
2145    
2146    
2147      ////////////////////////////////////////////////////////////////////////      ////////////////////////////////////////////////////////////////////////
2148      // Need to be sure the i/o tasks are done processing the previous epoch      // Need to be sure the i/o tasks are done processing the previous epoch
2149      // before any compute tasks start sending tiles from a new epoch.      // before any compute tasks start sending tiles from a new epoch.
# Line 1795  beginNewEpoch(int newEpochID, int gcmIte Line 2163  beginNewEpoch(int newEpochID, int gcmIte
2163      // Compute ranks wait here until rank 0 gets the ack from the i/o ranks      // Compute ranks wait here until rank 0 gets the ack from the i/o ranks
2164      MPI_Barrier(computeIntracomm);      MPI_Barrier(computeIntracomm);
2165    
2166        if (0 == computeIntracommRank)
2167          fprintf(stderr,"compute handshake completed %d %d %f\n",newEpochID,gcmIter,MPI_Wtime());
2168    
2169    
2170      currentEpochID = newEpochID;      currentEpochID = newEpochID;
2171      currentEpochStyle = epochStyle;      currentEpochStyle = epochStyle;
# Line 1835  f3(char dataFieldID, int tileID, int epo Line 2206  f3(char dataFieldID, int tileID, int epo
2206              if (p->dataFieldID == dataFieldID) break;              if (p->dataFieldID == dataFieldID) break;
2207          }          }
2208          // Make sure we found a valid field          // Make sure we found a valid field
   
2209          ASSERT(p->dataIntercomm != MPI_COMM_NULL);          ASSERT(p->dataIntercomm != MPI_COMM_NULL);
2210    
2211          MPI_Comm_remote_size(p->dataIntercomm, &remoteCommSize);          MPI_Comm_remote_size(p->dataIntercomm, &remoteCommSize);
# Line 1847  f3(char dataFieldID, int tileID, int epo Line 2217  f3(char dataFieldID, int tileID, int epo
2217      ASSERT(p->dataFieldID == dataFieldID);      ASSERT(p->dataFieldID == dataFieldID);
2218    
2219      receiver = (computeIntracommRank + p->tileCount) % remoteCommSize;      receiver = (computeIntracommRank + p->tileCount) % remoteCommSize;
   
2220      tag = (tileID << numCheckBits) | (epochID & bitMask);      tag = (tileID << numCheckBits) | (epochID & bitMask);
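    // Editor's worked example (not part of the original source), assuming for
    // illustration that numCheckBits == 2 and bitMask == 0x3:
    //     tileID = 189, epochID = 5  ->  tag = (189 << 2) | (5 & 3) = 757,
    // and the receiving i/o rank can recover tileID = 757 >> 2 = 189 while
    // checking that (757 & 3) matches the low bits of the current epoch.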
2221    
     //fprintf(stderr,"%d %d\n",flag,tileID);  
   
     /*  
     if (tileID==189){  
       int i,j;  
       for (i=0;i<TILE_Y;++i){  
         for (j=0;j<TILE_X;++j)  
           fprintf(stderr,"%f ",((double*)data)[872+i*108+j]);  
         fprintf(stderr,"\n");  
       }  
     }  
           
     */  
   
   
   
2222      FPRINTF(stderr,"Rank %d sends field '%c', tile %d, to i/o task %d with tag %d(%d)\n",      FPRINTF(stderr,"Rank %d sends field '%c', tile %d, to i/o task %d with tag %d(%d)\n",
2223                     computeIntracommRank, dataFieldID, tileID, receiver, tag, tag >> numCheckBits);                     computeIntracommRank, dataFieldID, tileID, receiver, tag, tag >> numCheckBits);
2224    
# Line 1901  f4(int epochID) Line 2254  f4(int epochID)
2254    
2255    
2256    
 void myasyncio_set_global_sizes_(int *nx, int *ny, int *nz,  
                                  int *nt, int *nb,  
                                  int *tx, int *ty,  
                                  int *ox, int *oy)  
 {  
   
   
   int data[] = {*nx,*ny,*nz,*nt,*nb,*tx,*ty,*ox,*oy};  
     
   int items = sizeof(data)/sizeof(int);  
   
   
   NTILES = *nt;  // total number of tiles  
   
   
   TILE_X = *tx;  
   TILE_Y = *ty;  
   XGHOSTS = *ox;  
   YGHOSTS = *oy;  
   tileOneZLevelItemCount = ((TILE_X + 2*XGHOSTS) * (TILE_Y + 2*YGHOSTS));  
   tileOneZLevelSizeInBytes = (tileOneZLevelItemCount * datumSize);    // needed by compute ranks in f3  
   
     
   int rank,ierr;  
   MPI_Comm_rank(globalIntercomm,&rank);  
     
   
   if (0==rank)  
     printf("%d %d %d\n%d %d\n%d %d\n%d %d\n",*nx,*ny,*nz,*nt,*nb,*tx,*ty,*ox,*oy);  
   
   
   /*  
   if (0==rank)  
     ierr=MPI_Bcast(&NTILES,1,MPI_INT,MPI_ROOT,globalIntercomm);  
   else  
     ierr=MPI_Bcast(&NTILES,1,MPI_INT,MPI_PROC_NULL,globalIntercomm);  
   */  
   
   if (0==rank)  
     ierr=MPI_Bcast(data,items,MPI_INT,MPI_ROOT,globalIntercomm);  
   else  
     ierr=MPI_Bcast(data,items,MPI_INT,MPI_PROC_NULL,globalIntercomm);  
     
 }  
   
2257  void asyncio_tile_arrays_(int *xoff, int *yoff, int *xskip)  void asyncio_tile_arrays_(int *xoff, int *yoff, int *xskip)
2258  {  {
2259      int rank,ierr;      int rank, ierr, rootProc;
     MPI_Comm_rank(globalIntercomm,&rank);  
   
     if (0==rank)  
       ierr=MPI_Bcast(xoff,NTILES,MPI_INT,MPI_ROOT,globalIntercomm);  
     else  
       ierr=MPI_Bcast(xoff,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm);  
   
     if (0==rank)  
       ierr=MPI_Bcast(yoff,NTILES,MPI_INT,MPI_ROOT,globalIntercomm);  
     else  
       ierr=MPI_Bcast(yoff,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm);  
2260    
2261      if (0==rank)      MPI_Comm_rank(globalIntercomm,&rank);
2262        ierr=MPI_Bcast(xskip,NTILES,MPI_INT,MPI_ROOT,globalIntercomm);      rootProc = (0 == rank) ? MPI_ROOT : MPI_PROC_NULL;
     else  
       ierr=MPI_Bcast(xskip,NTILES,MPI_INT,MPI_PROC_NULL,globalIntercomm);  
2263    
2264        ierr = MPI_Bcast (xoff, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm);
2265        ierr = MPI_Bcast (yoff, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm);
2266        ierr = MPI_Bcast (xskip, TOTAL_NUM_TILES, MPI_INT, rootProc, globalIntercomm);
2267  }  }
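// Editor's note (illustrative, not part of the original source): these are
// broadcasts over an intercommunicator, so within the sending (compute) group
// rank 0 passes MPI_ROOT and the remaining ranks pass MPI_PROC_NULL, while the
// ranks in the remote (i/o) group receive with a matching call that names the
// sender's rank as the root, e.g. (hypothetical receive side):
//
//     MPI_Bcast (xoff, TOTAL_NUM_TILES, MPI_INT, 0, globalIntercomm);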
2268    
2269    
# Line 1979  void Line 2277  void
2277  bron_f1(  bron_f1(
2278    MPI_Fint *ptr_parentComm,    MPI_Fint *ptr_parentComm,
2279    int *ptr_numComputeRanks,    int *ptr_numComputeRanks,
2280    int *ptr_totalNumTiles,    int *ptr_numTiles,
2281    MPI_Fint *rtnComputeComm)    MPI_Fint *rtnComputeComm)
2282  {  {
2283      // Convert the Fortran handle into a C handle      // Convert the Fortran handle into a C handle
2284      MPI_Comm newComm, parentComm = MPI_Comm_f2c(*ptr_parentComm);      MPI_Comm newComm, parentComm = MPI_Comm_f2c(*ptr_parentComm);
2285    
2286      f1(parentComm, *ptr_numComputeRanks, *ptr_totalNumTiles, &newComm);      f1(parentComm, *ptr_numComputeRanks, *ptr_numTiles, &newComm);
2287    
2288      // Convert the C handle into a Fortran handle      // Convert the C handle into a Fortran handle
2289      *rtnComputeComm = MPI_Comm_c2f(newComm);      *rtnComputeComm = MPI_Comm_c2f(newComm);
# Line 2021  bron_f4(int *ptr_epochID) Line 2319  bron_f4(int *ptr_epochID)
2319      f4(*ptr_epochID);      f4(*ptr_epochID);
2320  }  }
2321    
2322    
