ESDM
Middleware for Earth System Data
esdm-datatypes-internal.h
1 #ifndef ESDM_DATATYPES_INTERNAL_H
2 #define ESDM_DATATYPES_INTERNAL_H
3 
4 #include <jansson.h>
5 #include <glib.h>
6 #include <stdatomic.h>
7 #include <stdbool.h>
8 
9 #include <esdm-datatypes.h>
10 #include <smd-datatype.h>
11 
12 #ifdef HAVE_SCIL
13 #include <scil.h>
14 #else
15 // fake definition of hints
17  int missing;
18 };
19 #endif
20 
21 typedef struct esdm_grid_t esdm_grid_t;
22 typedef struct estream_write_t estream_write_t;
23 
24 enum { ESDM_ID_LENGTH = 23 }; //= strlen(id), to allocate the buffers, add one byte for the termination
25 
// State of an object's data in memory relative to its on-disk copy.
// Used as the type of the `status` fields of the structures declared in this header.
// The enumerators carry no explicit values, so their order determines their numeric values.
26 enum esdm_data_status_e {
27  ESDM_DATA_NOT_LOADED, //no data in memory, on-disk state is unspecified
28  ESDM_DATA_DIRTY, //data in memory is different from data on disk
29  ESDM_DATA_PERSISTENT, //data is in memory, on-disk data is equal to data in memory
30  ESDM_DATA_DELETED //no data in memory, no data on disk, and no data should be added in any way, the object is a zombie
31 };
32 
33 typedef enum esdm_data_status_e esdm_data_status_e;
34 
36  esdm_dataset_t ** dset;
37  int count;
38  int buff_size;
39 };
40 
42  char *name;
43  smd_attr_t *attr;
44  esdm_datasets_t dsets;
45 
46  int refcount;
47  esdm_data_status_e status;
48  int mode_flags; // set via esdm_mode_flags_e
49 };
50 
52 
54  GHashTable* table;
55 };
56 
57 typedef struct esdm_fragments_t esdm_fragments_t;
58 
60  char *name;
61  char *id;
62  char **dims_dset_id; // array of variable names != NULL if set
63  esdm_container_t *container;
64  esdm_dataspace_t *dataspace;
65  smd_attr_t *fill_value; // use for read of not-written data, if set
66  smd_attr_t *attr;
67  int64_t *actual_size; // used for unlimited dimensions
68  esdm_fragments_t fragments;
69  int64_t gridCount, incompleteGridCount, gridSlotCount;
70  esdm_grid_t** grids; //This array first contains the complete grids, then the grids that still lack some subgrids/fragments, and finally some pointers that are allocated but not used.
71  //When a grid is completed, it is swapped with the first incomplete grid and the grid counts are adjusted accordingly. This should be more efficient than managing two separate arrays.
72  int refcount;
73  esdm_data_status_e status;
74  int mode_flags; // set via esdm_mode_flags_e
75  scil_user_hints_t * chints; // compression hints from SCIL, NULL if none available
76 };
77 
79  char * id;
80  esdm_dataset_t *dataset;
81  esdm_dataspace_t *dataspace;
82  esdm_backend_t *backend;
83  void * backend_md; // backend-specific metadata if set
84  void * buf;
85  size_t elements;
86  size_t bytes; // expected size in bytes
87  size_t actual_bytes; // actual size, can differ from the expected size (`bytes`) due to compression
88  //int direct_io;
89  esdm_data_status_e status;
90  bool ownsBuf; //If true, the fragment is responsible to free the buffer when it's destructed or unloaded. Otherwise, `buf` is just a reference for zero copy writing.
91 };
92 
93 // MODULES ////////////////////////////////////////////////////////////////////
94 
// Tags a module as either a data module or a metadata module (per the enumerator names).
// Used as the type of the `type` and `module` fields declared further down in this header.
103 typedef enum esdm_module_type_t {
104  ESDM_MODULE_DATA,
105  ESDM_MODULE_METADATA
106 } esdm_module_type_t;
107 
108 // Callbacks
124  // General for ESDM
125  int (*finalize)(esdm_backend_t * b);
126 
127  int (*performance_estimate)(esdm_backend_t * b, esdm_fragment_t *fragment, float *out_time);
128  float (*estimate_throughput) (esdm_backend_t * b);
129 
130  int (*fragment_create) (esdm_backend_t * b, esdm_fragment_t *fragment);
131  int (*fragment_retrieve)(esdm_backend_t * b, esdm_fragment_t *fragment);
132  int (*fragment_update) (esdm_backend_t * b, esdm_fragment_t *fragment);
133  int (*fragment_delete) (esdm_backend_t * b, esdm_fragment_t *fragment);
134 
135  int (*fragment_metadata_create)(esdm_backend_t * b, esdm_fragment_t *fragment, smd_string_stream_t* stream);
136  void* (*fragment_metadata_load)(esdm_backend_t * b, esdm_fragment_t *fragment, json_t *metadata);
137  int (*fragment_metadata_free) (esdm_backend_t * b, void * options);
138 
139  int (*mkfs)(esdm_backend_t * b, int format_flags);
140  int (*fsck)(esdm_backend_t * b);
141 
142  // write streaming functions
157  //TODO: I find the semantics of `cur_buf` and `cur_offset` surprising. Imho, we should redesign this call, possibly splitting it into two or three functions.
158  int (*fragment_write_stream_blocksize)(esdm_backend_t * b, estream_write_t * state, void * cur_buf, size_t cur_offset, uint64_t cur_size);
159 };
160 
162  // General for ESDM
163  int (*finalize)(esdm_md_backend_t *);
164  int (*performance_estimate)(esdm_md_backend_t *, esdm_fragment_t *fragment, float *out_time);
165 
166  // ESDM Data Model Specific
167  int (*container_create)(esdm_md_backend_t *, esdm_container_t *container, int allow_overwrite);
168  int (*container_commit)(esdm_md_backend_t *, esdm_container_t *container, char * json, int md_size);
169  int (*container_retrieve)(esdm_md_backend_t *, esdm_container_t *container, char ** out_json, int * out_size);
170  int (*container_update)(esdm_md_backend_t *, esdm_container_t *container);
171  int (*container_destroy)(esdm_md_backend_t *, esdm_container_t *container);
172  int (*container_remove)(esdm_md_backend_t *, esdm_container_t *container);
173 
174  int (*dataset_create)(esdm_md_backend_t *, esdm_dataset_t *dataset);
175  int (*dataset_commit)(esdm_md_backend_t *, esdm_dataset_t *dataset, char * json, int md_size);
176  int (*dataset_retrieve)(esdm_md_backend_t *, esdm_dataset_t *dataset, char ** out_json, int * out_size);
177  int (*dataset_update)(esdm_md_backend_t *, esdm_dataset_t *dataset);
178  int (*dataset_destroy)(esdm_md_backend_t *, esdm_dataset_t *dataset);
179  int (*dataset_remove)(esdm_md_backend_t *, esdm_dataset_t *dataset);
180 
181  int (*mkfs)(esdm_md_backend_t *, int format_flags);
182  int (*fsck)(esdm_md_backend_t*);
183 };
184 
// Selects how data is split into fragments.
// Used as the type of the `fragmentation_method` configuration field declared further down in this header.
185 typedef enum esdmI_fragmentation_method_t {
186  ESDMI_FRAGMENTATION_METHOD_CONTIGUOUS, //fragments are optimized for memory locality, may result in fragments that are single slices or even lines of the hypervolume
187  ESDMI_FRAGMENTATION_METHOD_EQUALIZED //all dimensions are treated equally, creating fragments that extend in all available dimensions
188 } esdmI_fragmentation_method_t;
189 
198  esdm_config_backend_t *config;
199  char *name;
200  esdm_module_type_t type;
201  char *version; // 0.0.0
202  void *data; /* backend-specific data. */
203  //uint32_t blocksize; /* any io must be multiple of 'blocksize' and aligned. */
204  esdm_backend_t_callbacks_t callbacks;
205  int threads;
206  GThreadPool *threadPool;
207 };
208 
210  esdm_config_backend_t *config;
211  char *name;
212  char *version;
213  void *data;
214  esdm_md_backend_callbacks_t callbacks;
215 };
216 
// Direction of an I/O operation; stored in the `op` field of io_work_t.
// ESDM_OP_WRITE is pinned to 0 explicitly, ESDM_OP_READ takes the next value.
217 typedef enum io_operation_t {
218  ESDM_OP_WRITE = 0,
219  ESDM_OP_READ
220 } io_operation_t;
221 
222 typedef struct io_request_status_t {
223  atomic_int pending_ops;
224  GMutex mutex;
225  GCond done_condition;
226  int return_code;
228 
229 typedef struct {
230  void *mem_buf;
231  esdm_dataspace_t *buf_space;
233 
234 typedef struct io_work_t io_work_t;
235 
// Describes one I/O operation on a single fragment.
// NOTE(review): presumably one of these is created per fragment and handed to a worker thread — confirm against the scheduler code.
236 struct io_work_t {
237  esdm_fragment_t *fragment; //the fragment this work item refers to
238  io_operation_t op; //ESDM_OP_WRITE or ESDM_OP_READ
239  esdm_status return_code; //result slot for this work item
240  io_request_status_t *parent; //presumably the status object of the request this work item belongs to — TODO confirm
241  void (*callback)(io_work_t *work); //function pointer that receives the work item itself
243 };
244 
246 // INTERNAL
248 
249 // Organisation structures of core components /////////////////////////////////
250 
251 // Configuration
253  const char *type;
254  const char *id;
255  const char *target;
256 
257  int max_threads_per_node;
258  int max_global_threads;
259  uint64_t max_fragment_size; //this is a soft limit that may be exceeded anytime
260  esdmI_fragmentation_method_t fragmentation_method;
261  data_accessibility_t data_accessibility;
262  uint32_t write_stream_blocksize; /* size in bytes for enabling write streaming, 0 if disabled */
263 
264  json_t *performance_model;
265  json_t *esdm;
266  json_t *backend;
267 };
268 
270  int count;
271  esdm_config_backend_t **backends;
272 };
273 
275 
276 // Modules
279  int count;
280  esdm_module_type_t *module;
281 };
282 
283 // Scheduler
// Declared under the "Scheduler" heading. Both fields are plain ints and their meaning is not documented in this header.
// NOTE(review): looks like a placeholder type — confirm whether it is still used anywhere.
284 typedef struct esdm_io_t esdm_io_t;
285 struct esdm_io_t {
286  int member;
287  int callback;
288 };
289 
290 // Entry points and state for core components /////////////////////////////////
291 
// Selector values for the `boundListImplementation` field of esdm_config_t.
// Judging by the names, ARRAY refers to esdmI_boundArray_t and BTREE to esdmI_boundTree_t — TODO confirm at the point where the selector is evaluated.
292 enum {
293  BOUND_LIST_IMPLEMENTATION_ARRAY = 0,
294  BOUND_LIST_IMPLEMENTATION_BTREE = 1
295 };
296 
// Global configuration state. The per-instance structure further down holds a pointer to it in its `config` field.
297 typedef struct esdm_config_t {
298  void *json; //untyped handle; presumably the parsed configuration document — TODO confirm the concrete type
299  uint8_t boundListImplementation; //one of the BOUND_LIST_IMPLEMENTATION_* constants
300 } esdm_config_t;
301 
302 typedef struct esdm_modules_t {
303  int data_backend_count;
304  esdm_backend_t **data_backends;
305  esdm_md_backend_t *metadata_backend;
306  //esdm_modules_t** modules;
308 
// State of the layout component. The per-instance structure further down holds a pointer to it in its `layout` field.
// NOTE(review): the only member is an undocumented int — looks like a placeholder, confirm.
309 typedef struct esdm_layout_t {
310  int info;
311 } esdm_layout_t;
312 
313 typedef struct esdm_scheduler_t {
314  int info;
315  GThreadPool *thread_pool;
316  GAsyncQueue *read_queue;
317  GAsyncQueue *write_queue;
319 
320 typedef struct esdm_performance_t {
321  int info;
323 
324 typedef struct esdm_instance_t esdm_instance_t;
325 
327  int is_initialized;
328  int procs_per_node;
329  int total_procs;
330  esdm_statistics_t readStats;
331  esdm_statistics_t writeStats;
332  esdm_config_t *config;
333  esdm_modules_t *modules;
334  esdm_layout_t *layout;
335  esdm_scheduler_t *scheduler;
336  esdm_performance_t *performance;
337 };
338 
339 // Auxiliary? /////////////////////////////////////////////////////////////////
340 
// Typedef for struct esdm_bytesequence_t, spelled like the struct tag (as every other typedef in this header is).
typedef struct esdm_bytesequence_t esdm_bytesequence_t;
// Historical misspelling of the typedef name, kept so that code already using it continues to compile.
typedef struct esdm_bytesequence_t esdm_bytesyquence_t;
343  esdm_type_t type;
344  size_t count;
345  void *data;
346 };
347 
348 typedef struct esdmI_range_t esdmI_range_t;
350  int64_t start, end; //start is inclusive, end is exclusive, i.e. the range includes all `x` with `start <= x < end`
351 };
352 
353 typedef struct esdmI_hypercube_t esdmI_hypercube_t;
355  int64_t dims;
356  esdmI_range_t ranges[];
357 };
358 
359 //Plain old data object. Neither owns the hypercubes, nor the memory in which their pointers are stored.
360 typedef struct esdmI_hypercubeList_t {
361  esdmI_hypercube_t** cubes;
362  int64_t count;
364 
365 //A hypercubeList that actually owns its memory, allowing it to grow/shrink as needed.
369  int64_t allocatedCount;
370 };
371 
372 //helper for the esdmI_boundList_t implementations
375  int64_t bakedBound; //the LSB is used to store whether the bound is a start or end bound (1 == start bound), the actual bound is shifted left one bit
376  int64_t cubeIndex;
377 };
378 
379 typedef struct esdmI_boundTree_t esdmI_boundTree_t;
380 typedef union esdmI_boundIterator_t {
381  struct {
382  esdmI_boundListEntry_t* entry;
383  } arrayIterator;
384  struct {
385  esdmI_boundTree_t* node;
386  int entryPosition;
387  } treeIterator;
389 
392  void (*add)(void* me, int64_t bound, bool isStart, int64_t cubeIndex);
393  esdmI_boundListEntry_t* (*findFirst)(void* me, int64_t bound, bool isStart, esdmI_boundIterator_t* out_iterator);
394  esdmI_boundListEntry_t* (*nextEntry)(void* me, esdmI_boundIterator_t* inout_iterator);
395  void (*destruct)(void* me);
396 };
397 
398 typedef struct esdmI_boundList_t esdmI_boundList_t;
400  esdmI_boundList_vtable_t* vtable;
401 };
402 
403 static inline void boundList_add(esdmI_boundList_t* me, int64_t bound, bool isStart, int64_t cubeIndex) { me->vtable->add(me, bound, isStart, cubeIndex); }
404 static inline esdmI_boundListEntry_t* boundList_findFirst(esdmI_boundList_t* me, int64_t bound, bool isStart, esdmI_boundIterator_t* out_iterator) { return me->vtable->findFirst(me, bound, isStart, out_iterator); }
405 static inline esdmI_boundListEntry_t* boundList_nextEntry(esdmI_boundList_t* me, esdmI_boundIterator_t* inout_iterator) { return me->vtable->nextEntry(me, inout_iterator); }
406 static inline void boundList_destruct(esdmI_boundList_t* me) { me->vtable->destruct(me); }
407 
408 //helper for esdmI_hypercubeNeighbourManager_t
409 //The intention of implementing this as a class of its own is to facilitate changing the data structure
410 //from a linear sorted list with binary search to some balanced tree in the future.
411 //
412 //This is private to esdmI_hypercubeNeighbourManager_t.
415  esdmI_boundList_t super;
416  esdmI_boundListEntry_t* entries;
417  int64_t count, allocatedCount;
418 };
419 
420 //Stores an index of bounds in the form of a B-tree.
421 //The max of 21 puts the sizeof(esdmI_boundTree_t) at 512 bytes, which is exactly eight cache lines.
422 //This is a tuning parameter that might call for other values on other machines than mine.
423 //
424 //XXX: The motivation for this rather complex structure over the simple array in `esdmI_boundArray_t` is that the latter has a quadratic complexity.
425 // While the simple array access outperforms the more complicated data structure for small hypercube counts,
426 // the B-tree outperforms the simple array when we have a couple of thousand entries in the list.
427 // We simply cannot tolerate quadratic complexities when the N is controlled by HPC applications...
428 #define BOUND_TREE_MAX_BRANCH_FACTOR 21
429 #define BOUND_TREE_MAX_ENTRY_COUNT (BOUND_TREE_MAX_BRANCH_FACTOR - 1)
431  esdmI_boundList_t super;
432  int64_t entryCount;
433  esdmI_boundListEntry_t bounds[BOUND_TREE_MAX_ENTRY_COUNT];
434  esdmI_boundTree_t* children[BOUND_TREE_MAX_BRANCH_FACTOR];
435  esdmI_boundTree_t* parent;
436 };
437 
438 //another helper for esdmI_hypercubeNeighbourManager_t
441  int64_t neighbourCount, allocatedCount, *neighbourIndices;
442 };
443 
444 //A hypercubeList that owns its memory, and which keeps track of the neighbourhood relations between the different hypercubes.
447  int64_t allocatedCount;
448  int64_t dims; //All hypercubes in the list must be of the same rank.
449 
450  esdmI_neighbourList_t* neighbourLists; //`list->count` entries, space for `allocatedCount` entries
451 
452  esdmI_boundList_t* boundLists[]; //one esdmI_boundList_t per dimension
453 };
454 
455 typedef struct esdm_readTimes_t esdm_readTimes_t;
457  double makeSet; //the time to determine the sets of fragments that need to be fetched from disk
458  double coverageCheck; //the time needed to check whether the available fragments cover the requested regions
459  double enqueue; //the time needed to queue the read requests
460  double completion; //the time spent waiting for background tasks to complete
461  double writeback; //the time spent writing back fragments after transposition/composition into the user requested data layout
462  double total; //sum of all the times above and other small things like taking times...
463 };
464 
465 typedef struct esdm_writeTimes_t esdm_writeTimes_t;
467  double backendDistribution; //the time spent deciding which backend should handle which parts of the data
468  double backendDispatch; //the time spent sending fragments to the different backends
469  double completion; //the time spent waiting for background tasks to complete
470  double total; //sum of all the times above and other small things like taking times...
471 };
472 
474 typedef struct esdm_copyTimes_t esdm_copyTimes_t;
476  double planning; //the time spent analysing the dataspaces to determine what needs to be done
477  double execution; //the time spent to actually move the data
478  double total; //sum of all the times above and other small things like taking times...
479 };
480 
481 //timers for each of the different backend interface functions
484  double finalize;
485  double performance_estimate;
486  double estimate_throughput;
487  double fragment_create;
488  double fragment_retrieve;
489  double fragment_update;
490  double fragment_delete;
491  double fragment_metadata_create;
492  double fragment_metadata_load;
493  double fragment_metadata_free;
494  double mkfs;
495  double fsck;
496  double fragment_write_stream_blocksize;
497 };
498 
499 //statistics for the handling of fragments
502  double fragmentAdding;
503  double fragmentLookup;
504  double metadataCreation;
505  double setCreation;
506 
507  int64_t fragmentAddCalls;
508  int64_t fragmentLookupCalls;
509  int64_t metadataCreationCalls;
510  int64_t setCreationCalls;
511 };
512 
513 #endif
Datatype primitives provided by ESDM.
esdm_status
Definition: esdm-datatypes.h:37
Definition: esdm-datatypes-internal.h:414
Definition: esdm-datatypes-internal.h:374
Definition: esdm-datatypes-internal.h:399
Definition: esdm-datatypes-internal.h:391
Definition: esdm-datatypes-internal.h:430
Definition: esdm-datatypes-internal.h:360
Definition: esdm-datatypes-internal.h:445
Definition: esdm-datatypes-internal.h:367
Definition: esdm-datatypes-internal.h:354
Definition: esdm-datatypes-internal.h:440
Definition: esdm-datatypes-internal.h:349
Definition: esdm-datatypes-internal.h:483
Definition: esdm-datatypes-internal.h:123
int(* fragment_write_stream_blocksize)(esdm_backend_t *b, estream_write_t *state, void *cur_buf, size_t cur_offset, uint64_t cur_size)
Definition: esdm-datatypes-internal.h:158
Definition: esdm-datatypes-internal.h:197
Definition: esdm-datatypes-internal.h:342
Definition: esdm-datatypes-internal.h:252
Definition: esdm-datatypes-internal.h:269
Definition: esdm-datatypes-internal.h:297
Definition: esdm-datatypes-internal.h:41
Definition: esdm-datatypes-internal.h:475
Definition: esdm-datatypes-internal.h:59
Definition: esdm-datatypes-internal.h:35
Definition: esdm-datatypes.h:81
Definition: esdm-datatypes-internal.h:78
Definition: esdm-datatypes-internal.h:501
Definition: esdm-datatypes-internal.h:53
Definition: esdm-grid.c:18
Definition: esdm-datatypes-internal.h:326
Definition: esdm-datatypes-internal.h:285
Definition: esdm-datatypes-internal.h:309
Definition: esdm-datatypes-internal.h:161
Definition: esdm-datatypes-internal.h:209
Definition: esdm-datatypes-internal.h:278
Definition: esdm-datatypes-internal.h:302
Definition: esdm-datatypes-internal.h:320
Definition: esdm-datatypes-internal.h:456
Definition: esdm-datatypes-internal.h:313
Definition: esdm-datatypes.h:101
Definition: esdm-datatypes-internal.h:466
Definition: esdm-stream.h:158
Definition: esdm-datatypes-internal.h:222
Definition: esdm-datatypes-internal.h:229
Definition: esdm-datatypes-internal.h:236
Definition: esdm-datatypes-internal.h:16
Definition: esdm-datatypes-internal.h:380