Skip to content

Commit 7ad35b7

Browse files
committed
updates for dataset reads/writes
1 parent b68e967 commit 7ad35b7

File tree

9 files changed

+220
-78
lines changed

9 files changed

+220
-78
lines changed

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@ classifiers = [
1717
authors = [{ "name" = "The HDF Group", "email" = "[email protected]" }]
1818
keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"]
1919
requires-python = ">=3.9"
20+
version = "1.0.0"
21+
2022
dependencies = [
2123
"h5py >= 3.10",
2224
"numpy >= 2.0; python_version>='3.9'",
2325
"jsonschema >=4.4.0",
2426
"tomli; python_version<'3.11'",
2527
]
2628

27-
dynamic = ["version"]
29+
#dynamic = ["version"]
2830

2931
[project.urls]
3032
Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/"
@@ -44,7 +46,7 @@ dev = ["check-manifest"]
4446
test = ["coverage"]
4547

4648
[build-system]
47-
requires = ["setuptools", "setuptools_scm", "wheel"]
49+
requires = ["setuptools >= 61"]
4850
build-backend = "setuptools.build_meta"
4951

5052
[tool.setuptools]

src/h5json/dset_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ def getDatasetLayout(dset_json):
4040
if "layout" in cp:
4141
layout = cp["layout"]
4242

43+
if layout is None and "layout" in dset_json:
44+
# previous HSDS versions stored layout here
45+
layout = dset_json["layout"]
46+
4347
return layout
4448

4549

src/h5json/h5pystore/h5py_reader.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import numpy as np
1414
import logging
1515
from os import stat as os_stat
16-
import time
1716

1817
from ..objid import createObjId, getCollectionForId
1918
from ..hdf5dtype import getTypeItem, isOpaqueDtype

src/h5json/h5reader.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import logging
1616
import time
17-
import numpy as np
1817

1918
from .objid import createObjId
2019

@@ -158,10 +157,9 @@ def getDatasetValues(self, obj_id, sel=None, dtype=None):
158157
number of elements as the rank of the dataset.
159158
"""
160159

161-
# just return a zero array
162-
arr = np.zeros(sel.shape, dtype=dtype)
160+
# just return None
163161

164-
return arr
162+
return None
165163

166164
def open(self):
167165
""" Open data source for reading """

src/h5json/h5writer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ def open(self):
7878
@abstractmethod
7979
def flush(self):
8080
""" Write dirty items """
81-
pass
81+
# return False since we can't actually persist anything
82+
return False
8283

8384
@abstractmethod
8485
def close(self):

src/h5json/hdf5db.py

Lines changed: 112 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype
1616
from .array_util import jsonToArray, bytesArrayToList
1717
from .dset_util import resize_dataset
18+
from .shape_util import getShapeClass, getShapeDims
1819
from .filters import getFiltersJson
1920
from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId
2021
from . import selections
@@ -24,6 +25,14 @@
2425
from .h5writer import H5Writer, H5NullWriter
2526

2627

28+
def _getDatasetUpdates(dset_json):
29+
""" return a list of value updates for the dataset.
30+
initialize one if not already present. """
31+
if "updates" not in dset_json:
32+
dset_json["updates"] = []
33+
return dset_json["updates"]
34+
35+
2736
class Hdf5db:
2837
"""
2938
This class is used to manage id lookup tables for primary HDF objects (Groups, Datasets,
@@ -109,10 +118,12 @@ def root_id(self):
109118

110119
def is_new(self, obj_id):
111120
""" return true if this is a new object (has not been persisted) """
121+
obj_id = getHashTagForId(obj_id)
112122
return obj_id in self._new_objects
113123

114124
def is_dirty(self, obj_id):
115125
""" return true if this object has been modified """
126+
obj_id = getHashTagForId(obj_id)
116127
if self.is_new(obj_id):
117128
return True
118129
return obj_id in self._dirty_objects
@@ -131,7 +142,7 @@ def deleted_objects(self):
131142

132143
def make_dirty(self, obj_id):
133144
""" Mark the object as dirty and update the lastModified timestamp """
134-
145+
obj_id = getHashTagForId(obj_id)
135146
if obj_id not in self.db:
136147
self.log.error("make dirty called on deleted object")
137148
raise KeyError(f"obj_id: {obj_id} not found")
@@ -236,8 +247,8 @@ def close(self):
236247
""" close reader and writer handles """
237248
self.log.info("Hdf5db __close")
238249

239-
self.flush()
240-
if self.writer:
250+
if self.writer and not isinstance(self.writer, H5NullWriter):
251+
self.flush()
241252
self.writer.close()
242253
if self.reader:
243254
self.reader.close()
@@ -280,13 +291,13 @@ def _checkWriter(self):
280291
def getObjectById(self, obj_id, refresh=False):
281292
""" return object with given id """
282293
self._checkReader()
283-
tag = getHashTagForId(obj_id)
284-
if tag not in self.db or refresh:
294+
obj_id = getHashTagForId(obj_id)
295+
if obj_id not in self.db or refresh:
285296
# load the obj from the reader
286297
self.log.debug(f"getObjectById - fetching {obj_id} from reader")
287298
obj_json = self.reader.getObjectById(obj_id)
288-
self.db[tag] = obj_json
289-
obj_json = self.db[tag]
299+
self.db[obj_id] = obj_json
300+
obj_json = self.db[obj_id]
290301

291302
return obj_json
292303

@@ -299,6 +310,9 @@ def getObjectIdByPath(self, h5path, parent_id=None):
299310

300311
if parent_id is None:
301312
parent_id = self.root_id
313+
else:
314+
parent_id = getHashTagForId(parent_id)
315+
302316
self.log.debug(f"getObjectIdDByPath(h5path: {h5path} parent_id: {parent_id}")
303317

304318
obj_json = self.getObjectById(parent_id)
@@ -359,7 +373,7 @@ def getObjectByPath(self, path):
359373
return obj_json
360374

361375
def getDtype(self, obj_json):
362-
""" Return numpy data type for given object id
376+
""" Return numpy data type for given dataset, datatype, or attribute
363377
"""
364378

365379
if "type" not in obj_json:
@@ -546,81 +560,99 @@ def getDatasetValues(self, dset_id, sel):
546560
If a slices list or tuple is provided, it should have the same
547561
number of elements as the rank of the dataset.
548562
"""
563+
564+
def init_arr(dtype, cpl):
565+
""" create an ndarray with the given shape, dtype and fill_value
566+
(if the latter is found in the creation properties list) """
567+
arr_shape = sel.count if isinstance(sel.count, tuple) else (sel.count, )
568+
arr = np.zeros(arr_shape, dtype=dtype)
569+
if "fillValue" in cpl:
570+
fillValue = cpl["fillValue"]
571+
# TBD: fix for compound types
572+
arr[...] = fillValue
573+
return arr
574+
575+
dset_id = getHashTagForId(dset_id)
549576
self.log.info(f"getDatasetValues dset_id: {dset_id}, sel: {sel}")
550577

551-
self._checkReader()
552578
dset_json = self.getObjectById(dset_id)
553579
shape_json = dset_json["shape"]
554580
if not isinstance(sel, selections.Selection):
555581
raise TypeError("Expected Selection class")
556582

557-
if shape_json["class"] == "H5S_NULL":
558-
return None
559-
560-
if shape_json["class"] == "H5S_SCALAR":
561-
if sel.select_type != selections.H5S_SELECT_ALL:
562-
# TBD: support other selection types
563-
raise ValueError("Only SELECT_ALL selections are supported for scalar datasets")
564-
if sel.shape != ():
565-
raise ValueError("Selection shape does not match dataset shape")
566-
rank = 0
567-
else:
568-
dims = tuple(shape_json["dims"])
569-
if sel.shape != dims:
570-
raise ValueError("Selection shape does not match dataset shape")
571-
rank = len(dims)
572-
573583
dtype = self.getDtype(dset_json)
574584

575585
if "creationProperties" in dset_json:
576586
cpl = dset_json["creationProperties"]
577587
else:
578588
cpl = {}
579589

580-
# determine if we need to make a read request or not
581-
if dset_id in self._new_objects:
590+
updates = _getDatasetUpdates(dset_json)
591+
592+
shape_class = getShapeClass(shape_json)
593+
594+
if shape_class == "H5S_NULL":
595+
# return None for selections on null space
596+
return None
597+
598+
if sel.shape != getShapeDims(shape_json):
599+
raise ValueError("Selection shape does not match dataset shape")
600+
601+
if shape_class == "H5S_SCALAR":
602+
if sel.select_type != selections.H5S_SELECT_ALL:
603+
# TBD: support other selection types
604+
raise ValueError("Only SELECT_ALL selections are supported for scalar datasets")
605+
if sel.shape != ():
606+
raise ValueError("Selection shape does not match dataset shape")
607+
if updates:
608+
# for scalars the update has to be the requested value
609+
(update_sel, arr) = updates[-1]
610+
elif dset_id in self._new_objects:
611+
arr = init_arr(dtype, cpl)
612+
else:
613+
# fetch from the server
614+
arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype)
615+
if arr is None:
616+
raise KeyError(f"Data for dataset {dset_id} not returned")
617+
# done with NULL and SCALAR cases
618+
return arr
619+
620+
# simple dataset
621+
arr = None
622+
fetch = True
623+
624+
# determine if we need to get data from the reader
625+
if isinstance(self._reader, H5NullReader) or dset_id in self._new_objects:
582626
fetch = False
583627
else:
584-
fetch = True
585-
# check against pending updates
586-
if "updates" in dset_json:
587-
updates = dset_json["updates"]
588-
for (update_sel, update_val) in updates:
589-
if selections.contained(sel, update_sel):
590-
fetch = False
591-
break
592-
593-
# send a reader request unless an update already covers the sel area
594-
if fetch:
595-
arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype)
596-
else:
597-
if "fillValue" in cpl:
598-
fillValue = cpl["fillValue"]
599-
# TBD: fix for compound types
600-
arr = np.zeros(sel.mshape, dtype=dtype)
601-
arr[...] = fillValue
602-
else:
603-
arr = np.zeros(sel.mshape, dtype=dtype)
604-
605-
if "updates" in dset_json:
606-
# apply any non-flushed changes that intersect the current selection
607-
updates = dset_json["updates"]
608628
for (update_sel, update_val) in updates:
609629
sel_inter = selections.intersect(sel, update_sel)
610630
if sel_inter.nselect == 0:
611631
continue
612-
# update portion of arr, that intersects update_val
613-
slices = []
614-
for dim in range(rank):
615-
start = sel_inter.start[dim] - sel.start[dim]
616-
stop = start + sel_inter.count[dim]
617-
slices.append(slice(start, stop, 1))
618-
slices = tuple(slices)
619-
# TBD: needs updating to work in the general case!
620-
if slices == ():
621-
arr[slices] = update_val[slices]
622-
else:
623-
arr[slices] = update_val
632+
if selections.contained(sel, update_sel):
633+
# desired selection is wholly contained in this update
634+
# TBD: determine if multiple updates would contain all the
635+
# required elements
636+
fetch = False
637+
break
638+
if fetch:
639+
# get last saved version of the data from the reader
640+
arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype)
641+
else:
642+
# initialize an array with fill value if given
643+
arr = init_arr(dtype, cpl)
644+
645+
# apply any updates that impact this selection
646+
for (update_sel, update_val) in updates:
647+
# get the part of the update that is in common with the requested selection
648+
x_sel = selections.intersect(sel, update_sel)
649+
if x_sel.nselect == 0:
650+
# this update doesn't affect the selection, so ignore
651+
continue
652+
# apply the update to the array to be returned
653+
src_sel = selections.translate(update_sel, x_sel)
654+
tgt_sel = selections.translate(sel, x_sel)
655+
arr[tgt_sel.slices] = update_val[src_sel.slices]
624656

625657
return arr
626658

@@ -641,22 +673,32 @@ def setDatasetValues(self, dset_id, sel, arr):
641673
src_dt = arr.dtype
642674
if src_dt != tgt_dt:
643675
raise TypeError("arr.dtype doesn't match dataset dtype")
644-
645-
if shape_json["class"] == "H5S_NULL":
676+
shape_class = getShapeClass(shape_json)
677+
if shape_class == "H5S_NULL":
646678
raise ValueError("writing to null space dataset not supported")
647-
if shape_json["class"] == "H5S_SCALAR":
679+
if shape_class == "H5S_SCALAR":
648680
if sel.shape != ():
649681
raise ValueError("Selection shape does not match dataset shape")
650682
if len(arr.shape) > 0:
651683
raise TypeError("Expected scalar ndarray for scalar dataset")
652684
else:
653-
dims = tuple(shape_json["dims"])
685+
dims = getShapeDims(shape_json)
654686
if sel.shape != dims:
655687
raise ValueError("Selection shape does not match dataset shape")
656-
if "updates" not in dset_json or sel.select_type == selections.H5S_SELECT_ALL:
688+
updates = _getDatasetUpdates(dset_json)
689+
if sel.select_type == selections.H5S_SELECT_ALL:
657690
# for select all, throw out any existing updates since this will overwrite them
658-
dset_json["updates"] = []
659-
updates = dset_json["updates"]
691+
updates.clear()
692+
arr = arr.copy() # make a copy in case the client updates it later
693+
rank = len(sel.shape)
694+
if len(arr.shape) < rank:
695+
# reshape to keep compatibility with dataset rank
696+
if sel.select_type == selections.H5S_SELECT_ALL:
697+
# this should not result in a dimension reduction
698+
raise ValueError("unexpected selection shape")
699+
if sel.select_type != selections.H5S_SELECT_HYPERSLABS:
700+
raise ValueError("tbd")
701+
arr = arr.reshape(sel.mshape)
660702
updates.append((sel, arr.copy()))
661703
self.make_dirty(dset_id)
662704

0 commit comments

Comments
 (0)