
Commit 1d4f27e

Martin Durant committed
Spec v1 for Ike

1 parent cf17832 · commit 1d4f27e

File tree: 1 file changed (+39, -65 lines)


fsspec_reference_maker/hdf.py

Lines changed: 39 additions & 65 deletions
@@ -6,9 +6,9 @@
 from zarr.meta import encode_fill_value, json_dumps
 from numcodecs import Zlib
 import fsspec
+import fsspec.utils
 
 lggr = logging.getLogger('h5-to-zarr')
-chunks_meta_key = ".zchunkstore"
 
 
 def _path_to_prefix(path):
@@ -20,37 +20,7 @@ def _path_to_prefix(path):
     return prefix
 
 
-def chunks_info(zarray, chunks_loc):
-    """Store chunks location information for a Zarr array.
-
-    Parameters
-    ----------
-    zarray : zarr.core.Array
-        Zarr array that will use the chunk data.
-    chunks_loc : dict
-        File storage information for the chunks belonging to the Zarr array.
-    """
-    if 'source' not in chunks_loc:
-        raise ValueError('Chunk source information missing')
-    if any([k not in chunks_loc['source'] for k in ('uri', 'array_name')]):
-        raise ValueError(
-            f'{chunks_loc["source"]}: Chunk source information incomplete')
-
-    key = _path_to_prefix(zarray.path) + chunks_meta_key
-    chunks_meta = dict()
-    for k, v in chunks_loc.items():
-        if k != 'source':
-            k = zarray._chunk_key(k)
-            if any([a not in v for a in ('offset', 'size')]):
-                raise ValueError(
-                    f'{k}: Incomplete chunk location information')
-            chunks_meta[k] = v
-
-    # Store Zarr array chunk location metadata...
-    zarray.store[key] = json_dumps(chunks_meta)
-
-
-class Hdf5ToZarr:
+class SingleHdf5ToZarr:
     """Translate the content of one HDF5 file into Zarr metadata.
 
     HDF5 groups become Zarr groups. HDF5 datasets become Zarr arrays. Zarr array
@@ -69,10 +39,11 @@ class Hdf5ToZarr:
     """
 
     def __init__(self, h5f: Union[str, BinaryIO], url: str,
-                 xarray: bool = False):
+                 xarray: bool = False, spec=1):
         # Open HDF5 file in read mode...
         lggr.debug(f'HDF5 file: {h5f}')
         lggr.debug(f'xarray: {xarray}')
+        self.spec = spec
         self._h5f = h5py.File(h5f, mode='r')
         self._xr = xarray
 
@@ -85,24 +56,34 @@ def __init__(self, h5f: Union[str, BinaryIO], url: str,
     def translate(self):
         """Translate content of one HDF5 file into Zarr storage format.
 
+        This method is the main entry point to execute the workflow, and
+        returns a "reference" structure to be used with zarr/fsspec-reference-maker
+
         No data is copied out of the HDF5 file.
+
+        :returns
+        dict with references
         """
-        import json
         lggr.debug('Translation begins')
-        self.transfer_attrs(self._h5f, self._zroot)
-        self._h5f.visititems(self.translator)
-        ref = {}
-        for key, value in self.store.items():
-            if key.endswith(".zchunkstore"):
-                value = json.loads(value)
-                source = value.pop("source")["uri"]
-                for k, v in value.items():
-                    ref[k] = (source, v["offset"], v["size"])
-            # else:
-            #     ref[key] = value.decode()
-        return ref
-
-    def transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
+        self._transfer_attrs(self._h5f, self._zroot)
+        self._h5f.visititems(self._translator)
+        if self.spec < 1:
+            return self.store
+        else:
+            for k, v in self.store.copy().items():
+                if isinstance(v, list):
+                    self.store[k][0] = "{{u}}"
+                else:
+                    self.store[k] = v.decode()
+            return {
+                "version": 1,
+                "templates": {
+                    "u": self._uri
+                },
+                "refs": self.store
+            }
+
+    def _transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
                         zobj: Union[zarr.Array, zarr.Group]):
         """Transfer attributes from an HDF5 object to its equivalent Zarr object.
 
@@ -138,9 +119,10 @@ def transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
                 lggr.exception(
                     f'Caught TypeError: {n}@{h5obj.name} = {v} ({type(v)})')
 
-    def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
+    def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.
        """
+        refs = {}
         if isinstance(h5obj, h5py.Dataset):
             lggr.debug(f'HDF5 dataset: {h5obj.name}')
             if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT:
@@ -159,7 +141,7 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
                 compression = None
 
             # Get storage info of this HDF5 dataset...
-            cinfo = self.storage_info(h5obj)
+            cinfo = self._storage_info(h5obj)
             if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                 return
 
@@ -171,7 +153,7 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
                                             compression=compression,
                                             overwrite=True)
             lggr.debug(f'Created Zarr array: {za}')
-            self.transfer_attrs(h5obj, za)
+            self._transfer_attrs(h5obj, za)
 
             if self._xr:
                 # Do this for xarray...
@@ -181,14 +163,13 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
 
             # Store chunk location metadata...
             if cinfo:
-                cinfo['source'] = {'uri': self._uri,
-                                   'array_name': h5obj.name}
-                chunks_info(za, cinfo)
+                for k, v in cinfo.items():
+                    self.store[za._chunk_key(k)] = [self._uri, v['offset'], v['size']]
 
         elif isinstance(h5obj, h5py.Group):
             lggr.debug(f'HDF5 group: {h5obj.name}')
             zgrp = self._zroot.create_group(h5obj.name)
-            self.transfer_attrs(h5obj, zgrp)
+            self._transfer_attrs(h5obj, zgrp)
 
     def _get_array_dims(self, dset):
         """Get a list of dimension scale names attached to input HDF5 dataset.
@@ -223,7 +204,7 @@ def _get_array_dims(self, dset):
                                 f'dimension scales attached to dimension #{n}')
         return dims
 
-    def storage_info(self, dset: h5py.Dataset) -> dict:
+    def _storage_info(self, dset: h5py.Dataset) -> dict:
         """Get storage information of an HDF5 dataset in the HDF5 file.
 
         Storage information consists of file offset and size (length) for every
@@ -274,16 +255,9 @@ def storage_info(self, dset: h5py.Dataset) -> dict:
 
 
 def run(url, **storage_options):
-    lggr.setLevel(logging.DEBUG)
-    lggr_handler = logging.StreamHandler()
-    lggr_handler.setFormatter(logging.Formatter(
-        '%(levelname)s:%(name)s:%(funcName)s:%(message)s')
-    )
-    lggr.handlers.clear()
-    lggr.addHandler(lggr_handler)
-
+    fsspec.utils.setup_logging(logger=lggr)
     with fsspec.open(url, **storage_options) as f:
-        h5chunks = Hdf5ToZarr(f, url, xarray=True)
+        h5chunks = SingleHdf5ToZarr(f, url, xarray=True)
         return h5chunks.translate()
 
 
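Editor's note: for orientation, a minimal sketch of how the new spec-1 output might be produced. The file name "example.h5" is hypothetical; SingleHdf5ToZarr and run() come from the diff above, and the reference-set shape shown in the comments follows the structure returned by translate().

    # A minimal sketch, assuming an HDF5/NetCDF4 file at "example.h5"
    # (hypothetical path); any fsspec-openable URL should work the same way.
    import fsspec
    from fsspec_reference_maker.hdf import SingleHdf5ToZarr

    url = "example.h5"
    with fsspec.open(url) as f:
        refs = SingleHdf5ToZarr(f, url, xarray=True, spec=1).translate()

    # With spec=1 (the new default) the result is a version-1 reference set,
    # roughly of this shape (offsets/sizes below are illustrative only);
    # with spec=0 the raw store dict is returned instead:
    # {
    #     "version": 1,
    #     "templates": {"u": "example.h5"},
    #     "refs": {
    #         ".zgroup": "{...}",                  # JSON metadata, as str
    #         "data/0.0": ["{{u}}", 31400, 4096],  # [url template, offset, size]
    #     }
    # }

The intended consumer of such a dict is fsspec's ReferenceFileSystem, which maps each key to a byte range of the target file.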
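For readers of the format, a rough illustration (not fsspec's actual implementation) of what resolving the "{{u}}" placeholder against the "templates" section means; resolve_templates is a hypothetical helper using plain string substitution:

    # Hypothetical helper: expands "{{name}}" placeholders in a version-1
    # reference set using simple string substitution.
    def resolve_templates(ref_set: dict) -> dict:
        templates = ref_set.get("templates", {})
        out = {}
        for key, value in ref_set["refs"].items():
            if isinstance(value, list):  # [url template, offset, size]
                url, offset, size = value
                for name, target in templates.items():
                    url = url.replace("{{%s}}" % name, target)
                out[key] = [url, offset, size]
            else:                        # inline metadata string stays as-is
                out[key] = value
        return out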