from zarr.meta import encode_fill_value, json_dumps
from numcodecs import Zlib
import fsspec
+ import fsspec.utils

lggr = logging.getLogger('h5-to-zarr')
- chunks_meta_key = ".zchunkstore"


def _path_to_prefix(path):
@@ -20,37 +20,7 @@ def _path_to_prefix(path):
    return prefix


- def chunks_info(zarray, chunks_loc):
-     """Store chunks location information for a Zarr array.
-
-     Parameters
-     ----------
-     zarray : zarr.core.Array
-         Zarr array that will use the chunk data.
-     chunks_loc : dict
-         File storage information for the chunks belonging to the Zarr array.
-     """
-     if 'source' not in chunks_loc:
-         raise ValueError('Chunk source information missing')
-     if any([k not in chunks_loc['source'] for k in ('uri', 'array_name')]):
-         raise ValueError(
-             f'{chunks_loc["source"]}: Chunk source information incomplete')
-
-     key = _path_to_prefix(zarray.path) + chunks_meta_key
-     chunks_meta = dict()
-     for k, v in chunks_loc.items():
-         if k != 'source':
-             k = zarray._chunk_key(k)
-             if any([a not in v for a in ('offset', 'size')]):
-                 raise ValueError(
-                     f'{k}: Incomplete chunk location information')
-         chunks_meta[k] = v
-
-     # Store Zarr array chunk location metadata...
-     zarray.store[key] = json_dumps(chunks_meta)
-
-
- class Hdf5ToZarr:
+ class SingleHdf5ToZarr:
    """Translate the content of one HDF5 file into Zarr metadata.

    HDF5 groups become Zarr groups. HDF5 datasets become Zarr arrays. Zarr array
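The deleted `chunks_info` helper stored chunk locations in a `.zchunkstore` JSON document per array, with a separate `source` entry naming the file; the `_translator` hunk further down writes each chunk reference straight into the store instead. A rough before/after sketch of the two layouts (the dataset name and byte ranges are made up for illustration):

    # Before: side-car ".zchunkstore" JSON document with a "source" entry
    old_layout = {
        "source": {"uri": "s3://bucket/file.h5", "array_name": "/data"},
        "data/0.0": {"offset": 4016, "size": 1024},
    }

    # After: one flat store entry per chunk key, already in references form
    new_layout = {
        "data/0.0": ["s3://bucket/file.h5", 4016, 1024],
    }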
@@ -69,10 +39,11 @@ class Hdf5ToZarr:
    """

    def __init__(self, h5f: Union[str, BinaryIO], url: str,
-                  xarray: bool = False):
+                  xarray: bool = False, spec=1):
        # Open HDF5 file in read mode...
        lggr.debug(f'HDF5 file: {h5f}')
        lggr.debug(f'xarray: {xarray}')
+         self.spec = spec
        self._h5f = h5py.File(h5f, mode='r')
        self._xr = xarray

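The new `spec` argument controls what `translate()` returns: any value below 1 yields the raw store mapping, while the default `spec=1` produces the version-1 references document assembled in the next hunk. A minimal sketch of driving it by hand (the import path and URL are assumptions for illustration):

    import fsspec
    from fsspec_reference_maker.hdf import SingleHdf5ToZarr  # assumed import path

    url = "s3://bucket/file.h5"                              # hypothetical file
    with fsspec.open(url, "rb") as f:
        raw_store = SingleHdf5ToZarr(f, url, spec=0).translate()
    # spec=1 (the default) would instead return {"version": 1, ...}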
@@ -85,24 +56,34 @@ def __init__(self, h5f: Union[str, BinaryIO], url: str,
    def translate(self):
        """Translate content of one HDF5 file into Zarr storage format.

+         This method is the main entry point to execute the workflow, and
+         returns a "reference" structure to be used with zarr/fsspec-reference-maker.
+
        No data is copied out of the HDF5 file.
+
+         :returns:
+             dict with references
        """
-         import json
        lggr.debug('Translation begins')
-         self.transfer_attrs(self._h5f, self._zroot)
-         self._h5f.visititems(self.translator)
-         ref = {}
-         for key, value in self.store.items():
-             if key.endswith(".zchunkstore"):
-                 value = json.loads(value)
-                 source = value.pop("source")["uri"]
-                 for k, v in value.items():
-                     ref[k] = (source, v["offset"], v["size"])
-             # else:
-             #     ref[key] = value.decode()
-         return ref
-
-     def transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
+         self._transfer_attrs(self._h5f, self._zroot)
+         self._h5f.visititems(self._translator)
+         if self.spec < 1:
+             return self.store
+         else:
+             for k, v in self.store.copy().items():
+                 if isinstance(v, list):
+                     self.store[k][0] = "{{u}}"
+                 else:
+                     self.store[k] = v.decode()
+             return {
+                 "version": 1,
+                 "templates": {
+                     "u": self._uri
+                 },
+                 "refs": self.store
+             }
+
+     def _transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
                         zobj: Union[zarr.Array, zarr.Group]):
        """Transfer attributes from an HDF5 object to its equivalent Zarr object.

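With the default `spec=1`, the structure returned above can be dumped to JSON and handed to fsspec's reference filesystem, which substitutes the `{{u}}` template when reading. A sketch of the output shape and one way to consume it (keys, offsets, and the target URL are illustrative; the reference filesystem requires a sufficiently recent fsspec):

    import json
    import fsspec

    refs = {
        "version": 1,
        "templates": {"u": "s3://bucket/file.h5"},
        "refs": {
            ".zgroup": '{"zarr_format": 2}',       # decoded metadata stays inline
            "data/0.0": ["{{u}}", 4016, 1024],     # chunk -> (template, offset, size)
        },
    }
    with open("refs.json", "w") as fp:
        json.dump(refs, fp)

    fs = fsspec.filesystem("reference", fo="refs.json",
                           remote_protocol="s3", remote_options={"anon": True})
    mapper = fs.get_mapper("")   # a mapping that zarr.open() can consume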
@@ -138,9 +119,10 @@ def transfer_attrs(self, h5obj: Union[h5py.Dataset, h5py.Group],
                lggr.exception(
                    f'Caught TypeError: {n}@{h5obj.name} = {v} ({type(v)})')

-     def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
+     def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.
        """
+         refs = {}
        if isinstance(h5obj, h5py.Dataset):
            lggr.debug(f'HDF5 dataset: {h5obj.name}')
            if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT:
@@ -159,7 +141,7 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
                compression = None

            # Get storage info of this HDF5 dataset...
-             cinfo = self.storage_info(h5obj)
+             cinfo = self._storage_info(h5obj)
            if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return

@@ -171,7 +153,7 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
                                              compression=compression,
                                              overwrite=True)
            lggr.debug(f'Created Zarr array: {za}')
-             self.transfer_attrs(h5obj, za)
+             self._transfer_attrs(h5obj, za)

            if self._xr:
                # Do this for xarray...
@@ -181,14 +163,13 @@ def translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):

            # Store chunk location metadata...
            if cinfo:
-                 cinfo['source'] = {'uri': self._uri,
-                                    'array_name': h5obj.name}
-                 chunks_info(za, cinfo)
+                 for k, v in cinfo.items():
+                     self.store[za._chunk_key(k)] = [self._uri, v['offset'], v['size']]

        elif isinstance(h5obj, h5py.Group):
            lggr.debug(f'HDF5 group: {h5obj.name}')
            zgrp = self._zroot.create_group(h5obj.name)
-             self.transfer_attrs(h5obj, zgrp)
+             self._transfer_attrs(h5obj, zgrp)

    def _get_array_dims(self, dset):
        """Get a list of dimension scale names attached to input HDF5 dataset.
@@ -223,7 +204,7 @@ def _get_array_dims(self, dset):
                    f'dimension scales attached to dimension #{n}')
        return dims

-     def storage_info(self, dset: h5py.Dataset) -> dict:
+     def _storage_info(self, dset: h5py.Dataset) -> dict:
        """Get storage information of an HDF5 dataset in the HDF5 file.

        Storage information consists of file offset and size (length) for every
@@ -274,16 +255,9 @@ def storage_info(self, dset: h5py.Dataset) -> dict:


def run(url, **storage_options):
-     lggr.setLevel(logging.DEBUG)
-     lggr_handler = logging.StreamHandler()
-     lggr_handler.setFormatter(logging.Formatter(
-         '%(levelname)s:%(name)s:%(funcName)s:%(message)s')
-     )
-     lggr.handlers.clear()
-     lggr.addHandler(lggr_handler)
-
+     fsspec.utils.setup_logging(logger=lggr)
    with fsspec.open(url, **storage_options) as f:
-         h5chunks = Hdf5ToZarr(f, url, xarray=True)
+         h5chunks = SingleHdf5ToZarr(f, url, xarray=True)
        return h5chunks.translate()

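For completeness, one possible end-to-end use of `run`, saving the references for later reuse (the URL and storage options are hypothetical; extra keyword arguments are forwarded to `fsspec.open`):

    import json

    refs = run("s3://bucket/file.h5", mode="rb", anon=True)
    with open("file.json", "w") as out:
        json.dump(refs, out)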