Custom filter support (#159)
* add support for bitshuffle

* bump version
jreadey authored Dec 1, 2023
1 parent f1cf192 commit 38278c8
Showing 4 changed files with 64 additions and 12 deletions.
47 changes: 42 additions & 5 deletions h5pyd/_apps/utillib.py
@@ -28,6 +28,13 @@
MIN_CHUNK_SIZE = 1 * 1024 * 1024
MAC_CHUNK_SIZE = 8 * 1024 * 1024

H5Z_FILTER_MAP = { 32001: "blosclz",
32004: "lz4",
32008: "bitshuffle",
32015: "zstd",
}


# check if hdf5 library version supports chunk iteration
hdf_library_version = h5py.version.hdf5_version_tuple
library_has_chunk_iter = (hdf_library_version >= (1, 14, 0) or (hdf_library_version < (1, 12, 0) and (hdf_library_version >= (1, 10, 10))))
@@ -761,8 +768,8 @@ def create_chunktable(dset, dset_dims, ctx):
chunk_key += str(index[dim] // chunk_dims[dim])
if dim < rank - 1:
chunk_key += "_"
logging.debug(f"adding chunk_key: {chunk_key}")
chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)
logging.debug(f"adding chunk_key: {chunk_key}")
chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)

chunks["class"] = "H5D_CHUNKED_REF"
if not extend:
@@ -1121,6 +1128,7 @@ def create_dataset(dobj, ctx):
# or vlen
pass
else:
logging.debug(f"filter setup for {dobj.name}")
if not ctx["ignorefilters"]:
kwargs["compression"] = dobj.compression
kwargs["compression_opts"] = dobj.compression_opts
@@ -1134,7 +1142,7 @@

# TBD: it would be better if HSDS could let us know what filters
# are supported (like it does with compressors)
# For now, just hard-code fletcher32 and scaleoffset to be ignored
if dobj.fletcher32:
msg = f"fletcher32 filter used by dataset: {dobj.name} is not "
msg += "supported by HSDS, this filter will not be used"
@@ -1144,7 +1152,35 @@
msg = f"scaleoffset filter used by dataset: {dobj.name} is not "
msg += "supported by HSDS, this filter will not be used"
logging.warning(msg)
# kwargs["scaleoffset"] = dobj.scaleoffset

if is_h5py(dobj) and not kwargs.get("compression"):
# apply any custom filters as long as they are supported in HSDS
for filter_id in dobj._filters:
filter_opts = dobj._filters[filter_id]
try:
filter_id = int(filter_id)
except ValueError:
msg = "unrecognized filter id: {filter_id} for {dobj.name}, ignoring"
logging.warning(msg)

if not isinstance(filter_id, int):
continue

if filter_id in H5Z_FILTER_MAP:
filter_name = H5Z_FILTER_MAP[filter_id]
if filter_name == "bitshuffle":
kwargs["shuffle"] = filter_name
logging.info(f"using bitshuffle on {dobj.name}")
else:
# supported non-standard compressor
kwargs["compression"] = filter_name
logging.info(f"using compressor: {filter_name} for {dobj.name}")
kwargs["compression_opts"] = filter_opts
logging.info(f"compression_opts: {filter_opts}")
else:
logging.warning(f"filter id {filter_id} for {dobj.name} not supported")

# kwargs["scaleoffset"] = dobj.scaleoffset
# setting the fillvalue is failing in some cases
# see: https://github.com/HDFGroup/h5pyd/issues/119
# don't set fill value for reference types
@@ -1501,14 +1537,15 @@ def load_file(

logging.info(f"input file: {fin.filename}")
logging.info(f"output file: {fout.filename}")
logging.info(f"dataload: {dataload}")
if dataload != "ingest":
if not dataload:
logging.info("no data load")
elif dataload in ("link", "fastlink"):
if not s3path:
logging.error("s3path expected to be set")
sys.exit(1)
logging.info("using s3path")
logging.info(f"using s3path: {s3path}")
else:
logging.error(f"unexpected dataload value: {dataload}")
sys.exit(1)
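The custom-filter block above runs only for h5py source objects with no standard compressor already set: each registered H5Z filter id is looked up in H5Z_FILTER_MAP, with bitshuffle routed to the shuffle keyword and the remaining codecs to compression/compression_opts. The standalone sketch below mirrors that translation for illustration only; the translate_filters helper and the sample _filters dict (with made-up option tuples) are hypothetical, not part of the commit.

H5Z_FILTER_MAP = {
    32001: "blosclz",
    32004: "lz4",
    32008: "bitshuffle",
    32015: "zstd",
}

def translate_filters(h5_filters):
    """Map h5py ._filters entries (filter id -> opts) to HSDS-style kwargs."""
    kwargs = {}
    for filter_id, filter_opts in h5_filters.items():
        try:
            filter_id = int(filter_id)
        except ValueError:
            continue  # unrecognized filter id (utillib logs a warning here)
        filter_name = H5Z_FILTER_MAP.get(filter_id)
        if filter_name is None:
            continue  # filter not supported by HSDS (also logged in utillib)
        if filter_name == "bitshuffle":
            kwargs["shuffle"] = filter_name
        else:
            kwargs["compression"] = filter_name
            kwargs["compression_opts"] = filter_opts
    return kwargs

# hypothetical dataset using bitshuffle (32008) plus zstd (32015):
print(translate_filters({"32008": (0, 4), "32015": (3,)}))
# -> {'shuffle': 'bitshuffle', 'compression': 'zstd', 'compression_opts': (3,)}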
19 changes: 17 additions & 2 deletions h5pyd/_hl/filters.py
@@ -177,8 +177,23 @@ def rq_tuple(tpl, name):
filters.append(filter_scaleoffset)

if shuffle:
filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
filter_shuffle["id"] = 2
if isinstance(shuffle, int) and shuffle == 32008:
bitshuffle = True
elif isinstance(shuffle, str) and shuffle == "bitshuffle":
bitshuffle = True
else:
bitshuffle = False

if bitshuffle:
filter_shuffle = {"class": "H5Z_FILTER_BITSHUFFLE"}
filter_shuffle["id"] = 32008
filter_shuffle["name"] = "bitshuffle"

else:
# regular shuffle filter
filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
filter_shuffle["id"] = 2
filter_shuffle["name"] = "shuffle"
filters.append(filter_shuffle)

if compression == "gzip":
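With this filters.py change, the shuffle argument accepts the string "bitshuffle" or the H5Z id 32008 in addition to the usual boolean, producing an H5Z_FILTER_BITSHUFFLE entry (id 32008) in place of the byte-level shuffle (id 2). A minimal client-side sketch, assuming a running HSDS endpoint and a writable domain (the path below is made up):

import numpy as np
import h5pyd

with h5pyd.File("/home/test/bitshuffle_demo.h5", "w") as f:
    data = np.arange(1024, dtype="int32")
    # shuffle="bitshuffle" (or 32008) selects H5Z_FILTER_BITSHUFFLE;
    # shuffle=True still selects the byte-level H5Z_FILTER_SHUFFLE (id 2)
    dset = f.create_dataset("data", data=data, shuffle="bitshuffle",
                            compression="lz4")

Bitshuffle reorders bits rather than compressing on its own, so pairing it with a fast codec such as lz4 or zstd is the usual pattern.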
6 changes: 3 additions & 3 deletions h5pyd/version.py
@@ -16,7 +16,7 @@
import sys
import numpy

version = "0.16.0"
version = "0.17.0"

hdf5_version = "REST"

@@ -28,8 +28,8 @@
else ("",)
)

api_version_tuple = (0, 16, 0)
api_version = "0.16.0"
api_version_tuple = (0, 17, 0)
api_version = "0.17.0"

__doc__ = """\
This is h5pyd **%s**
4 changes: 2 additions & 2 deletions setup.py
@@ -23,7 +23,7 @@

setup(
name="h5pyd",
version="0.16.0",
version="0.17.0",
description="h5py compatible client lib for HDF REST API",
long_description=long_description,
url="http://github.com/HDFGroup/h5pyd",
@@ -70,4 +70,4 @@
"hsstat = h5pyd._apps.hsstat:main",
]
},
)
)
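Taken together, these changes let hsload (which drives utillib.load_file) link a file that uses the supported custom filters into HSDS without recompressing. A rough sketch of the programmatic equivalent; the keyword arguments are inferred from the diff above, and all file paths and the S3 URI are hypothetical:

import h5py
import h5pyd
from h5pyd._apps.utillib import load_file

fin = h5py.File("data_zstd.h5", "r")  # source file with zstd/bitshuffle filters
fout = h5pyd.File("/home/test/data_zstd.h5", "w")
# "link" mode stores chunk references instead of copying the data,
# so s3path must point at where the original file lives
load_file(fin, fout, dataload="link", s3path="s3://mybucket/data_zstd.h5")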
