Skip to content

Commit

Permalink
Merge pull request #15 from hdmf-dev/paper
Browse files Browse the repository at this point in the history
  • Loading branch information
rly authored Apr 11, 2024
2 parents 406beb2 + 13949d9 commit 1eaf412
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# HDMF-AI - an HDMF schema and API for AI/ML workflows

![Schema](schema.png)
![Schema](paper/schema.png)
124 changes: 124 additions & 0 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
@inproceedings{tritt2019hdmf,
  title={{HDMF}: Hierarchical Data Modeling Framework for Modern Science Data Standards},
  author={Tritt, Andrew J and R{\"u}bel, Oliver and Dichter, Benjamin and Ly, Ryan and Kang, Donghe and Chang, Edward F and Frank, Loren M and Bouchard, Kristofer},
  booktitle={2019 {IEEE} International Conference on Big Data (Big Data)},
  pages={165--179},
  year={2019},
  organization={IEEE},
  doi={10.1109/BigData47090.2019.9005648}
}

@article{belthangady2019applications,
  title={Applications, Promises, and Pitfalls of Deep Learning for Fluorescence Image Reconstruction},
  author={Belthangady, Chinmay and Royer, Lo{\"\i}c A},
  journal={Nature Methods},
  volume={16},
  number={12},
  pages={1215--1225},
  year={2019},
  publisher={Nature Publishing Group},
  doi={10.1038/s41592-019-0458-z}
}

@article{wilkinson2016fair,
  title={The {FAIR} Guiding Principles for Scientific Data Management and Stewardship},
  author={Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and others},
  journal={Scientific Data},
  volume={3},
  number={1},
  pages={1--9},
  year={2016},
  publisher={Nature Publishing Group},
  doi={10.1038/sdata.2016.18}
}

@article{rubel2022neurodata,
  title={The {Neurodata Without Borders} Ecosystem for Neurophysiological Data Science},
  author={R{\"u}bel, Oliver and Tritt, Andrew and Ly, Ryan and Dichter, Benjamin K and Ghosh, Satrajit and Niu, Lawrence and Baker, Pamela and Soltesz, Ivan and Ng, Lydia and Svoboda, Karel and others},
  journal={{eLife}},
  volume={11},
  pages={e78362},
  year={2022},
  publisher={eLife Sciences Publications Limited},
  doi={10.7554/eLife.78362}
}

@article{huerta2023fair,
  title={{FAIR} for {AI}: An Interdisciplinary and International Community Building Perspective},
  author={Huerta, E. A. and Blaiszik, Ben and Brinson, L. Catherine and Bouchard, Kristofer E and Diaz, Daniel and Doglioni, Caterina and Duarte, Javier M and Emani, Murali and Foster, Ian and Fox, Geoffrey and others},
  journal={Scientific Data},
  volume={10},
  number={1},
  pages={487},
  year={2023},
  publisher={Nature Publishing Group},
  doi={10.1038/s41597-023-02298-6}
}

@article{goble2020fair,
  title={{FAIR} Computational Workflows},
  author={Goble, Carole and Cohen-Boulakia, Sarah and Soiland-Reyes, Stian and Garijo, Daniel and Gil, Yolanda and Crusoe, Michael R and Peters, Kristian and Schober, Daniel},
  journal={Data Intelligence},
  volume={2},
  number={1-2},
  pages={108--121},
  year={2020},
  publisher={MIT Press},
  doi={10.1162/dint_a_00033}
}

@misc{souza2019provenance,
  title={Provenance Data in the Machine Learning Lifecycle in Computational Science and Engineering},
  author={Souza, Renan and Azevedo, Leonardo and Louren{\c{c}}o, V{\'\i}tor and Soares, Elton and Thiago, Raphael and Brand{\~a}o, Rafael and Civitarese, Daniel and Vital Brazil, Emilio and Moreno, Marcio and Valduriez, Patrick and Mattoso, Marta and Cerqueira, Renato and Netto, Marco A. S.},
  year={2019},
  eprint={1910.04223},
  archivePrefix={arXiv},
  primaryClass={cs.DC}
}

@software{hdf5,
  author  = {{The HDF Group}},
  title   = {{Hierarchical Data Format}},
  version = {5},
  url     = {https://github.com/HDFGroup/hdf5}
}

@software{zarr,
  author    = {Miles, Alistair and
               {jakirkham} and
               Bussonnier, M and
               Moore, Josh and
               Papadopoulos Orfanos, Dimitri and
               Bourbeau, James and
               Fulton, Andrew and
               Bennett, Davis and
               Lee, Gregory and
               Verma, Sanket and
               Patel, Zain and
               Abernathey, Ryan and
               Stansby, David and
               Kristensen, Mads R. B. and
               Rocklin, Matthew and
               {AWA BRANDON AWA} and
               Hamman, Joe and
               Chopra, Saransh and
               Sales de Andrade, Elliott and
               Durant, Martin and
               Schut, Vincent and
               {raphael dussin} and
               Nunez-Iglesias, Juan and
               Barnes, Chris and
               Chaudhary, Shivank and
               {shikharsg} and
               {hailiangzhang} and
               Gikunda, Weddy},
  title     = {zarr-developers/zarr-python: v2.17.1},
  version   = {2.17.1},
  year      = {2024},
  publisher = {Zenodo},
  doi       = {10.5281/zenodo.3773449},
  url       = {https://doi.org/10.5281/zenodo.3773449}
}

@software{Tritt_deep-taxon,
  author  = {Tritt, Andrew J},
  title   = {{deep-taxon}},
  license = {BSD-3-Clause-LBNL},
  url     = {https://github.com/exabiome/deep-taxon}
}
63 changes: 63 additions & 0 deletions paper/paper.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
title: 'HDMF-AI: A schema and API for storing the results from AI/ML workflows'
tags:
- artificial intelligence
- machine learning
- data standards
- data management
- data modeling
- deep learning
- scientific data
- scientific machine learning

authors:
- name: Ryan Ly
orcid: 0000-0001-9238-0642
affiliation: 1
corresponding: true
- name: Andrew Tritt
orcid: 0000-0002-1617-449X
affiliation: 2
- name: Marcin Joachimiak
orcid: 0000-0001-8175-045X
affiliation: 3
- name: Kris Bouchard
orcid: 0000-0002-1974-4603
affiliation: "1, 4, 5, 6"
affiliations:
- name: Scientific Data Division, Lawrence Berkeley National Laboratory, USA
index: 1
- name: Applied Mathematics and Computational Research Division, Lawrence Berkeley National Laboratory, USA
index: 2
- name: Biosystems Data Science Department, Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, USA
index: 3
- name: Biological Systems & Engineering Division, Lawrence Berkeley National Laboratory, USA
index: 4
- name: Helen Wills Neuroscience Institute, UC Berkeley, USA
index: 5
- name: Redwood Center for Theoretical Neuroscience, UC Berkeley, USA
index: 6
date: 11 April 2024
bibliography: paper.bib

---

# Summary

Scientists are increasingly using artificial intelligence (AI) methods that learn directly from data to make new discoveries in complex systems across multiple domains. However, the lack of standardized data and models in the scientific community hinders the reproducibility and reusability of these methods and their results [@huerta2023fair; @goble2020fair]. Here, we present `HDMF-AI`, a schema and API for storing the common results of AI algorithms in a standardized way. `HDMF-AI` is designed to be flexible and extensible, allowing users to store a range of AI results. These results can be directly linked to the model training data, which enables greater understanding of how models solved the task and more comprehensive analysis of errors. `HDMF-AI` provides users with a convenient programming interface for reading and writing AI results, with powerful options to optimize storage space and data transfer. By using `HDMF-AI`, scientists can easily make their results available and share them with others, helping to ensure that their work is reproducible and reusable.

# Statement of Need

Modern AI approaches, such as deep learning, are powerful at uncovering subtle structure in complex datasets that are informative for solving a task. These approaches can also discover structures that may be scientifically artefactual [@belthangady2019applications]. For example, there may be relationships between the data acquisition protocols and the collected data, and deep learning could potentially utilize such "nuisance variables" when solving the task. Thus, to trust the results of AI algorithms, we must understand what data features/samples a trained model is utilizing to solve the task, and link that to metadata about those samples to interpret and evaluate the basis of results. Many solutions exist for provenance tracking of AI/ML workflows, e.g., [@souza2019provenance]; however, these solutions are designed for production settings and are difficult to use in exploratory analysis. Although many scientific communities have standardized formats for sharing self-describing data, the AI community has no standard format that connects data and models. The adoption of AI by scientists hinges on making data, models, and workflows FAIR (Findable, Accessible, Interoperable, Reusable) [@wilkinson2016fair] and cross-referenceable to each other to maximize interpretability, reproducibility, and reusability.

`HDMF-AI` is a schema and Python API for storing the common results of AI algorithms in a standardized way within the Hierarchical Data Modeling Framework (HDMF) [@tritt2019hdmf]. `HDMF-AI` is designed to be flexible and extensible, allowing users to store a range of AI and machine learning results and metadata, such as from classification, regression, and clustering. These results are stored in the `ResultsTable` data type, which extends the `DynamicTable` data type within the base HDMF schema. The `DynamicTable` schema supports simple tabular data as well as more complex structures common in scientific data, such as ragged arrays, n-dimensional arrays, and enumerations. The `ResultsTable` schema represents each data sample as a row and includes columns for storing model outputs and information about the AI/ML workflow, such as which data were used for training, validation, and testing. These columns are represented as new data types in the schema to allow extension and composition in other data types (see \autoref{fig:schema}). By extending `DynamicTable`, the `ResultsTable` allows the user to add arbitrary columns, enabling the storage of non-standardized metadata and AI outputs, such as performance metrics, alongside the standardized columns. The `ResultsTable` schema also supports a direct link to data stored in another `DynamicTable`, enabling the user to associate AI results with the original data. This link allows for greater understanding of how models are completing the task and analysis of any associated errors. The schema also supports the storage of model parameters and links to the source code(s) used to train the model, as well as links to publicly available pre-trained models if they were used.

Using the HDMF API, the `ResultsTable` can easily be added to datasets that follow an HDMF-based standard, such as Neurodata Without Borders [@rubel2022neurodata], a popular data standard for neurophysiology, and HDMF-Seq, a format for storing taxonomic and genomic sequence data [@Tritt_deep-taxon]. HDMF provides core functionality that allows `HDMF-AI` users to store AI results using advanced features and options for efficient storage and access, such as chunking, compression, and selective streaming from an S3 bucket. Users can write results to an HDF5 file, a popular file format for scientific data and high-performance computing [@hdf5], or a Zarr store, a new format optimized for cloud computing [@zarr]. By leveraging existing HDMF tools and standards, `HDMF-AI` provides a scalable and extensible framework for storing AI results in an accessible, standardized way that is compatible with other HDMF-based data formats. By enabling standardized co-storage of data and AI results, `HDMF-AI` may enhance the reproducibility and explainability of AI for science.

![UML diagram of the HDMF-AI schema. Data types with orange headers are introduced by HDMF-AI. Data types with blue headers are defined in HDMF. Fields colored in gray are optional.\label{fig:schema}](schema.png)

# Acknowledgements

This work is part of the ENDURABLE project supported by the Advanced Scientific Computing Research (ASCR) program in the U.S. Department of Energy, Office of Science, Office of Biological and Environmental Research (BER) [DE-AC02-05CH11231 to LBNL].

# References
File renamed without changes

0 comments on commit 1eaf412

Please sign in to comment.