From 3a2007bf852e781793305cb8a9b68becc3b37a29 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 20 Mar 2024 13:32:02 -0400 Subject: [PATCH] Initial Parquet File/Dataset abstraction (#501) Prep work for being able to fetch metadata from a bunch of Parquet files in a folder and load a spatial filter ### Change list - New `ParquetFile` and `ParquetDataset` Rust structs to read from one or multiple Parquet files. This is generic over `AsyncFileReader`, which primarily works with object store. - Added initial metadata handling, to e.g. read the bounding box of the file. - Added Python bindings to each class. This uses ObjectStore. - Added initial JS bindings to each class. This uses a custom implementation of `AsyncFileReader` [vendored from parquet-wasm](https://github.com/kylebarron/parquet-wasm/blob/0ecad6e1d87a377051087122f7ee2db52a2d6533/src/reader_async.rs). I couldn't get the ObjectStore integration working just yet. Though object_store_wasm_s3 was updated and that might be an interesting thing to check out. 
--- Cargo.toml | 5 +- js/Cargo.lock | 1388 ++++++++++++++--- js/Cargo.toml | 31 +- js/package.json | 2 +- js/src/io/mod.rs | 4 + js/src/io/object_store.rs | 95 ++ js/src/io/object_store_s3/mod.rs | 2 + js/src/io/parquet/async.rs | 92 ++ js/src/io/parquet/async_file_reader/fetch.rs | 95 ++ js/src/io/parquet/async_file_reader/mod.rs | 573 +++++++ js/src/io/parquet/mod.rs | 9 + js/src/io/{parquet.rs => parquet/sync.rs} | 4 +- js/src/table.rs | 12 + python/core/Cargo.lock | 4 + python/core/Cargo.toml | 4 +- .../core/python/geoarrow/rust/core/_rust.pyi | 26 +- python/core/src/io/parquet.rs | 210 ++- python/core/src/lib.rs | 2 + src/array/metadata.rs | 7 +- src/io/geozero/table/builder/table.rs | 2 +- src/io/parquet/metadata.rs | 63 +- src/io/parquet/mod.rs | 4 +- src/io/parquet/reader.rs | 9 + src/io/parquet/reader_async.rs | 247 ++- src/io/parquet/test.rs | 19 + src/io/parquet/writer.rs | 8 +- 26 files changed, 2632 insertions(+), 285 deletions(-) create mode 100644 js/src/io/object_store.rs create mode 100644 js/src/io/object_store_s3/mod.rs create mode 100644 js/src/io/parquet/async.rs create mode 100644 js/src/io/parquet/async_file_reader/fetch.rs create mode 100644 js/src/io/parquet/async_file_reader/mod.rs create mode 100644 js/src/io/parquet/mod.rs rename js/src/io/{parquet.rs => parquet/sync.rs} (92%) create mode 100644 src/io/parquet/test.rs diff --git a/Cargo.toml b/Cargo.toml index f879e38dd..f326d9fbe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,7 +51,7 @@ async-stream = { version = "0.3", optional = true } async-trait = { version = "0.1", optional = true } bumpalo = { version = "3", features = ["collections"] } byteorder = "1" -bytes = { version = "*", optional = true } +bytes = { version = "1.5.0", optional = true } chrono = "0.4" # Set default-features = false because async not working in wasm right now flatgeobuf = { version = "4.1.0", optional = true, default-features = false } @@ -79,7 +79,7 @@ rayon = { version = "1.8.0", optional = true } # Note: 
geo has a hard dependency on rstar, so there's no point in feature flagging it rstar = "0.12" serde = { version = "1", features = ["derive"] } -serde_json = { version = "1" } +serde_json = "1" sqlx = { version = "0.7", optional = true, default-features = false, features = [ "chrono", "json", @@ -92,6 +92,7 @@ thiserror = "1" [dev-dependencies] approx = "0.5.1" +bytes = "1.5.0" criterion = { version = "0.5", features = ["html_reports"] } gdal = { version = "0.16", features = ["bindgen"] } geozero = { version = "0.12", features = ["with-wkb"] } diff --git a/js/Cargo.lock b/js/Cargo.lock index 7e842a738..0f27948d5 100644 --- a/js/Cargo.lock +++ b/js/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -10,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.6" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -24,9 +33,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -69,9 +78,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.5" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", @@ -83,9 +92,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.4" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" @@ -117,9 +126,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.76" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59d2a3357dde987206219e78ecfbbb6e8dad06cbb65292758d3270e6254f7355" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "approx" @@ -205,7 +214,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64", + "base64 0.22.0", "chrono", "half", "lexical-core", @@ -314,7 +323,7 @@ version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.5.0", ] [[package]] @@ -363,6 +372,39 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "async-trait" +version = "0.1.78" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "461abc97219de0eaaf81fe3ef974a540158f3d079c2ab200f891f1a2ef201e85" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + [[package]] name = "atoi" version = "2.0.0" @@ -373,19 +415,31 @@ dependencies = [ ] [[package]] -name = "atomic-polyfill" -version = "1.0.3" +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ - "critical-section", + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", ] [[package]] -name = "autocfg" -version = "1.1.0" +name = "base64" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" @@ -401,15 +455,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "brotli" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391" dependencies = [ "alloc-no-stdlib", 
"alloc-stdlib", @@ -428,9 +482,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" [[package]] name = "byteorder" @@ -446,9 +500,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" dependencies = [ "jobserver", "libc", @@ -471,14 +525,14 @@ dependencies = [ "js-sys", "num-traits", "wasm-bindgen", - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] name = "clap" -version = "4.4.11" +version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfaff671f6b22ca62406885ece523383b9b64022e341e53e009a62ebc47a45f2" +checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" dependencies = [ "clap_builder", "clap_derive", @@ -486,9 +540,9 @@ dependencies = [ [[package]] name = "clap-verbosity-flag" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c90e95e5bd4e8ac34fa6f37c774b0c6f8ed06ea90c79931fd448fcf941a9767" +checksum = "bb9b20c0dd58e4c2e991c8d203bbeb76c11304d1011659686b5b644bc29aa478" dependencies = [ "clap", "log", @@ -496,9 +550,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a216b506622bb1d316cd51328dce24e07bdff4a6128a47c7e7fad11878d5adbb" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ 
"anstream", "anstyle", @@ -508,21 +562,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.4.7" +version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] name = "clap_lex" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" @@ -542,9 +596,9 @@ dependencies = [ [[package]] name = "const-random" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" dependencies = [ "const-random-macro", ] @@ -560,6 +614,16 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -568,19 +632,13 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] -[[package]] -name = "critical-section" -version = 
"1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" - [[package]] name = "crunchy" version = "0.2.2" @@ -628,6 +686,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "earcutr" version = "0.4.3" @@ -640,15 +704,24 @@ dependencies = [ [[package]] name = "either" -version = "1.9.0" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "encoding_rs" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] [[package]] name = "env_logger" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" dependencies = [ "humantime", "is-terminal", @@ -731,6 +804,125 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ 
+ "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + [[package]] name = "geo" version = "0.28.0" @@ -744,7 +936,7 @@ dependencies = [ "log", "num-traits", "robust", - "rstar 0.12.0", + "rstar", "spade", ] @@ -756,7 +948,7 @@ checksum = "9ff16065e5720f376fbced200a5ae0f47ace85fd70b7e54269790281353b6d61" dependencies = [ "approx", "num-traits", - "rstar 0.12.0", + "rstar", "serde", ] @@ -776,15 +968,16 @@ dependencies = [ "byteorder", "chrono", "flatgeobuf", + "futures", "geo", "geodesy", "geozero", "indexmap", - "itertools 0.12.0", + "itertools 0.12.1", "num_enum", "parquet", "phf", - "rstar 0.11.0", + "rstar", "serde", "serde_json", "thiserror", @@ -797,15 +990,24 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-wasm", + "async-stream", + "async-trait", "bytes", "console_error_panic_hook", + "futures", "geo", "geoarrow", "geodesy", + "object_store", "parquet", + "range-reader", + "reqwest", "thiserror", + "tokio", "wasm-bindgen", + "wasm-bindgen-futures", "wasm-bindgen-test", + "wasm-streams", "web-sys", "zstd", ] @@ -830,11 +1032,11 @@ dependencies = [ [[package]] name = "geographiclib-rs" -version = "0.2.3" +version = "0.2.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea804e7bd3c6a4ca6a01edfa35231557a8a81d4d3f3e1e2b650d028c42592be" +checksum = "e6e5ed84f8089c70234b0a8e0aedb6dc733671612ddc0d37c6066052f9781960" dependencies = [ - "lazy_static", + "libm", ] [[package]] @@ -866,9 +1068,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "js-sys", @@ -878,23 +1080,39 @@ dependencies = [ ] [[package]] -name = "half" -version = "2.3.1" +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + +[[package]] +name = "h2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +checksum = "51ee2dd2e4f378392eeff5d51618cd9a63166a2513846bbc55f21cfacd9199d4" dependencies = [ - "cfg-if", - "crunchy", - "num-traits", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", ] [[package]] -name = "hash32" -version = "0.2.1" +name = "half" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" dependencies = [ - "byteorder", + "cfg-if", + "crunchy", + "num-traits", ] [[package]] @@ -916,26 +1134,13 @@ dependencies = [ "allocator-api2", ] -[[package]] -name = "heapless" -version = "0.7.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" -dependencies = [ - "atomic-polyfill", - "hash32 0.2.1", - "rustc_version", - "spin", - "stable_deref_trait", -] - [[package]] name = "heapless" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" dependencies = [ - "hash32 0.3.1", + "hash32", "stable_deref_trait", ] @@ -945,11 +1150,57 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" -version = "0.3.3" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "humantime" @@ -957,11 +1208,67 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", +] + [[package]] name = "iana-time-zone" -version = "0.1.58" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8326b86b6cff230b97d0d312a6c40a60726df3332e721f72a1b035f451663b20" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -980,11 +1287,21 @@ dependencies = [ "cc", ] +[[package]] +name = "idna" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indexmap" -version = "2.1.0" +version = "2.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" dependencies = [ "equivalent", "hashbrown", @@ -996,15 +1313,21 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "rustix", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -1018,9 +1341,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] @@ -1033,18 +1356,18 @@ checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.66" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -1121,9 +1444,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.151" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libm" @@ -1137,16 +1460,16 @@ version = "0.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85c833ca1e66078851dba29046874e38f08b2c883700aa29a03ddd3b23814ee8" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.5.0", "libc", "redox_syscall", ] [[package]] name = "linux-raw-sys" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -1160,15 +1483,15 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "lz4_flex" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8" 
+checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" dependencies = [ "twox-hash", ] @@ -1179,15 +1502,50 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "num" version = "0.4.1" @@ -1215,28 +1573,27 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" dependencies = [ "num-traits", ] [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-iter" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ "autocfg", "num-integer", @@ -1257,9 +1614,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", "libm", @@ -1267,31 +1624,105 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683751d591e6d81200c39fb0d1032608b77724f34114db54f571ff1317b337c0" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c11e44798ad209ccdd91fc192f0526a369a01234f7373e1b141c96d7cee4f0e" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] -name = "once_cell" -version = "1.19.0" +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + 
+[[package]] +name = "object_store" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8718f8b65fdf67a45108d1548347d4af7d71fb81ce727bbf9e3b2535e079db3" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.12.1", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl" +version = "0.10.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +dependencies = [ + "bitflags 2.5.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "ordered-float" version = "2.10.1" @@ -1301,6 +1732,29 @@ dependencies = [ "num-traits", ] +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" 
+dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + [[package]] name = "parquet" version = "51.0.0" @@ -1315,11 +1769,12 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64", + "base64 0.22.0", "brotli", "bytes", "chrono", "flate2", + "futures", "half", "hashbrown", "lz4_flex", @@ -1329,6 +1784,7 @@ dependencies = [ "seq-macro", "snap", "thrift", + "tokio", "twox-hash", "zstd", ] @@ -1339,6 +1795,12 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "phf" version = "0.11.2" @@ -1369,7 +1831,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] @@ -1381,36 +1843,67 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "proc-macro-crate" -version = "2.0.1" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97dc5fea232fc28d2f597b37c4876b348a40e33f3b02cc975c8d006d78d94b1a" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" dependencies = [ - "toml_datetime", "toml_edit", ] [[package]] name = "proc-macro2" -version = "1.0.71" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -1430,6 +1923,15 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +[[package]] +name = "range-reader" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f531f927c2953e31880daaf95178ca85128a55067ee115f12b39cd4f8f1cfebb" +dependencies = [ + "futures", +] + 
[[package]] name = "redox_syscall" version = "0.4.1" @@ -1452,9 +1954,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1464,9 +1966,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", @@ -1479,6 +1981,48 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "reqwest" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58b48d98d932f4ee75e541614d32a7f44c889b72bd9c2e04d95edd135989df88" +dependencies = [ + "base64 0.21.7", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-tls", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + [[package]] name = "robust" version = "1.1.0" @@ -1487,25 +2031,20 @@ checksum = "cbf4a6aa5f6d6888f39e980649f3ad6b666acdce1d78e95b8a2cb076e687ae30" [[package]] name = "rstar" -version = "0.11.0" +version = "0.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "73111312eb7a2287d229f06c00ff35b51ddee180f017ab6dec1f69d62ac098d6" +checksum = "133315eb94c7b1e8d0cb097e5a710d850263372fd028fff18969de708afc7008" dependencies = [ - "heapless 0.7.17", + "heapless", "num-traits", "smallvec", ] [[package]] -name = "rstar" -version = "0.12.0" +name = "rustc-demangle" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "133315eb94c7b1e8d0cb097e5a710d850263372fd028fff18969de708afc7008" -dependencies = [ - "heapless 0.8.0", - "num-traits", - "smallvec", -] +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustc_version" @@ -1518,22 +2057,49 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.5.0", "errno", "libc", "linux-raw-sys", "windows-sys 0.52.0", ] +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64 0.21.7", +] + [[package]] name = "ryu" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] [[package]] name = "scoped-tls" @@ -1553,11 +2119,34 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" -version = "1.0.20" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836fa6a3e1e547f9a2c4040802ec865b5d85f4014efe00555d7090a3dcaa1090" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" [[package]] name = "seq-macro" @@ -1567,18 +2156,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.193" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" dependencies = [ "serde_derive", ] [[package]] name = "serde-wasm-bindgen" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9b713f70513ae1f8d92665bbbbda5c295c2cf1da5542881ae5eefe20c9af132" +checksum = 
"8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b" dependencies = [ "js-sys", "serde", @@ -1587,37 +2176,80 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.193" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] name = "serde_json" -version = "1.0.108" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "siphasher" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" -version = "1.11.2" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] [[package]] name = "snap" @@ -1626,24 +2258,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] -name = "spade" -version = "2.5.0" +name = "socket2" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "874799535a5ac50b123ec82e2e8015340fbebb6f71dc009ec0d4a6601fde16a4" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ - "hashbrown", - "num-traits", - "robust", - "smallvec", + "libc", + "windows-sys 0.52.0", ] [[package]] -name = "spin" -version = "0.9.8" +name = "spade" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +checksum = "61addf9117b11d1f5b4bf6fe94242ba25f59d2d4b2080544b771bd647024fd00" dependencies = [ - "lock_api", + "hashbrown", + "num-traits", + "robust", + "smallvec", ] [[package]] @@ -1660,61 +2293,98 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" + +[[package]] +name = "syn" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = 
"72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] [[package]] name = "syn" -version = "2.0.43" +version = "2.0.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" +checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", "rustix", "windows-sys 0.52.0", ] [[package]] name = "termcolor" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" dependencies = [ "winapi-util", ] [[package]] name = "thiserror" 
-version = "1.0.52" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a48fd946b02c0a526b2e9481c8e2a17755e47039164a86c4070446e3a4614d" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.52" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7fbe9b594d6568a6a1443250a7e67d80b74e1e96f6d1715e1e21cc1888291d3" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] @@ -1737,23 +2407,155 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies 
= [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" [[package]] name = "toml_edit" -version = "0.20.2" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "396e4d48bbb2b7554c944bde63101b5ae446cff6ec4a24227428f15eb72ef338" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" dependencies = [ "indexmap", "toml_datetime", "winnow", ] +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", 
+ "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.53", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "twox-hash" version = "1.6.3" @@ -1764,12 +2566,38 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "utf8parse" version = "0.2.1" @@ -1778,20 +2606,45 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.8.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" dependencies = [ "getrandom", "wasm-bindgen", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1800,9 +2653,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.89" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "serde", @@ -1812,24 +2665,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.89" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" 
dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.53", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.39" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac36a15a220124ac510204aec1c3e5db8a22ab06fd6706d881dc6149f8ed9a12" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -1839,9 +2692,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.89" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1849,28 +2702,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.89" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.89" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-bindgen-test" -version = "0.3.39" +version = "0.3.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf9242c0d27999b831eae4767b2a146feb0b27d332d553e605864acd2afd403" +checksum = "d9bf62a58e0780af3e852044583deee40983e5886da43a271dd772379987667b" dependencies = [ "console_error_panic_hook", "js-sys", @@ 
-1882,20 +2735,33 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.39" +version = "0.3.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "794645f5408c9a039fd09f4d113cdfb2e7eba5ff1956b07bcf701cf4b394fe89" +checksum = "b7f89739351a2e03cb94beb799d47fb2cac01759b40ec441f7de39b00cbf7ef0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", +] + +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", ] [[package]] name = "web-sys" -version = "0.3.66" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -1934,11 +2800,11 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-core" -version = "0.51.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1f8cf84f35d2db49a46868f947758c7a1138116f7fac3bc844f43ade1292e64" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.5", + "windows-targets 0.52.4", ] [[package]] @@ -1956,7 +2822,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.4", ] [[package]] @@ -1976,17 +2842,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1997,9 +2863,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" @@ -2009,9 +2875,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" @@ -2021,9 +2887,9 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" @@ -2033,9 +2899,9 @@ 
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" @@ -2045,9 +2911,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" @@ -2057,9 +2923,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" @@ -2069,19 +2935,29 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" [[package]] name = "winnow" -version = "0.5.30" +version = "0.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b5c3db89721d50d0e2a673f5043fc4722f76dcc352d7b1ab8b8288bed4ed2c5" +checksum = 
"f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "wkt" version = "0.10.3" @@ -2111,7 +2987,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] diff --git a/js/Cargo.toml b/js/Cargo.toml index 0259d9dfc..ebed27921 100644 --- a/js/Cargo.toml +++ b/js/Cargo.toml @@ -26,9 +26,27 @@ data = [] geodesy = ["dep:geodesy", "geoarrow/geodesy"] debug = ["console_error_panic_hook"] -io_geojson = ["geoarrow/geozero", "table"] io_flatgeobuf = ["geoarrow/flatgeobuf", "table"] +io_geojson = ["geoarrow/geozero", "table"] +io_http = [] +io_object_store = [ + "dep:async-trait", + "dep:futures", + "dep:object_store", + "dep:reqwest", + "dep:tokio", +] io_parquet = ["geoarrow/parquet", "table", "dep:bytes", "dep:parquet"] +io_parquet_async = [ + "async-stream", + "geoarrow/parquet_async", + "io_http", + # We don't currently use object_store in Parquet + # "io_object_store", + "io_parquet", + "range-reader", + "wasm-streams", +] io_parquet_compressions = [ "io_parquet", "io_parquet_brotli", @@ -54,6 +72,7 @@ vector = [] [dependencies] wasm-bindgen = "0.2.63" +wasm-bindgen-futures = "0.4.42" # The `console_error_panic_hook` crate provides better debugging of panics by # logging them with `console.error`. 
This is great for development, but requires @@ -62,14 +81,22 @@ wasm-bindgen = "0.2.63" arrow-array = "51" arrow-buffer = "51" arrow-wasm = { git = "https://github.com/kylebarron/arrow-wasm", rev = "6ea4db31391ab2a402d8db30e7945a02f8d20a40" } +async-stream = { version = "0.3.5", optional = true } +async-trait = { version = "0.1.77", optional = true } bytes = { version = "1", optional = true } console_error_panic_hook = { version = "0.1.6", optional = true } - +futures = { version = "0.3.30", optional = true } geo = "0.28" geoarrow = { path = "../" } geodesy = { version = "0.12", optional = true, features = ["js"] } +object_store = { version = "*", optional = true } parquet = { version = "51", optional = true, features = ["arrow", "base64"] } +range-reader = { version = "0.2", optional = true } +reqwest = { version = "*", optional = true } thiserror = "1" +tokio = { version = "*", default-features = false, optional = true } +wasm-streams = { version = "0.3.0", optional = true } + # Pass "wasm" and "thin" down to the transitive zstd dependency zstd = { version = "*", features = [ diff --git a/js/package.json b/js/package.json index 027093d22..6c756640c 100644 --- a/js/package.json +++ b/js/package.json @@ -2,7 +2,7 @@ "scripts": { "build": "bash ./scripts/build.sh", "build:test": "ENV='DEV' FEATURES='--all-features' yarn build", - "build:geoparquet": "FEATURES='--no-default-features --features debug --features io_parquet --features io_parquet_compressions' NAME='@geoarrow/geoparquet-wasm' bash ./scripts/build.sh", + "build:geoparquet": "FEATURES='--no-default-features --features debug --features io_parquet --features io_parquet_async --features io_parquet_compressions' NAME='@geoarrow/geoparquet-wasm' bash ./scripts/build.sh", "build:flatgeobuf": "FEATURES='--no-default-features --features debug --features io_flatgeobuf' NAME='@geoarrow/flatgeobuf-wasm' bash ./scripts/build.sh", "docs:build": "typedoc", "docs:serve": "cd docs_build && http-server", diff --git 
a/js/src/io/mod.rs b/js/src/io/mod.rs index 49e790b15..433a7a1e9 100644 --- a/js/src/io/mod.rs +++ b/js/src/io/mod.rs @@ -2,5 +2,9 @@ pub mod flatgeobuf; #[cfg(feature = "io_geojson")] pub mod geojson; +// #[cfg(feature = "io_object_store")] +// pub mod object_store; +// #[cfg(feature = "io_object_store")] +// pub mod object_store_s3; #[cfg(feature = "io_parquet")] pub mod parquet; diff --git a/js/src/io/object_store.rs b/js/src/io/object_store.rs new file mode 100644 index 000000000..d57c386bb --- /dev/null +++ b/js/src/io/object_store.rs @@ -0,0 +1,95 @@ +//! Shims for object store on the web. + +use async_trait::async_trait; +use std::fmt; +use std::sync::Arc; +// use object_store::client::get::GetClientExt; +// use object_store::client::header::get_etag; +// use object_store::http::client::Client; +use bytes::Bytes; +use futures::stream::BoxStream; +use object_store::path::Path; +use object_store::{ + GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, + PutMode, PutOptions, PutResult, Result, +}; +use reqwest::Client; +use tokio::io::AsyncWrite; + +#[derive(Debug)] +pub struct HTTPWasmStore { + client: Arc, +} + +#[async_trait] +impl ObjectStore for HTTPWasmStore { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + todo!() + } + + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> Result { + Err(object_store::Error::NotImplemented) + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + Err(object_store::Error::NotImplemented) + } + + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, Result> { + todo!() + // let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); + // let prefix = prefix.cloned(); + // futures::stream::once(async move { + // let status = self.client.list(prefix.as_ref(), "infinity").await?; + + // let iter = status + // .response + // .into_iter() + // .filter(|r| 
!r.is_dir()) + // .map(|response| { + // response.check_ok()?; + // response.object_meta(self.client.base_url()) + // }) + // // Filter out exact prefix matches + // .filter_ok(move |r| r.location.as_ref().len() > prefix_len); + + // Ok::<_, object_store::Error>(futures::stream::iter(iter)) + // }) + // .try_flatten() + // .boxed() + } + + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { + Err(object_store::Error::NotImplemented) + } + + async fn delete(&self, _location: &Path) -> Result<()> { + Err(object_store::Error::NotImplemented) + } + + async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result { + Err(object_store::Error::NotImplemented) + } + + async fn copy(&self, _from: &Path, _to: &Path) -> Result<()> { + Err(object_store::Error::NotImplemented) + } + + async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> Result<()> { + Err(object_store::Error::NotImplemented) + } +} + +impl fmt::Display for HTTPWasmStore { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "HTTPWasmStore") + } +} diff --git a/js/src/io/object_store_s3/mod.rs b/js/src/io/object_store_s3/mod.rs new file mode 100644 index 000000000..161093772 --- /dev/null +++ b/js/src/io/object_store_s3/mod.rs @@ -0,0 +1,2 @@ +//! This is vendored from https://github.com/JanKaul/object_store_s3_wasm +//! 
under the MIT license diff --git a/js/src/io/parquet/async.rs b/js/src/io/parquet/async.rs new file mode 100644 index 000000000..8ad7cbe83 --- /dev/null +++ b/js/src/io/parquet/async.rs @@ -0,0 +1,92 @@ +use geoarrow::array::CoordType; +use geoarrow::io::parquet::ParquetDataset as _ParquetDataset; +use geoarrow::io::parquet::ParquetFile as _ParquetFile; +use wasm_bindgen::prelude::*; + +use crate::error::WasmResult; +use crate::io::parquet::async_file_reader::HTTPFileReader; +use crate::table::GeoTable; + +#[wasm_bindgen] +pub struct ParquetFile { + file: _ParquetFile, +} + +#[wasm_bindgen] +impl ParquetFile { + #[wasm_bindgen(constructor)] + pub async fn new(url: String) -> WasmResult { + let reader = HTTPFileReader::new(url, Default::default(), 500_000); + let file = _ParquetFile::new(reader, Default::default()).await?; + Ok(Self { file }) + } + + /// The number of rows in this file. + #[wasm_bindgen(getter, js_name = numRows)] + pub fn num_rows(&self) -> usize { + self.file.num_rows() + } + + /// The number of row groups in this file. + #[wasm_bindgen(getter, js_name = numRowGroups)] + pub fn num_row_groups(&self) -> usize { + self.file.num_row_groups() + } + + /// Access the bounding box of the given column for the entire file + /// + /// If no column name is passed, retrieves the bbox from the primary geometry column. + /// + /// An Err will be returned if the column name does not exist in the dataset + /// None will be returned if the metadata does not contain bounding box information. 
+ #[wasm_bindgen(js_name = fileBbox)] + pub fn file_bbox(&self, column_name: Option) -> WasmResult>> { + let name = column_name.as_deref(); + let bbox = self.file.file_bbox(name)?; + Ok(bbox.map(|b| b.to_vec())) + } + + pub async fn read(&self) -> WasmResult { + let table = self.file.read(&Default::default()).await?; + Ok(table.into()) + } + + #[wasm_bindgen(js_name = readRowGroups)] + pub async fn read_row_groups(&self, row_groups: Vec) -> WasmResult { + let table = self + .file + .read_row_groups(row_groups, &CoordType::Interleaved) + .await?; + Ok(table.into()) + } +} + +#[wasm_bindgen] +pub struct ParquetDataset { + inner: _ParquetDataset, +} + +#[wasm_bindgen] +impl ParquetDataset { + #[wasm_bindgen(constructor)] + pub async fn new(urls: Vec) -> WasmResult { + let readers = urls + .into_iter() + .map(|url| HTTPFileReader::new(url, Default::default(), 500_000)) + .collect(); + let dataset = _ParquetDataset::new(readers, Default::default()).await?; + Ok(Self { inner: dataset }) + } + + /// The total number of rows across all files. 
+ #[wasm_bindgen(getter, js_name = numRows)] + pub fn num_rows(&self) -> usize { + self.inner.num_rows() + } + + /// The total number of row groups across all files + #[wasm_bindgen(getter, js_name = numRowGroups)] + pub fn num_row_groups(&self) -> usize { + self.inner.num_row_groups() + } +} diff --git a/js/src/io/parquet/async_file_reader/fetch.rs b/js/src/io/parquet/async_file_reader/fetch.rs new file mode 100644 index 000000000..0fc86f6ca --- /dev/null +++ b/js/src/io/parquet/async_file_reader/fetch.rs @@ -0,0 +1,95 @@ +use std::convert::TryInto; + +use futures::channel::oneshot; +use futures::future::BoxFuture; +use range_reader::{RangeOutput, RangedAsyncReader}; +use wasm_bindgen::prelude::*; +use wasm_bindgen_futures::spawn_local; + +/// Get content-length of file +pub async fn _get_content_length(url: String) -> Result { + let client = reqwest::Client::new(); + let resp = client.head(url).send().await?; + Ok(resp.content_length().unwrap().try_into().unwrap()) +} + +pub async fn get_content_length(url: String) -> Result { + let (sender, receiver) = oneshot::channel::(); + spawn_local(async move { + let inner_data = _get_content_length(url).await.unwrap(); + sender.send(inner_data).unwrap(); + }); + let data = receiver.await.unwrap(); + Ok(data) +} + +/// Construct range header from start and length +pub fn range_from_start_and_length(start: u64, length: u64) -> String { + // Subtract 1 from length because end is inclusive + // > bytes units ... 
are offsets (zero-indexed & inclusive) + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range + format!("bytes={}-{}", start, start + length - 1) +} + +pub fn range_from_start(start: u64) -> String { + format!("bytes={}-", start) +} + +pub fn range_from_end(length: u64) -> String { + format!("bytes=-{}", length) +} + +/// Make range request on remote file +async fn _make_range_request( + url: &str, + start: u64, + length: usize, +) -> Result, reqwest::Error> { + let client = reqwest::Client::new(); + let range_str = range_from_start_and_length(start, length as u64); + let resp = client + .get(url) + .header("Range", range_str) + .send() + .await? + .error_for_status()?; + Ok(resp.bytes().await?.to_vec()) +} + +pub async fn make_range_request( + url: String, + start: u64, + length: usize, +) -> Result, JsValue> { + let (sender, receiver) = oneshot::channel::>(); + spawn_local(async move { + let inner_data = _make_range_request(&url, start, length).await.unwrap(); + sender.send(inner_data).unwrap(); + }); + let data = receiver.await.unwrap(); + Ok(data) +} + +/// Create a RangedAsyncReader +pub fn create_reader( + url: String, + content_length: usize, + min_request_size: Option, +) -> RangedAsyncReader { + // at least 4kb per s3 request. Adjust to your liking. 
+ let min_request_size = min_request_size.unwrap_or(4 * 1024); + + // Closure for making an individual HTTP range request to a file + let range_get = Box::new(move |start: u64, length: usize| { + let url = url.clone(); + + Box::pin(async move { + let data = make_range_request(url.clone(), start, length) + .await + .unwrap(); + Ok(RangeOutput { start, data }) + }) as BoxFuture<'static, std::io::Result> + }); + + RangedAsyncReader::new(content_length, min_request_size, range_get) +} diff --git a/js/src/io/parquet/async_file_reader/mod.rs b/js/src/io/parquet/async_file_reader/mod.rs new file mode 100644 index 000000000..7df8e7277 --- /dev/null +++ b/js/src/io/parquet/async_file_reader/mod.rs @@ -0,0 +1,573 @@ +//! An asynchronous Parquet reader that is able to read and inspect remote files without +//! downloading them in entirety. + +pub mod fetch; + +use futures::channel::oneshot; +use futures::future::BoxFuture; +use parquet::arrow::ProjectionMask; +use std::ops::Range; +use std::sync::Arc; +use wasm_bindgen::prelude::*; +use wasm_bindgen_futures::spawn_local; + +use crate::error::WasmResult; +use fetch::{range_from_end, range_from_start_and_length}; + +use arrow_wasm::{RecordBatch, Table}; +use bytes::Bytes; +use futures::TryStreamExt; +use futures::{stream, FutureExt}; +use parquet::arrow::arrow_reader::ArrowReaderMetadata; +use parquet::arrow::async_reader::{AsyncFileReader, ParquetRecordBatchStreamBuilder}; + +use parquet::file::footer::{decode_footer, decode_metadata}; +use parquet::file::metadata::ParquetMetaData; +use reqwest::Client; + +use async_trait::async_trait; + +#[async_trait(?Send)] +trait SharedIO { + fn generate_builder( + reader: &T, + meta: &ArrowReaderMetadata, + batch_size: &usize, + projection_mask: &Option, + ) -> ParquetRecordBatchStreamBuilder { + let builder = + ParquetRecordBatchStreamBuilder::new_with_metadata(reader.clone(), meta.clone()) + .with_batch_size(*batch_size) + .with_projection( + projection_mask + .as_ref() + 
.unwrap_or(&ProjectionMask::all()) + .clone(), + ); + builder + } + + async fn inner_read_row_group( + &self, + reader: &T, + meta: &ArrowReaderMetadata, + batch_size: &usize, + projection_mask: &Option, + i: usize, + ) -> WasmResult { + let builder = Self::generate_builder(reader, meta, batch_size, projection_mask); + let schema = builder.schema().clone(); + let stream = builder + .with_row_groups(vec![i]) + .build() + .map_err(geoarrow::error::GeoArrowError::from)?; + let results = stream.try_collect::>().await.unwrap(); + + // NOTE: This is not only one batch by default due to arrow-rs's default rechunking. + // assert_eq!(results.len(), 1, "Expected one record batch"); + // Ok(RecordBatch::new(results.pop().unwrap())) + Ok(Table::new(schema, results)) + } + + async fn inner_stream( + &self, + concurrency: Option, + meta: &ArrowReaderMetadata, + reader: &T, + batch_size: &usize, + projection_mask: &Option, + ) -> WasmResult { + use futures::StreamExt; + let concurrency = concurrency.unwrap_or(1); + let meta = meta.clone(); + let reader = reader.clone(); + let batch_size = *batch_size; + let num_row_groups = meta.metadata().num_row_groups(); + let projection_mask = projection_mask.clone(); + let buffered_stream = stream::iter((0..num_row_groups).map(move |i| { + let builder = Self::generate_builder(&reader, &meta, &batch_size, &projection_mask) + .with_row_groups(vec![i]); + builder.build().unwrap().try_collect::>() + })) + .buffered(concurrency); + let out_stream = buffered_stream.flat_map(|maybe_record_batches| { + stream::iter(maybe_record_batches.unwrap()) + .map(|record_batch| Ok(RecordBatch::new(record_batch).into())) + }); + Ok(wasm_streams::ReadableStream::from_stream(out_stream).into_raw()) + } +} + +// #[wasm_bindgen] +// pub struct AsyncParquetFile { +// reader: HTTPFileReader, +// meta: ArrowReaderMetadata, +// batch_size: usize, +// projection_mask: Option, +// } + +// impl SharedIO for AsyncParquetFile {} + +// #[wasm_bindgen] +// impl 
AsyncParquetFile { +// #[wasm_bindgen(constructor)] +// pub async fn new(url: String) -> WasmResult { +// let client = Client::new(); +// let mut reader = HTTPFileReader::new(url.clone(), client.clone(), 1024); +// let meta = ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; +// Ok(Self { +// reader, +// meta, +// projection_mask: None, +// batch_size: 1024, +// }) +// } + +// #[wasm_bindgen(js_name = withBatchSize)] +// pub fn with_batch_size(self, batch_size: usize) -> Self { +// Self { batch_size, ..self } +// } + +// #[wasm_bindgen(js_name = selectColumns)] +// pub fn select_columns(self, columns: Vec) -> WasmResult { +// let pq_schema = self.meta.parquet_schema(); +// let projection_mask = Some(generate_projection_mask(columns, pq_schema)?); +// Ok(Self { +// projection_mask, +// ..self +// }) +// } + +// // #[wasm_bindgen] +// // pub fn metadata(&self) -> WasmResult { +// // Ok(self.meta.metadata().as_ref().to_owned().into()) +// // } + +// #[wasm_bindgen(js_name = readRowGroup)] +// pub async fn read_row_group(&self, i: usize) -> WasmResult
{ +// self.inner_read_row_group( +// &self.reader, +// &self.meta, +// &self.batch_size, +// &self.projection_mask, +// i, +// ) +// .await +// } + +// #[wasm_bindgen] +// pub async fn stream( +// &self, +// concurrency: Option, +// ) -> WasmResult { +// self.inner_stream( +// concurrency, +// &self.meta, +// &self.reader, +// &self.batch_size, +// &self.projection_mask, +// ) +// .await +// } +// } + +#[derive(Debug, Clone)] +pub struct HTTPFileReader { + url: String, + client: Client, + coalesce_byte_size: usize, +} + +impl HTTPFileReader { + pub fn new(url: String, client: Client, coalesce_byte_size: usize) -> Self { + Self { + url, + client, + coalesce_byte_size, + } + } +} + +impl AsyncFileReader for HTTPFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { + async move { + let range_str = + range_from_start_and_length(range.start as u64, (range.end - range.start) as u64); + + // Map reqwest error to parquet error + // let map_err = |err| parquet::errors::ParquetError::External(Box::new(err)); + + let bytes = make_range_request_with_client( + self.url.to_string(), + self.client.clone(), + range_str, + ) + .await + .unwrap(); + + Ok(bytes) + } + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, parquet::errors::Result>> { + let fetch_ranges = merge_ranges(&ranges, self.coalesce_byte_size); + + // NOTE: This still does _sequential_ requests, but it should be _fewer_ requests if they + // can be merged. 
+ async move { + let mut fetched = Vec::with_capacity(ranges.len()); + + for range in fetch_ranges.iter() { + let data = self.get_bytes(range.clone()).await?; + fetched.push(data); + } + + Ok(ranges + .iter() + .map(|range| { + let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; + let fetch_range = &fetch_ranges[idx]; + let fetch_bytes = &fetched[idx]; + + let start = range.start - fetch_range.start; + let end = range.end - fetch_range.start; + fetch_bytes.slice(start..end) + }) + .collect()) + } + .boxed() + } + + fn get_metadata(&mut self) -> BoxFuture<'_, parquet::errors::Result>> { + async move { + let meta = fetch_parquet_metadata(self.url.as_str(), &self.client, None).await?; + Ok(Arc::new(meta)) + } + .boxed() + } +} + +// /// Safety: Do not use this in a multi-threaded environment, +// /// (transitively depends on !Send web_sys::File) +// #[wasm_bindgen] +// pub struct AsyncParquetLocalFile { +// reader: JsFileReader, +// meta: ArrowReaderMetadata, +// batch_size: usize, +// projection_mask: Option, +// } + +// impl SharedIO for AsyncParquetLocalFile {} + +// #[wasm_bindgen] +// impl AsyncParquetLocalFile { +// #[wasm_bindgen(constructor)] +// pub async fn new(handle: web_sys::File) -> WasmResult { +// let mut reader = JsFileReader::new(handle, 1024); +// let meta = ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; +// Ok(Self { +// reader, +// meta, +// batch_size: 1024, +// projection_mask: None, +// }) +// } + +// #[wasm_bindgen(js_name = withBatchSize)] +// pub fn with_batch_size(self, batch_size: usize) -> Self { +// Self { batch_size, ..self } +// } + +// #[wasm_bindgen(js_name = selectColumns)] +// pub fn select_columns(self, columns: Vec) -> WasmResult { +// let pq_schema = self.meta.parquet_schema(); +// let projection_mask = Some(generate_projection_mask(columns, pq_schema)?); +// Ok(Self { +// projection_mask, +// ..self +// }) +// } + +// #[wasm_bindgen] +// pub fn metadata(&self) -> WasmResult { +// 
Ok(self.meta.metadata().as_ref().to_owned().into()) +// } + +// #[wasm_bindgen(js_name = readRowGroup)] +// pub async fn read_row_group(&self, i: usize) -> WasmResult
{ +// let inner = self +// .inner_read_row_group( +// &self.reader, +// &self.meta, +// &self.batch_size, +// &self.projection_mask, +// i, +// ) +// .await +// .unwrap(); +// Ok(inner) +// } + +// #[wasm_bindgen] +// pub async fn stream( +// &self, +// concurrency: Option, +// ) -> WasmResult { +// self.inner_stream( +// concurrency, +// &self.meta, +// &self.reader, +// &self.batch_size, +// &self.projection_mask, +// ) +// .await +// } +// } + +// #[derive(Debug, Clone)] +// struct WrappedFile { +// inner: web_sys::File, +// pub size: f64, +// } +// /// Safety: This is not in fact thread-safe. Do not attempt to use this in work-stealing +// /// async runtimes / multi-threaded environments +// /// +// /// web_sys::File objects, like all JSValues, are !Send (even in JS, there's +// /// maybe ~5 Transferable types), and eventually boil down to PhantomData<*mut u8>. +// /// Any struct that holds one is inherently !Send, which disqualifies it from being used +// /// with the AsyncFileReader trait. 
+// unsafe impl Send for WrappedFile {} + +// impl WrappedFile { +// pub fn new(inner: web_sys::File) -> Self { +// let size = inner.size(); +// Self { inner, size } +// } +// pub async fn get_bytes(&mut self, range: Range) -> Vec { +// use js_sys::Uint8Array; +// use wasm_bindgen_futures::JsFuture; +// let (sender, receiver) = oneshot::channel(); +// let file = self.inner.clone(); +// spawn_local(async move { +// let subset_blob = file +// .slice_with_i32_and_i32( +// range.start.try_into().unwrap(), +// range.end.try_into().unwrap(), +// ) +// .unwrap(); +// let buf = JsFuture::from(subset_blob.array_buffer()).await.unwrap(); +// let out_vec = Uint8Array::new_with_byte_offset(&buf, 0).to_vec(); +// sender.send(out_vec).unwrap(); +// }); + +// receiver.await.unwrap() +// } +// } + +// #[derive(Debug, Clone)] +// pub struct JsFileReader { +// file: WrappedFile, +// coalesce_byte_size: usize, +// } + +// impl JsFileReader { +// pub fn new(file: web_sys::File, coalesce_byte_size: usize) -> Self { +// Self { +// file: WrappedFile::new(file), +// coalesce_byte_size, +// } +// } +// } + +// impl AsyncFileReader for JsFileReader { +// fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { +// async move { +// let (sender, receiver) = oneshot::channel(); +// let mut file = self.file.clone(); +// spawn_local(async move { +// let result: Bytes = file.get_bytes(range).await.into(); +// sender.send(result).unwrap() +// }); +// let data = receiver.await.unwrap(); +// Ok(data) +// } +// .boxed() +// } + +// fn get_byte_ranges( +// &mut self, +// ranges: Vec>, +// ) -> BoxFuture<'_, parquet::errors::Result>> { +// let fetch_ranges = merge_ranges(&ranges, self.coalesce_byte_size); + +// // NOTE: This still does _sequential_ requests, but it should be _fewer_ requests if they +// // can be merged. 
+// // Assuming that we have a file on the local file system, these fetches should be +// // _relatively_ fast +// async move { +// let mut fetched = Vec::with_capacity(ranges.len()); + +// for range in fetch_ranges.iter() { +// let data = self.get_bytes(range.clone()).await?; +// fetched.push(data); +// } + +// Ok(ranges +// .iter() +// .map(|range| { +// // a given range CAN span two coalesced row group sets. +// // log!("Range: {:?} Actual length: {:?}", range.end - range.start, res.len()); +// let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; +// let fetch_range = &fetch_ranges[idx]; +// let fetch_bytes = &fetched[idx]; + +// let start = range.start - fetch_range.start; +// let end = range.end - fetch_range.start; +// fetch_bytes.slice(start..end) +// }) +// .collect()) +// } +// .boxed() +// } + +// fn get_metadata(&mut self) -> BoxFuture<'_, parquet::errors::Result>> { +// async move { +// // we only *really* need the last 8 bytes to determine the location of the metadata bytes +// let file_size: usize = (self.file.size as i64).try_into().unwrap(); +// // we already know the size of the file! +// let suffix_range: Range = (file_size - 8)..file_size; +// let suffix = self.get_bytes(suffix_range).await.unwrap(); +// let suffix_len = suffix.len(); + +// let mut footer = [0; 8]; +// footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]); +// let metadata_byte_length = decode_footer(&footer)?; +// // Did not fetch the entire file metadata in the initial read, need to make a second request +// let meta = if metadata_byte_length > suffix_len - 8 { +// // might want to figure out how to get get_bytes to accept a one-sided range +// let meta_range = (file_size - metadata_byte_length - 8)..file_size; + +// let meta_bytes = self.get_bytes(meta_range).await.unwrap(); + +// decode_metadata(&meta_bytes[0..meta_bytes.len() - 8])? 
+// } else { +// let metadata_start = suffix_len - metadata_byte_length - 8; + +// let slice = &suffix[metadata_start..suffix_len - 8]; +// decode_metadata(slice)? +// }; +// Ok(Arc::new(meta)) +// } +// .boxed() +// } +// } + +pub async fn make_range_request_with_client( + url: String, + client: Client, + range_str: String, +) -> std::result::Result { + let (sender, receiver) = oneshot::channel(); + spawn_local(async move { + let resp = client + .get(url) + .header("Range", range_str) + .send() + .await + .unwrap() + .error_for_status() + .unwrap(); + let bytes = resp.bytes().await.unwrap(); + sender.send(bytes).unwrap(); + }); + let data = receiver.await.unwrap(); + Ok(data) +} + +/// Returns a sorted list of ranges that cover `ranges` +/// +/// Copied from object-store +/// https://github.com/apache/arrow-rs/blob/61da64a0557c80af5bb43b5f15c6d8bb6a314cb2/object_store/src/util.rs#L132C1-L169C1 +fn merge_ranges(ranges: &[Range], coalesce: usize) -> Vec> { + if ranges.is_empty() { + return vec![]; + } + + let mut ranges = ranges.to_vec(); + ranges.sort_unstable_by_key(|range| range.start); + + let mut ret = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + let mut range_end = ranges[start_idx].end; + + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(range_end) + .map(|delta| delta <= coalesce) + .unwrap_or(true) + { + range_end = range_end.max(ranges[end_idx].end); + end_idx += 1; + } + + let start = ranges[start_idx].start; + let end = range_end; + ret.push(start..end); + + start_idx = end_idx; + end_idx += 1; + } + + ret +} + +// Derived from: +// https://github.com/apache/arrow-rs/blob/61da64a0557c80af5bb43b5f15c6d8bb6a314cb2/parquet/src/arrow/async_reader/metadata.rs#L54-L57 +pub async fn fetch_parquet_metadata( + url: &str, + client: &Client, + prefetch: Option, +) -> parquet::errors::Result { + let suffix_length = prefetch.unwrap_or(8); + let range_str = 
range_from_end(suffix_length as u64); + + // Map reqwest error to parquet error + // let map_err = |err| parquet::errors::ParquetError::External(Box::new(err)); + + let suffix = make_range_request_with_client(url.to_string(), client.clone(), range_str) + .await + .unwrap(); + let suffix_len = suffix.len(); + + let mut footer = [0; 8]; + footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]); + + let metadata_byte_length = decode_footer(&footer)?; + + // Did not fetch the entire file metadata in the initial read, need to make a second request + let metadata = if metadata_byte_length > suffix_len - 8 { + let metadata_range_str = range_from_end((metadata_byte_length + 8) as u64); + + let meta_bytes = + make_range_request_with_client(url.to_string(), client.clone(), metadata_range_str) + .await + .unwrap(); + + decode_metadata(&meta_bytes[0..meta_bytes.len() - 8])? + } else { + let metadata_start = suffix_len - metadata_byte_length - 8; + + let slice = &suffix[metadata_start..suffix_len - 8]; + decode_metadata(slice)? 
+ }; + + Ok(metadata) +} diff --git a/js/src/io/parquet/mod.rs b/js/src/io/parquet/mod.rs new file mode 100644 index 000000000..2947ed605 --- /dev/null +++ b/js/src/io/parquet/mod.rs @@ -0,0 +1,9 @@ +#[cfg(feature = "io_parquet_async")] +pub mod r#async; +#[cfg(feature = "io_parquet_async")] +pub mod async_file_reader; +pub mod sync; + +#[cfg(feature = "io_parquet_async")] +pub use r#async::{ParquetDataset, ParquetFile}; +pub use sync::read_geoparquet; diff --git a/js/src/io/parquet.rs b/js/src/io/parquet/sync.rs similarity index 92% rename from js/src/io/parquet.rs rename to js/src/io/parquet/sync.rs index b5c81797a..cddf6f38b 100644 --- a/js/src/io/parquet.rs +++ b/js/src/io/parquet/sync.rs @@ -6,7 +6,7 @@ use wasm_bindgen::prelude::*; use crate::error::WasmResult; use crate::table::GeoTable; -/// Read a FlatGeobuf file into GeoArrow memory +/// Read a GeoParquet file into GeoArrow memory /// /// Example: /// @@ -21,7 +21,7 @@ use crate::table::GeoTable; /// const arrowTable = tableFromIPC(arrowUint8Array); /// ``` /// -/// @param file Uint8Array containing FlatGeobuf data +/// @param file Uint8Array containing GeoParquet data /// @returns Uint8Array containing Arrow data in [IPC Stream format](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format). To parse this into an Arrow table, pass to `tableFromIPC` in the Arrow JS bindings. 
#[wasm_bindgen(js_name = readGeoParquet)] pub fn read_geoparquet(file: Vec) -> WasmResult { diff --git a/js/src/table.rs b/js/src/table.rs index d20d6fbf4..5b28baaca 100644 --- a/js/src/table.rs +++ b/js/src/table.rs @@ -13,3 +13,15 @@ impl GeoTable { Table::new(schema, batches) } } + +impl From for GeoTable { + fn from(value: geoarrow::table::GeoTable) -> Self { + Self(value) + } +} + +impl From for geoarrow::table::GeoTable { + fn from(value: GeoTable) -> Self { + value.0 + } +} diff --git a/python/core/Cargo.lock b/python/core/Cargo.lock index 1c7445b69..87b3af3a7 100644 --- a/python/core/Cargo.lock +++ b/python/core/Cargo.lock @@ -1016,6 +1016,7 @@ dependencies = [ "arrow-buffer", "bytes", "flatgeobuf", + "futures", "geo", "geoarrow", "geozero", @@ -1959,15 +1960,18 @@ dependencies = [ "bytes", "chrono", "flate2", + "futures", "half", "hashbrown", "lz4_flex", "num", "num-bigint", + "object_store", "paste", "seq-macro", "snap", "thrift", + "tokio", "twox-hash", "zstd", ] diff --git a/python/core/Cargo.toml b/python/core/Cargo.toml index 3fe8a01b5..868d65e81 100644 --- a/python/core/Cargo.toml +++ b/python/core/Cargo.toml @@ -22,10 +22,11 @@ arrow-buffer = "51" arrow = { version = "51", features = ["ffi"] } bytes = "1" flatgeobuf = { version = "4.1.0", default-features = false } +futures = "0.3" object_store = { version = "0.9.0", features = ["aws", "azure", "gcp", "http"] } object_store_python = { git = "https://github.com/kylebarron/object-store-python", branch = "kyle/expose-inner", package = "object-store-internal" } # object_store_python = { git = "https://github.com/roeap/object-store-python", rev = "445e9d7fa238fc3cd31cc2820caee0d8e10fedb8", package = "object-store-internal" } -parquet = "51" +parquet = { version = "51", features = ["object_store"] } pyo3 = { version = "0.20.0", features = [ "abi3-py38", "multiple-pymethods", @@ -41,6 +42,7 @@ geoarrow = { path = "../../", features = [ "flatgeobuf", "geozero", "ipc_compression", + "parquet_async", 
"parquet_compression", "parquet", "polylabel", diff --git a/python/core/python/geoarrow/rust/core/_rust.pyi b/python/core/python/geoarrow/rust/core/_rust.pyi index 3ee5c3e3b..062e535ca 100644 --- a/python/core/python/geoarrow/rust/core/_rust.pyi +++ b/python/core/python/geoarrow/rust/core/_rust.pyi @@ -1428,6 +1428,29 @@ def total_bounds( def explode(input: ArrowStreamExportable) -> GeoTable: ... # I/O + +class ParquetFile: + def __init__(self, path: str, fs: ObjectStore) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + def file_bbox(self) -> Optional[List[float]]: ... + async def read_async(self) -> GeoTable: ... + def read(self) -> GeoTable: ... + async def read_row_groups_async(self, row_groups: Sequence[int]) -> GeoTable: ... + def read_row_groups(self, row_groups: Sequence[int]) -> GeoTable: ... + +class ParquetDataset: + def __init__(self, paths: Sequence[str], fs: ObjectStore) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + +class ObjectStore: + def __init__(self, root: str, options: Optional[Dict[str, str]] = None) -> None: ... + def read_csv( file: str | Path | BinaryIO, geometry_column_name: str, @@ -1499,9 +1522,6 @@ def write_ipc_stream( ) -> None: ... def write_parquet(table: ArrowStreamExportable, file: str) -> None: ... -class ObjectStore: - def __init__(self, root: str, options: Optional[Dict[str, str]] = None) -> None: ... 
- # Interop def from_ewkb( input: ArrowArrayExportable, diff --git a/python/core/src/io/parquet.rs b/python/core/src/io/parquet.rs index a23b87d1f..7b9b15b77 100644 --- a/python/core/src/io/parquet.rs +++ b/python/core/src/io/parquet.rs @@ -1,11 +1,20 @@ use std::fs::File; use std::io::BufWriter; +use std::sync::Arc; -use crate::error::PyGeoArrowResult; +use crate::error::{PyGeoArrowError, PyGeoArrowResult}; use crate::table::GeoTable; + +use geoarrow::array::CoordType; +use geoarrow::error::GeoArrowError; use geoarrow::io::parquet::read_geoparquet as _read_geoparquet; use geoarrow::io::parquet::write_geoparquet as _write_geoparquet; use geoarrow::io::parquet::GeoParquetReaderOptions; +use geoarrow::io::parquet::ParquetDataset as _ParquetDataset; +use geoarrow::io::parquet::ParquetFile as _ParquetFile; +use object_store::ObjectStore; +use object_store_python::PyObjectStore; +use parquet::arrow::async_reader::ParquetObjectReader; use pyo3::exceptions::PyFileNotFoundError; use pyo3::prelude::*; @@ -44,3 +53,202 @@ pub fn write_parquet(mut table: GeoTable, file: String) -> PyGeoArrowResult<()> _write_geoparquet(&mut table.0, writer, None)?; Ok(()) } + +/// Reader interface for a single Parquet file. +#[pyclass(module = "geoarrow.rust.core._rust")] +pub struct ParquetFile { + file: _ParquetFile, +} + +#[pymethods] +impl ParquetFile { + /// Construct a new ParquetFile + /// + /// This will synchronously fetch metadata from the provided path + /// + /// Args: + /// path: a string URL to read from. + /// fs: the file system interface to read from. + /// + /// Returns: + /// A new ParquetFile object. 
+ // TODO: change this to aenter + #[new] + pub fn new(path: String, fs: PyObjectStore) -> PyGeoArrowResult { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async move { + let meta = fs + .inner + .head(&path.into()) + .await + .map_err(GeoArrowError::ObjectStoreError)?; + let reader = ParquetObjectReader::new(fs.inner, meta); + let file = _ParquetFile::new(reader, Default::default()).await?; + Ok(Self { file }) + }) + } + + /// The number of rows in this file. + #[getter] + fn num_rows(&self) -> usize { + self.file.num_rows() + } + + /// The number of row groups in this file. + #[getter] + fn num_row_groups(&self) -> usize { + self.file.num_row_groups() + } + + /// Access the bounding box of the given column for the entire file + /// + /// If no column name is passed, retrieves the bbox from the primary geometry column. + /// + /// An Err will be returned if the column name does not exist in the dataset + /// None will be returned if the metadata does not contain bounding box information. + fn file_bbox(&self, column_name: Option<&str>) -> PyGeoArrowResult>> { + let bbox = self.file.file_bbox(column_name)?; + Ok(bbox.map(|b| b.to_vec())) + } + + /// Read this entire file in an async fashion. + fn read_async(&self, py: Python) -> PyGeoArrowResult { + let file = self.file.clone(); + let fut = pyo3_asyncio::tokio::future_into_py(py, async move { + let table = file + .read(&CoordType::Interleaved) + .await + .map_err(PyGeoArrowError::GeoArrowError)?; + Ok(GeoTable(table)) + })?; + Ok(fut.into()) + } + + /// Read this entire file synchronously. + fn read(&self) -> PyGeoArrowResult { + let file = self.file.clone(); + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async move { + let table = file + .read(&CoordType::Interleaved) + .await + .map_err(PyGeoArrowError::GeoArrowError)?; + Ok(GeoTable(table)) + }) + } + + /// Read the selected row group indexes in an async fashion. 
+ /// + /// Args: + /// row_groups: numeric indexes of the Parquet row groups to read. + /// + /// Returns: + /// parsed table. + fn read_row_groups_async( + &self, + py: Python, + row_groups: Vec, + ) -> PyGeoArrowResult { + let file = self.file.clone(); + let fut = pyo3_asyncio::tokio::future_into_py(py, async move { + let table = file + .read_row_groups(row_groups, &CoordType::Interleaved) + .await + .map_err(PyGeoArrowError::GeoArrowError)?; + Ok(GeoTable(table)) + })?; + Ok(fut.into()) + } + + /// Read the selected row group indexes synchronously. + /// + /// Args: + /// row_groups: numeric indexes of the Parquet row groups to read. + /// + /// Returns: + /// parsed table. + fn read_row_groups(&self, row_groups: Vec) -> PyGeoArrowResult { + let file = self.file.clone(); + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async move { + let table = file + .read_row_groups(row_groups, &CoordType::Interleaved) + .await + .map_err(PyGeoArrowError::GeoArrowError)?; + Ok(GeoTable(table)) + }) + } +} + +/// Encapsulates details of reading a complete Parquet dataset possibly consisting of multiple +/// files and partitions in subdirectories. +#[pyclass(module = "geoarrow.rust.core._rust")] +pub struct ParquetDataset { + inner: _ParquetDataset, +} + +/// Create a reader per path with the given ObjectStore instance. 
+async fn create_readers( + paths: Vec, + store: Arc, +) -> PyGeoArrowResult> { + let paths: Vec = paths.into_iter().map(|path| path.into()).collect(); + let futures = paths.iter().map(|path| store.head(path)); + let object_metas = futures::future::join_all(futures) + .await + .into_iter() + .collect::, object_store::Error>>() + .map_err(GeoArrowError::ObjectStoreError)?; + let readers = object_metas + .into_iter() + .map(|meta| ParquetObjectReader::new(store.clone(), meta)) + .collect::>(); + Ok(readers) +} + +#[pymethods] +impl ParquetDataset { + /// Construct a new ParquetDataset + /// + /// This will synchronously fetch metadata from all listed files. + /// + /// Args: + /// paths: a list of string URLs to read from. + /// fs: the file system interface to read from. + /// + /// Returns: + /// A new ParquetDataset object. + #[new] + pub fn new(paths: Vec, fs: PyObjectStore) -> PyGeoArrowResult { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async move { + let readers = create_readers(paths, fs.inner).await?; + let dataset = _ParquetDataset::new(readers, Default::default()).await?; + Ok(Self { inner: dataset }) + }) + } + + /// The total number of rows across all files. 
+ #[getter] + fn num_rows(&self) -> usize { + self.inner.num_rows() + } + + /// The total number of row groups across all files + #[getter] + fn num_row_groups(&self) -> usize { + self.inner.num_row_groups() + } +} diff --git a/python/core/src/lib.rs b/python/core/src/lib.rs index 172a90ea5..9799e6be3 100644 --- a/python/core/src/lib.rs +++ b/python/core/src/lib.rs @@ -165,6 +165,8 @@ fn _rust(_py: Python, m: &PyModule) -> PyResult<()> { // IO m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(crate::io::csv::read_csv, m)?)?; m.add_function(wrap_pyfunction!(crate::io::flatgeobuf::read_flatgeobuf, m)?)?; diff --git a/src/array/metadata.rs b/src/array/metadata.rs index f4f4f529f..81dbaecc1 100644 --- a/src/array/metadata.rs +++ b/src/array/metadata.rs @@ -3,10 +3,11 @@ //! This metadata is [defined by the GeoArrow specification](https://geoarrow.org/extension-types). use serde::{Deserialize, Serialize}; +use serde_json::Value; /// If present, instructs consumers that edges follow a spherical path rather than a planar one. If /// this value is omitted, edges will be interpreted as planar. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum Edges { #[serde(rename = "spherical")] Spherical, @@ -17,7 +18,7 @@ pub enum Edges { /// /// This is serialized to JSON when a [`geoarrow`] array is exported to an [`arrow`] array and /// deserialized when imported from an [`arrow`] array. -#[derive(Default, Debug, Clone, Serialize, Deserialize, PartialEq, Hash)] +#[derive(Default, Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct ArrayMetadata { /// A JSON object describing the coordinate reference system (CRS) using PROJJSON. This key can /// also be omitted if the producer does not have any information about the CRS. 
Note that @@ -25,7 +26,7 @@ pub struct ArrayMetadata { /// to the wording in the GeoPackage WKB binary encoding: axis order is always (longitude, /// latitude) and (easting, northing) regardless of the the axis order encoded in the CRS /// specification. - pub crs: Option, + pub crs: Option, /// If present, instructs consumers that edges follow a spherical path rather than a planar /// one. If this value is omitted, edges will be interpreted as planar. diff --git a/src/io/geozero/table/builder/table.rs b/src/io/geozero/table/builder/table.rs index a3755b4a1..606dad0f2 100644 --- a/src/io/geozero/table/builder/table.rs +++ b/src/io/geozero/table/builder/table.rs @@ -14,7 +14,7 @@ use crate::table::GeoTable; use crate::trait_::{GeometryArrayBuilder, GeometryArrayTrait}; /// Options for creating a GeoTableBuilder. -#[derive(Debug, Clone, PartialEq, Hash)] +#[derive(Debug, Clone, PartialEq)] pub struct GeoTableBuilderOptions { pub metadata: Arc, diff --git a/src/io/parquet/metadata.rs b/src/io/parquet/metadata.rs index bd9da56f8..d2d414b07 100644 --- a/src/io/parquet/metadata.rs +++ b/src/io/parquet/metadata.rs @@ -12,14 +12,14 @@ use parquet::file::metadata::FileMetaData; use serde::{Deserialize, Serialize}; use serde_json::Value; -#[derive(Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct GeoParquetMetadata { pub version: String, pub primary_column: String, pub columns: HashMap, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct GeoParquetColumnMetadata { pub encoding: String, pub geometry_types: Vec, @@ -53,6 +53,65 @@ impl GeoParquetMetadata { "expected a 'geo' key in GeoParquet metadata".to_string(), )) } + + /// Check if this metadata is compatible with another metadata instance, swallowing the error + /// message if not compatible. 
+ pub fn is_compatible_with(&self, other: &GeoParquetMetadata) -> bool { + self.try_compatible_with(other).is_ok() + } + + /// Assert that this metadata is compatible with another metadata instance, erroring if not + pub fn try_compatible_with(&self, other: &GeoParquetMetadata) -> Result<()> { + if self.version.as_str() != other.version.as_str() { + return Err(GeoArrowError::General( + "Different GeoParquet versions".to_string(), + )); + } + + if self.primary_column.as_str() != other.primary_column.as_str() { + return Err(GeoArrowError::General( + "Different GeoParquet primary columns".to_string(), + )); + } + + for key in self.columns.keys() { + let left = self.columns.get(key).unwrap(); + let right = other + .columns + .get(key) + .ok_or(GeoArrowError::General(format!( + "Other GeoParquet metadata missing column {}", + key + )))?; + + if left.encoding.as_str() != right.encoding.as_str() { + return Err(GeoArrowError::General(format!( + "Different GeoParquet encodings for column {}", + key + ))); + } + + match (left.crs.as_ref(), right.crs.as_ref()) { + (Some(left_crs), Some(right_crs)) => { + if left_crs != right_crs { + return Err(GeoArrowError::General(format!( + "Different GeoParquet CRS for column {}", + key + ))); + } + } + (Some(_), None) | (None, Some(_)) => { + return Err(GeoArrowError::General(format!( + "Different GeoParquet CRS for column {}", + key + ))); + } + (None, None) => (), + } + } + + Ok(()) + } } // TODO: deduplicate with `resolve_types` in `downcast.rs` diff --git a/src/io/parquet/mod.rs b/src/io/parquet/mod.rs index 5037c6c68..bb6e9d0fb 100644 --- a/src/io/parquet/mod.rs +++ b/src/io/parquet/mod.rs @@ -37,9 +37,11 @@ mod metadata; mod reader; #[cfg(feature = "parquet_async")] mod reader_async; +#[cfg(test)] +mod test; mod writer; pub use reader::{read_geoparquet, GeoParquetReaderOptions}; #[cfg(feature = "parquet_async")] -pub use reader_async::read_geoparquet_async; +pub use reader_async::{read_geoparquet_async, ParquetDataset, ParquetFile, 
ParquetReaderOptions}; pub use writer::write_geoparquet; diff --git a/src/io/parquet/reader.rs b/src/io/parquet/reader.rs index 3f03a9f5c..0c5f0b1aa 100644 --- a/src/io/parquet/reader.rs +++ b/src/io/parquet/reader.rs @@ -20,6 +20,15 @@ impl GeoParquetReaderOptions { } } +impl Default for GeoParquetReaderOptions { + fn default() -> Self { + Self { + batch_size: 65535, + coord_type: Default::default(), + } + } +} + /// Read a GeoParquet file to a GeoTable. pub fn read_geoparquet( reader: R, diff --git a/src/io/parquet/reader_async.rs b/src/io/parquet/reader_async.rs index f737dc982..f0650d9c6 100644 --- a/src/io/parquet/reader_async.rs +++ b/src/io/parquet/reader_async.rs @@ -1,10 +1,15 @@ -use crate::error::Result; -use crate::io::parquet::metadata::build_arrow_schema; +use crate::array::CoordType; +use crate::error::{GeoArrowError, Result}; +use crate::io::parquet::metadata::{build_arrow_schema, GeoParquetMetadata}; use crate::io::parquet::reader::GeoParquetReaderOptions; use crate::table::GeoTable; +use arrow_schema::SchemaRef; use futures::stream::TryStreamExt; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::{AsyncFileReader, ParquetRecordBatchStreamBuilder}; +use parquet::arrow::ProjectionMask; +use serde_json::Value; /// Asynchronously read a GeoParquet file to a GeoTable. pub async fn read_geoparquet_async( @@ -14,9 +19,15 @@ pub async fn read_geoparquet_async( let builder = ParquetRecordBatchStreamBuilder::new(reader) .await? 
.with_batch_size(options.batch_size); + read_builder(builder, &options.coord_type).await +} +async fn read_builder( + builder: ParquetRecordBatchStreamBuilder, + coord_type: &CoordType, +) -> Result { let (arrow_schema, geometry_column_index, target_geo_data_type) = - build_arrow_schema(&builder, &options.coord_type)?; + build_arrow_schema(&builder, coord_type)?; let stream = builder.build()?; let batches = stream.try_collect::<_>().await?; @@ -29,6 +40,236 @@ pub async fn read_geoparquet_async( ) } +#[derive(Clone, Default)] +pub struct ParquetReaderOptions { + batch_size: Option, + limit: Option, + offset: Option, + projection: Option, +} + +/// To create from an object-store item: +/// +/// ```notest +/// let reader = ParquetObjectReader::new(store, meta); +/// +/// ``` +#[derive(Clone)] +pub struct ParquetFile { + reader: R, + meta: ArrowReaderMetadata, + options: ParquetReaderOptions, + geo_meta: Option, +} + +impl ParquetFile { + /// Construct a new `ParquetFile` from a reader. + /// + /// This will fetch the metadata from the reader. + pub async fn new(mut reader: R, options: ParquetReaderOptions) -> Result { + let reader_options = ArrowReaderOptions::new().with_page_index(true); + let meta = ArrowReaderMetadata::load_async(&mut reader, reader_options).await?; + let geo_meta = GeoParquetMetadata::from_parquet_meta(meta.metadata().file_metadata()).ok(); + Ok(Self { + reader, + meta, + options, + geo_meta, + }) + } + + /// Construct a new `ParquetFile` from an existing metadata + pub fn from_meta( + reader: R, + meta: ArrowReaderMetadata, + options: ParquetReaderOptions, + ) -> Result { + let geo_meta = GeoParquetMetadata::from_parquet_meta(meta.metadata().file_metadata()).ok(); + Ok(Self { + reader, + meta, + options, + geo_meta, + }) + } + + /// The Arrow schema of the underlying data + /// + /// Note that this schema is before conversion of any geometry column(s) to GeoArrow. 
+ pub fn schema(&self) -> SchemaRef { + self.meta.schema().clone() + } + + /// The number of rows in this file. + pub fn num_rows(&self) -> usize { + self.meta + .metadata() + .row_groups() + .iter() + .fold(0, |acc, row_group_meta| { + acc + usize::try_from(row_group_meta.num_rows()).unwrap() + }) + } + + /// The number of row groups in this file. + pub fn num_row_groups(&self) -> usize { + self.meta.metadata().num_row_groups() + } + + /// Access the geo metadata of this file. + pub fn geo_metadata(&self) -> Option<&GeoParquetMetadata> { + self.geo_meta.as_ref() + } + + /// Access the bounding box of the given column for the entire file + /// + /// If no column name is passed, retrieves the bbox from the primary geometry column. + /// + /// An Err will be returned if the column name does not exist in the dataset + /// None will be returned if the metadata does not contain bounding box information. + pub fn file_bbox(&self, column_name: Option<&str>) -> Result> { + if let Some(geo_meta) = self.geo_metadata() { + let column_name = column_name.unwrap_or(geo_meta.primary_column.as_str()); + let column_meta = geo_meta + .columns + .get(column_name) + .ok_or(GeoArrowError::General(format!( + "Column {} not found in GeoParquet metadata", + column_name + )))?; + Ok(column_meta.bbox.as_deref()) + } else { + Ok(None) + } + } + + pub fn crs(&self, column_name: Option<&str>) -> Result> { + if let Some(geo_meta) = self.geo_metadata() { + let column_name = column_name.unwrap_or(geo_meta.primary_column.as_str()); + let column_meta = geo_meta + .columns + .get(column_name) + .ok_or(GeoArrowError::General(format!( + "Column {} not found in GeoParquet metadata", + column_name + )))?; + Ok(column_meta.crs.as_ref()) + } else { + Ok(None) + } + } + + fn builder(&self) -> ParquetRecordBatchStreamBuilder { + let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata( + self.reader.clone(), + self.meta.clone(), + ); + + if let Some(batch_size) = self.options.batch_size { + 
builder = builder.with_batch_size(batch_size); + } + + if let Some(limit) = self.options.limit { + builder = builder.with_limit(limit); + } + + if let Some(offset) = self.options.offset { + builder = builder.with_offset(offset); + } + + if let Some(projection) = &self.options.projection { + builder = builder.with_projection(projection.clone()); + } + + builder + } + + /// Read into a table. + pub async fn read(&self, coord_type: &CoordType) -> Result { + let builder = self.builder(); + read_builder(builder, coord_type).await + } + + /// Read the specified row groups into a table. + pub async fn read_row_groups( + &self, + row_groups: Vec, + coord_type: &CoordType, + ) -> Result { + let builder = self.builder().with_row_groups(row_groups); + read_builder(builder, coord_type).await + } +} + +#[derive(Clone)] +pub struct ParquetDataset { + // TODO: should this be a hashmap instead? + files: Vec>, +} + +impl ParquetDataset { + pub async fn new(readers: Vec, options: ParquetReaderOptions) -> Result { + let futures = readers + .into_iter() + .map(|reader| ParquetFile::new(reader, options.clone())); + let files = futures::future::join_all(futures) + .await + .into_iter() + .collect::>>()?; + + // Validate metadata across files with `GeoParquetMetadata::try_compatible_with` + for pair in files.windows(2) { + match (pair[0].geo_metadata(), pair[1].geo_metadata()) { + (Some(left), Some(right)) => left.try_compatible_with(right)?, + (None, Some(_)) | (Some(_), None) => { + return Err(GeoArrowError::General( + "Not all files have GeoParquet metadata".to_string(), + )) + } + (None, None) => (), + } + } + + Ok(Self { files }) + } + + /// The total number of rows across all files. 
+ pub fn num_rows(&self) -> usize { + self.files.iter().fold(0, |acc, file| acc + file.num_rows()) + } + + /// The total number of row groups across all files + pub fn num_row_groups(&self) -> usize { + self.files + .iter() + .fold(0, |acc, file| acc + file.num_row_groups()) + } + + /// The total bounds of the entire dataset + /// + /// An Err will be returned if the column name does not exist in the dataset + /// None will be returned if the metadata does not contain bounding box information. + pub fn total_bounds(&self, _column_name: Option<&str>) -> Result>> { + // let x = self.files.iter().try_fold(None::>, |acc, file| { + // match (acc, file.file_bbox(column_name)?) { + // (None, None) => Ok(None), + // (Some(acc), None) + // } + // })?; + todo!() + } + + /// Read into a table. + pub async fn read(&self, coord_type: &CoordType) -> Result> { + let futures = self.files.iter().map(|file| file.read(coord_type)); + let tables = futures::future::join_all(futures) + .await + .into_iter() + .collect::>>()?; + Ok(tables) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/src/io/parquet/test.rs b/src/io/parquet/test.rs new file mode 100644 index 000000000..4514c0bd8 --- /dev/null +++ b/src/io/parquet/test.rs @@ -0,0 +1,19 @@ +use std::fs::File; +use std::io::Cursor; + +use bytes::Bytes; + +use crate::io::parquet::{read_geoparquet, write_geoparquet}; + +#[ignore = "fails!"] +#[test] +fn round_trip_nybb() { + let file = File::open("fixtures/geoparquet/nybb.parquet").unwrap(); + let mut table = read_geoparquet(file, Default::default()).unwrap(); + + let mut buf = vec![]; + write_geoparquet(&mut table, Cursor::new(&mut buf), Default::default()).unwrap(); + let again = read_geoparquet(Bytes::from(buf), Default::default()).unwrap(); + assert_eq!(table.schema(), again.schema()); + // assert_eq!(table.geometry().unwrap().ch, again.geometry().unwrap()); +} diff --git a/src/io/parquet/writer.rs b/src/io/parquet/writer.rs index de4971c06..e6474d7ed 100644 --- 
a/src/io/parquet/writer.rs +++ b/src/io/parquet/writer.rs @@ -1,6 +1,5 @@ use std::collections::HashMap; use std::io::Write; -use std::str::FromStr; use parquet::arrow::ArrowWriter; use parquet::file::metadata::KeyValue; @@ -63,12 +62,7 @@ fn create_metadata(table: &GeoTable) -> Result { .first() .unwrap() .metadata(); - let crs = array_metadata - .as_ref() - .crs - .as_ref() - .map(|crs_str| serde_json::Value::from_str(crs_str.as_str())) - .transpose()?; + let crs = array_metadata.as_ref().crs.clone(); let geometry_column_name = table.schema().field(table.geometry_column_index()).name(); let column_meta = GeoParquetColumnMetadata {