From 0bdacd8670168071ff4522dd9a01e53403c618b1 Mon Sep 17 00:00:00 2001 From: Dumitrel Loghin Date: Thu, 18 Apr 2024 17:28:33 +0800 Subject: [PATCH] build Merkle Tree only with leaves allocated from Rust and remove locks --- field/Cargo.toml | 2 +- plonky2/Cargo.toml | 2 +- plonky2/src/fri/oracle.rs | 9 - plonky2/src/hash/merkle_tree.rs | 354 +++++--------------------------- plonky2/src/lib.rs | 1 + 5 files changed, 59 insertions(+), 309 deletions(-) diff --git a/field/Cargo.toml b/field/Cargo.toml index 25cf46ee92..230ad5030c 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -23,7 +23,7 @@ rand = { workspace = true, features = ["getrandom"] } serde = { workspace = true, features = ["alloc"] } static_assertions = { workspace = true } unroll = { workspace = true } -cryptography_cuda ={git="ssh://git@github.com/okx/cryptography_cuda.git", rev="56cee09dd044de44f05c7d54383c6a8cb4078b29", optional=true} +cryptography_cuda = { git = "ssh://git@github.com/okx/cryptography_cuda.git", rev = "173510160183f3299f4765b30bd4f2c1685353f9", optional = true } [dev-dependencies] rand = { version = "0.8.5", default-features = false, features = ["getrandom"] } diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 10c9ed375a..5078219997 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -42,7 +42,7 @@ once_cell = { version = "1.18.0" } plonky2_field = { version = "0.2.0", path = "../field", default-features = false } plonky2_maybe_rayon = { version = "0.2.0", path = "../maybe_rayon", default-features = false } plonky2_util = { version = "0.2.0", path = "../util", default-features = false } -cryptography_cuda ={git="ssh://git@github.com/okx/cryptography_cuda.git", rev="56cee09dd044de44f05c7d54383c6a8cb4078b29", optional=true} +cryptography_cuda = { git = "ssh://git@github.com/okx/cryptography_cuda.git", rev = "173510160183f3299f4765b30bd4f2c1685353f9", optional = true } [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] getrandom = { version = "0.2", default-features = false, features = ["js"] } diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 747c3a51b3..83f71071ab 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -6,8 +6,6 @@ use cryptography_cuda::{ device::memory::HostOrDeviceSlice, lde_batch, lde_batch_multi_gpu, transpose_rev_batch, types::*, }; -#[cfg(feature = "cuda")] -use crate::hash::merkle_tree::GPU_LOCK; use itertools::Itertools; use plonky2_field::types::Field; @@ -245,10 +243,6 @@ impl, C: GenericConfig, const D: usize> log_n: usize, _degree: usize, ) -> MerkleTree>::Hasher> { - - let mut lock = GPU_LOCK.lock().unwrap(); - *lock += 1; - // let salt_size = if blinding { SALT_SIZE } else { 0 }; // println!("salt_size: {:?}", salt_size); let output_domain_size = log_n + rate_bits; @@ -374,9 +368,6 @@ impl, C: GenericConfig, const D: usize> #[cfg(all(feature = "cuda", feature = "batch"))] if log_n > 10 && polynomials.len() > 0 { - let mut lock = GPU_LOCK.lock().unwrap(); - *lock += 1; - println!("log_n: {:?}", log_n); let start_lde = std::time::Instant::now(); diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index c4fdf2505b..e59512d56c 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -6,8 +6,6 @@ use core::mem::MaybeUninit; use core::slice; use std::collections::HashSet; #[cfg(feature = "cuda")] -use std::os::raw::c_void; -#[cfg(feature = "cuda")] use std::sync::Mutex; use std::time::Instant; @@ -17,9 +15,7 @@ use cryptography_cuda::device::memory::HostOrDeviceSlice; use cryptography_cuda::device::stream::CudaStream; #[cfg(feature = "cuda")] use cryptography_cuda::merkle::bindings::{ - fill_delete, fill_digests_buf_linear_gpu, fill_digests_buf_linear_gpu_with_gpu_ptr, - fill_digests_buf_linear_multigpu, fill_digests_buf_linear_multigpu_with_gpu_ptr, fill_init, - get_cap_ptr, get_digests_ptr, get_leaves_ptr, + fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, }; use num::range; #[cfg(feature = "cuda")] @@ -37,7 +33,7 @@ use crate::plonk::config::{GenericHashOut, Hasher}; use crate::util::log2_strict; #[cfg(feature = "cuda")] -pub static GPU_LOCK: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); +pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); #[cfg(feature = "cuda_timing")] fn print_time(now: Instant, msg: &str) { @@ -269,294 +265,49 @@ union U8U64 { } #[cfg(feature = "cuda")] -fn fill_digests_buf_gpu_v1>( +fn fill_digests_buf_gpu>( digests_buf: &mut [MaybeUninit], cap_buf: &mut [MaybeUninit], leaves: &Vec, leaf_size: usize, cap_height: usize, ) { - let digests_count: u64 = digests_buf.len().try_into().unwrap(); - let leaves_count: u64 = (leaves.len() / leaf_size).try_into().unwrap(); - let leaf_size: u64 = leaf_size.try_into().unwrap(); - let caps_count: u64 = cap_buf.len().try_into().unwrap(); - let cap_height: u64 = cap_height.try_into().unwrap(); - let hash_size: u64 = H::HASH_SIZE.try_into().unwrap(); - - let mut lock = GPU_LOCK.lock().unwrap(); - *lock += 1; - - unsafe { - let now = Instant::now(); - fill_init( - digests_count, - leaves_count, - caps_count, - leaf_size, - hash_size, - H::HASHER_TYPE as u64, - ); - print_time(now, "fill init"); - let now = Instant::now(); - - // copy data to C - let mut pd: *mut u64 = get_digests_ptr(); - let mut pl: *mut u64 = get_leaves_ptr(); - let mut pc: *mut u64 = get_cap_ptr(); - - for elem in leaves { - let val = &elem.to_canonical_u64(); - *pl = *val; - pl = pl.add(1); - } - - print_time(now, "copy data to C"); - let now = Instant::now(); + let leaves_count = leaves.len() / leaf_size; - let num_gpus: usize = std::env::var("NUM_OF_GPUS") + let num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") .parse() .unwrap(); - // println!("Digest size {}, Leaves {}, Leaf size {}, Cap H {}", digests_count, leaves_count, leaf_size, cap_height); - if !FORCE_SINGLE_GPU - && leaves_count >= (1 << 12) - && cap_height > 0 - && num_gpus > 1 - && H::HASHER_TYPE == HasherType::PoseidonBN128 - { - // println!("Multi GPU"); - fill_digests_buf_linear_multigpu( - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - num_gpus as u64, - ); - } else { - // println!("Single GPU"); - fill_digests_buf_linear_gpu( - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - ); - } - print_time(now, "kernel"); - let now = Instant::now(); - - // TODO - debug code - to remove in future - // let mut pd : *mut u64 = get_digests_ptr(); - /* - println!("*** Digests"); - for i in 0..leaves.len() { - for j in 0..leaf_size { - print!("{} ", *pd); - pd = pd.add(1); - } - println!(); - } - pd = get_digests_ptr(); - */ - /* - let fname = format!("gpu-{}-{}-{}-{}.txt", digests_count, leaves_count, leaf_size, cap_height); - let mut file = File::create(fname).unwrap(); - for _i in 0..digests_count { - for _j in 0..4 { - let str = format!("{} ", *pd); - file.write_all(str.as_bytes()); - pd = pd.add(1); - } - file.write_all(b"\n"); - } - pd = get_digests_ptr(); - */ - - // copy data from C - for dg in digests_buf { - let mut parts = U8U64 { f1: [0; 32] }; - // copy hash from pd to digests_buf - for i in 0..4 { - parts.f2[i] = *pd; - pd = pd.add(1); - } - let (slice, _) = parts.f1.split_at(H::HASH_SIZE); - let h: H::Hash = H::Hash::from_bytes(slice); - dg.write(h); - } - for cp in cap_buf { - let mut parts = U8U64 { f1: [0; 32] }; - // copy hash from pc to cap_buf - for i in 0..4 { - parts.f2[i] = *pc; - pc = pc.add(1); - } - let (slice, _) = parts.f1.split_at(H::HASH_SIZE); - let h: H::Hash = H::Hash::from_bytes(slice); - cp.write(h); - } - - print_time(now, "copy results"); - let now = Instant::now(); - - fill_delete(); - print_time(now, "fill delete"); + let mut gpu_id_lock = GPU_ID.lock().unwrap(); + let gpu_id = *gpu_id_lock; + *gpu_id_lock += 1; + if *gpu_id_lock >= num_gpus as u64 { + *gpu_id_lock = 0; } -} - -/* -#[allow(dead_code)] -#[cfg(feature = "cuda")] -fn fill_digests_buf_gpu_v2>( - digests_buf: &mut [MaybeUninit], - cap_buf: &mut [MaybeUninit], - leaves: &Vec, - leaf_size: usize, - cap_height: usize, -) { - let digests_count: u64 = digests_buf.len().try_into().unwrap(); - let leaves_count: u64 = (leaves.len() / leaf_size).try_into().unwrap(); - let caps_count: u64 = cap_buf.len().try_into().unwrap(); - let cap_height: u64 = cap_height.try_into().unwrap(); - let leaf_size: u64 = leaf_size.try_into().unwrap(); - - let leaves_size = leaves.len(); + Mutex::unlock(gpu_id_lock); let now = Instant::now(); - - // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors - let digests_size = if digests_buf.len() == 0 { - NUM_HASH_OUT_ELTS - } else { - digests_buf.len() * NUM_HASH_OUT_ELTS - }; - let caps_size = if cap_buf.len() == 0 { - NUM_HASH_OUT_ELTS - } else { - cap_buf.len() * NUM_HASH_OUT_ELTS - }; - - let mut lock = GPU_LOCK.lock().unwrap(); - *lock += 1; - - // println!("{} {} {} {} {:?}", leaves_count, leaf_size, digests_count, caps_count, H::HASHER_TYPE); let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(0, leaves_size).unwrap(); - let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(0, digests_size).unwrap(); - let mut gpu_caps_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(0, caps_size).unwrap(); - print_time(now, "alloc gpu ds"); - let now = Instant::now(); - - // Note: flatten() is very slow, so we use a naive nested for loop - // let leaves1 = leaves.to_vec().into_iter().flatten().collect::>(); - - // v1: use 2 for loops - better than flatten() - let mut leaves1 = Vec::with_capacity(leaves_size); - for el in leaves { - leaves1.push(el.clone()); - } - /* - // v2: use par chunks - same performance - let mut leaves1 = vec![F::ZERO; leaves.len() * leaves[0].len()]; - leaves1.par_chunks_exact_mut(leaves[0].len()).enumerate().for_each( - |(i, c)| { - c.copy_from_slice(leaves[i].as_slice()); - } - ); - */ - - let _ = gpu_leaves_buf.copy_from_host(leaves1.as_slice()); + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap(); + print_time(now, "alloc gpu leaves buffer"); - print_time(now, "data copy to gpu"); let now = Instant::now(); + let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice()); + print_time(now, "leaves copy to gpu"); - unsafe { - let num_gpus: usize = std::env::var("NUM_OF_GPUS") - .expect("NUM_OF_GPUS should be set") - .parse() - .unwrap(); - if !FORCE_SINGLE_GPU - && leaves_count >= (1 << 12) - && cap_height > 0 - && num_gpus > 1 - && H::HASHER_TYPE == HasherType::PoseidonBN128 - { - // println!("Multi GPU"); - fill_digests_buf_linear_multigpu_with_gpu_ptr( - gpu_digests_buf.as_mut_ptr() as *mut c_void, - gpu_caps_buf.as_mut_ptr() as *mut c_void, - gpu_leaves_buf.as_ptr() as *mut c_void, - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - H::HASHER_TYPE as u64, - ); - } else { - // println!("Single GPU"); - fill_digests_buf_linear_gpu_with_gpu_ptr( - gpu_digests_buf.as_mut_ptr() as *mut c_void, - gpu_caps_buf.as_mut_ptr() as *mut c_void, - gpu_leaves_buf.as_ptr() as *mut c_void, - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - H::HASHER_TYPE as u64, - ); - } - }; - print_time(now, "kernel"); let now = Instant::now(); - - if digests_buf.len() > 0 { - let mut host_digests_buf: Vec = vec![F::ZERO; digests_size]; - let _ = gpu_digests_buf.copy_to_host(host_digests_buf.as_mut_slice(), digests_size); - host_digests_buf - .chunks_exact(4) - .zip(digests_buf) - .for_each(|(x, y)| { - unsafe { - let mut parts = U8U64 { f1: [0; 32] }; - parts.f2[0] = x[0].to_canonical_u64(); - parts.f2[1] = x[1].to_canonical_u64(); - parts.f2[2] = x[2].to_canonical_u64(); - parts.f2[3] = x[3].to_canonical_u64(); - let (slice, _) = parts.f1.split_at(H::HASH_SIZE); - let h: H::Hash = H::Hash::from_bytes(slice); - y.write(h); - }; - }); - } - - if cap_buf.len() > 0 { - let mut host_caps_buf: Vec = vec![F::ZERO; caps_size]; - let _ = gpu_caps_buf.copy_to_host(host_caps_buf.as_mut_slice(), caps_size); - host_caps_buf - .chunks_exact(4) - .zip(cap_buf) - .for_each(|(x, y)| { - unsafe { - let mut parts = U8U64 { f1: [0; 32] }; - parts.f2[0] = x[0].to_canonical_u64(); - parts.f2[1] = x[1].to_canonical_u64(); - parts.f2[2] = x[2].to_canonical_u64(); - parts.f2[3] = x[3].to_canonical_u64(); - let (slice, _) = parts.f1.split_at(H::HASH_SIZE); - let h: H::Hash = H::Hash::from_bytes(slice); - y.write(h); - }; - }); - } - print_time(now, "copy results"); + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + gpu_leaves_buf.as_mut_ptr(), + leaves_count, + leaf_size, + cap_height, + gpu_id, + ); + print_time(now, "fill_digests_buf_gpu_ptr"); } -*/ #[cfg(feature = "cuda")] fn fill_digests_buf_gpu_ptr>( @@ -566,6 +317,7 @@ fn fill_digests_buf_gpu_ptr>( leaves_len: usize, leaf_len: usize, cap_height: usize, + gpu_id: u64, ) { let digests_count: u64 = digests_buf.len().try_into().unwrap(); let leaves_count: u64 = leaves_len.try_into().unwrap(); @@ -573,8 +325,6 @@ fn fill_digests_buf_gpu_ptr>( let cap_height: u64 = cap_height.try_into().unwrap(); let leaf_size: u64 = leaf_len.try_into().unwrap(); - GPU_LOCK.try_lock().expect_err("GPU_LOCK should be locked!"); - let now = Instant::now(); // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors let digests_size = if digests_buf.len() == 0 { @@ -589,9 +339,9 @@ fn fill_digests_buf_gpu_ptr>( }; let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(0 as i32, digests_size).unwrap(); + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap(); let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(0 as i32, caps_size).unwrap(); + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap(); unsafe { let num_gpus: usize = std::env::var("NUM_OF_GPUS") @@ -628,6 +378,7 @@ fn fill_digests_buf_gpu_ptr>( leaf_size, cap_height, H::HASHER_TYPE as u64, + gpu_id, ); } } @@ -670,21 +421,18 @@ fn fill_digests_buf_gpu_ptr>( } if cap_buf.len() > 0 { - host_caps - .chunks_exact(4) - .zip(cap_buf) - .for_each(|(x, y)| { - unsafe { - let mut parts = U8U64 { f1: [0; 32] }; - parts.f2[0] = x[0].to_canonical_u64(); - parts.f2[1] = x[1].to_canonical_u64(); - parts.f2[2] = x[2].to_canonical_u64(); - parts.f2[3] = x[3].to_canonical_u64(); - let (slice, _) = parts.f1.split_at(H::HASH_SIZE); - let h: H::Hash = H::Hash::from_bytes(slice); - y.write(h); - }; - }); + host_caps.chunks_exact(4).zip(cap_buf).for_each(|(x, y)| { + unsafe { + let mut parts = U8U64 { f1: [0; 32] }; + parts.f2[0] = x[0].to_canonical_u64(); + parts.f2[1] = x[1].to_canonical_u64(); + parts.f2[2] = x[2].to_canonical_u64(); + parts.f2[3] = x[3].to_canonical_u64(); + let (slice, _) = parts.f1.split_at(H::HASH_SIZE); + let h: H::Hash = H::Hash::from_bytes(slice); + y.write(h); + }; + }); } print_time(now, "copy results"); } @@ -701,7 +449,7 @@ fn fill_digests_buf_meta>( if leaf_size <= H::HASH_SIZE / 8 || H::HASHER_TYPE == HasherType::Keccak { fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); } else { - fill_digests_buf_gpu_v1::(digests_buf, cap_buf, leaves, leaf_size, cap_height); + fill_digests_buf_gpu::(digests_buf, cap_buf, leaves, leaf_size, cap_height); } } @@ -826,6 +574,7 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); let now = Instant::now(); + let gpu_id = 0; fill_digests_buf_gpu_ptr::( digests_buf, cap_buf, @@ -833,6 +582,7 @@ impl> MerkleTree { leaves_len, leaf_len, cap_height, + gpu_id, ); print_time(now, "fill digests buffer"); @@ -1028,11 +778,13 @@ impl> MerkleTree { for i in 0..positions.len() { let subtree_offset = positions[i] / subtree_digests_len; let idx_in_subtree = positions[i] % subtree_digests_len; - let digest_idx = subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1); + let digest_idx = + subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1); unsafe { let left_digest = digests_buf[digest_idx].assume_init(); let right_digest = digests_buf[digest_idx + 1].assume_init(); - digests_buf[positions[i]].write(H::two_to_one(left_digest, right_digest)); + digests_buf[positions[i]] + .write(H::two_to_one(left_digest, right_digest)); } } } @@ -1387,9 +1139,15 @@ mod tests { #[test] fn test_change_leaf_and_update_range() -> Result<()> { for h in 0..11 { - println!("Run verify_change_leaf_and_update_range_one_by_one() for height {:?}", h); + println!( + "Run verify_change_leaf_and_update_range_one_by_one() for height {:?}", + h + ); verify_change_leaf_and_update_range_one_by_one(1024, 68, h, 32, 48); - println!("Run verify_change_leaf_and_update_range() for height {:?}", h); + println!( + "Run verify_change_leaf_and_update_range() for height {:?}", + h + ); verify_change_leaf_and_update_range(1024, 68, h, 32, 48); } diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index 3bc266a9f5..f47db8fe26 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -3,6 +3,7 @@ #![deny(rustdoc::broken_intra_doc_links)] #![deny(missing_debug_implementations)] #![cfg_attr(not(feature = "std"), no_std)] +#![feature(mutex_unlock)] // #[cfg(not(feature = "std"))] pub extern crate alloc;