diff --git a/Cargo.toml b/Cargo.toml index a11b080104..cbff147249 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["field", "maybe_rayon", "plonky2", "starky", "util", "gen", "u32", "e resolver = "2" [workspace.dependencies] -cryptography_cuda = { git = "ssh://git@github.com/okx/cryptography_cuda.git", rev = "547192b2ef42dc7519435059c86f88431b8de999" } +zeknox = { git = "ssh://git@github.com/okx/zeknox.git", rev = "04c1ac8c9ed44c0c1d885f8029045dd9969b2f55" } ahash = { version = "0.8.7", default-features = false, features = [ "compile-time-rng", ] } # NOTE: Be sure to keep this version the same as the dependency in `hashbrown`. diff --git a/ecdsa/Cargo.toml b/ecdsa/Cargo.toml index 5bca4a8454..9da2cee6d1 100644 --- a/ecdsa/Cargo.toml +++ b/ecdsa/Cargo.toml @@ -11,8 +11,8 @@ categories.workspace = true [features] parallel = ["plonky2_maybe_rayon/parallel", "plonky2/parallel"] -cuda = ["cryptography_cuda/cuda", "plonky2/cuda"] -no_cuda = ["cryptography_cuda/no_cuda", "plonky2/no_cuda"] +cuda = ["zeknox/cuda", "plonky2/cuda"] +no_cuda = ["zeknox/no_cuda", "plonky2/no_cuda"] [dependencies] anyhow = { version = "1.0.40" } @@ -22,7 +22,7 @@ num = { version = "0.4.0" } plonky2 = { path = "../plonky2" } plonky2_u32 = { path = "../u32" } serde = { version = "1.0", features = ["derive"] } -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } [dev-dependencies] rand = { version = "0.8.4", features = ["getrandom"] } diff --git a/ecgfp5/Cargo.toml b/ecgfp5/Cargo.toml index 8a0859e2c7..8e85b6d380 100644 --- a/ecgfp5/Cargo.toml +++ b/ecgfp5/Cargo.toml @@ -26,7 +26,7 @@ itertools = "0.10" serde = "1" rand = { version = "0.8.5", default-features = false, features = ["getrandom"] } hex = "0.4.3" -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } [dev-dependencies] rand = { version = "0.8.5", features = ["min_const_gen"] } @@ -48,5 +48,5 @@ name = "schnorr" 
harness = false [features] -cuda = ["cryptography_cuda/cuda", "plonky2/cuda"] -no_cuda = ["cryptography_cuda/no_cuda", "plonky2/no_cuda"] +cuda = ["zeknox/cuda", "plonky2/cuda"] +no_cuda = ["zeknox/no_cuda", "plonky2/no_cuda"] diff --git a/field/Cargo.toml b/field/Cargo.toml index 8c155bd8c6..056a62d679 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -23,7 +23,7 @@ rand = { workspace = true, features = ["getrandom"] } serde = { workspace = true, features = ["alloc"] } static_assertions = { workspace = true } unroll = { workspace = true } -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } [dev-dependencies] rand = { version = "0.8.5", default-features = false, features = ["getrandom"] } @@ -37,9 +37,9 @@ quote = "1" [features] default = [] -cuda = ["cryptography_cuda/cuda"] +cuda = ["zeknox/cuda"] precompile = [] -no_cuda = ["cryptography_cuda/no_cuda"] +no_cuda = ["zeknox/no_cuda"] # Display math equations properly in documentation [package.metadata.docs.rs] diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index c13bbca272..fe8c508487 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -152,6 +152,7 @@ impl PolynomialCoeffs { .collect() } + /// WIP: a tree-based Estrin's scheme could be tried here; it may speed up evaluation.
pub fn eval(&self, x: F) -> F { self.coeffs .iter() diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 7de17ec942..191a1fe70c 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -17,8 +17,8 @@ gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] timing = ["std", "dep:web-time"] -cuda = ["cryptography_cuda/cuda"] -no_cuda = ["cryptography_cuda/no_cuda"] +cuda = ["zeknox/cuda"] +no_cuda = ["zeknox/no_cuda"] batch = [] cuda_timing = [] papi = [] @@ -44,7 +44,7 @@ papi-bindings = { version = "0.5.2" } plonky2_field = { version = "0.2.2", path = "../field", default-features = false } plonky2_maybe_rayon = { version = "0.2.0", path = "../maybe_rayon", default-features = false } plonky2_util = { version = "0.2.0", path = "../util", default-features = false } -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } dyn-clone = "1.0.17" [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] diff --git a/plonky2/benches/lde.rs b/plonky2/benches/lde.rs index 465c60846c..82938b61fb 100644 --- a/plonky2/benches/lde.rs +++ b/plonky2/benches/lde.rs @@ -2,7 +2,7 @@ mod allocator; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; #[cfg(feature = "cuda")] -use cryptography_cuda::init_cuda_degree_rs; +use zeknox::init_cuda_degree_rs; use plonky2::field::extension::Extendable; use plonky2::field::goldilocks_field::GoldilocksField; use plonky2::field::polynomial::PolynomialCoeffs; diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index ca945da163..f792919132 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -2,9 +2,9 @@ use alloc::{format, vec::Vec}; #[cfg(feature = "cuda")] -use cryptography_cuda::{ - device::memory::HostOrDeviceSlice, lde_batch, lde_batch_multi_gpu, transpose_rev_batch, - types::*, +use zeknox::{ + device::memory::HostOrDeviceSlice, 
intt_batch, lde_batch, lde_batch_multi_gpu, + transpose_rev_batch, types::*, }; use itertools::Itertools; use plonky2_field::types::Field; @@ -33,7 +33,7 @@ pub static GPU_INIT: once_cell::sync::Lazy> #[cfg(all(feature = "cuda", any(test, doctest)))] fn init_gpu() { - use cryptography_cuda::init_cuda_rs; + use zeknox::init_cuda_rs; let mut init = GPU_INIT.lock().unwrap(); if *init == 0 { @@ -74,6 +74,7 @@ impl, C: GenericConfig, const D: usize> D impl, C: GenericConfig, const D: usize> PolynomialBatch { + // #[cfg(not(feature = "cuda"))] /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. pub fn from_values( values: Vec>, @@ -82,6 +83,25 @@ impl, C: GenericConfig, const D: usize> cap_height: usize, timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, + ) -> Self { + Self::from_values_cpu( + values, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } + + /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. 
+ pub fn from_values_cpu( + values: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, ) -> Self { // #[cfg(any(not(feature = "cuda"), not(feature = "batch")))] let coeffs = timed!( @@ -90,53 +110,7 @@ impl, C: GenericConfig, const D: usize> values.into_par_iter().map(|v| v.ifft()).collect::>() ); - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let degree = values[0].len(); - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let log_n = log2_strict(degree); - - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let num_gpus: usize = std::env::var("NUM_OF_GPUS") - // .expect("NUM_OF_GPUS should be set") - // .parse() - // .unwrap(); - // // let num_gpus = 1; - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let total_num_of_fft = values.len(); - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let per_device_batch = total_num_of_fft.div_ceil(num_gpus); - - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let chunk_size = total_num_of_fft.div_ceil(num_gpus); - // #[cfg(all(feature = "cuda", feature = "batch"))] - // println!( - // "invoking intt_batch, total_nums: {:?}, log_n: {:?}, num_gpus: {:?}", - // total_num_of_fft, log_n, num_gpus - // ); - - // #[cfg(all(feature = "cuda", feature = "batch"))] - // let coeffs = timed!( - // timing, - // "IFFT", - // values - // .par_chunks(chunk_size) - // .enumerate() - // .flat_map(|(id, poly_chunk)| { - // let mut polys_values: Vec = - // poly_chunk.iter().flat_map(|p| p.values.clone()).collect(); - // let mut ntt_cfg = NTTConfig::default(); - // ntt_cfg.batches = per_device_batch as u32; - - // intt_batch(id, polys_values.as_mut_ptr(), log_n, ntt_cfg); - // polys_values - // .chunks(1 << log_n) - // .map(|buffer| PolynomialCoeffs::new(buffer.to_vec())) - // .collect::>>() - // }) - // .collect() - // ); - - Self::from_coeffs( + Self::from_coeffs_cpu( coeffs, rate_bits, blinding, @@ -146,6 
+120,187 @@ impl, C: GenericConfig, const D: usize> ) } + // #[cfg(feature = "cuda")] + // pub fn from_values( + // values: Vec>, + // rate_bits: usize, + // blinding: bool, + // cap_height: usize, + // timing: &mut TimingTree, + // fft_root_table: Option<&FftRootTable>, + // ) -> Self { + // let degree = values[0].len(); + // let log_n = log2_strict(degree); + + // if log_n > 1 && log_n + rate_bits > 1 && values.len() > 0 { + // #[cfg(any(test, doctest))] + // init_gpu(); + + // let _num_gpus: usize = std::env::var("NUM_OF_GPUS") + // .expect("NUM_OF_GPUS should be set") + // .parse() + // .unwrap(); + + // Self::from_values_gpu( + // values.as_slice(), + // rate_bits, + // blinding, + // cap_height, + // timing, + // fft_root_table, + // log_n, + // degree, + // ) + // } else { + // Self::from_values_cpu( + // values, + // rate_bits, + // blinding, + // cap_height, + // timing, + // fft_root_table, + // ) + // } + // } + + // #[cfg(feature = "cuda")] + // pub fn from_values_gpu( + // values: &[PolynomialValues], + // rate_bits: usize, + // blinding: bool, + // cap_height: usize, + // timing: &mut TimingTree, + // _fft_root_table: Option<&FftRootTable>, + // log_n: usize, + // degree: usize, + // ) -> Self { + // let output_domain_size = log_n + rate_bits; + + // let num_gpus: usize = std::env::var("NUM_OF_GPUS") + // .expect("NUM_OF_GPUS should be set") + // .parse() + // .unwrap(); + + // let total_num_of_fft = values.len(); + // println!("total_num_of_fft: {:?}", total_num_of_fft); + // // println!("fft_size: {:?}", log_n); + + // let total_num_input_elements = total_num_of_fft * (1 << log_n); + // let total_num_output_elements = total_num_of_fft * (1 << output_domain_size); + + // let mut gpu_input: Vec = values + // .into_iter() + // .flat_map(|v| v.values.iter().cloned()) + // .collect(); + + // let mut device_data: HostOrDeviceSlice<'_, F> = + // HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_input_elements).unwrap(); + + // let _ret = 
device_data.copy_from_host(&gpu_input); + + // let mut cfg_ntt = NTTConfig::default(); + // cfg_ntt.are_inputs_on_device = true; + // cfg_ntt.are_outputs_on_device = true; + // cfg_ntt.batches = total_num_of_fft as u32; + + // intt_batch(0, device_data.as_mut_ptr(), log_n, cfg_ntt.clone()); + + // let mut cfg_lde = NTTConfig::default(); + // cfg_lde.batches = total_num_of_fft as u32; + // cfg_lde.extension_rate_bits = rate_bits as u32; + // cfg_lde.are_inputs_on_device = true; + // cfg_lde.are_outputs_on_device = true; + // cfg_lde.with_coset = true; + // cfg_lde.is_multi_gpu = true; + + // let mut device_output_data: HostOrDeviceSlice<'_, F> = + // HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap(); + + // if num_gpus == 1 { + // let _ = timed!( + // timing, + // "LDE on 1 GPU", + // lde_batch( + // 0, + // device_output_data.as_mut_ptr(), + // device_data.as_mut_ptr(), + // log_n, + // cfg_lde.clone() + // ) + // ); + // } else { + // let _ = timed!( + // timing, + // "LDE on multi GPU", + // lde_batch_multi_gpu::( + // device_output_data.as_mut_ptr(), + // device_data.as_mut_ptr(), + // num_gpus, + // cfg_lde.clone(), + // log_n, + // ) + // ); + // } + + // let mut cfg_trans = TransposeConfig::default(); + // cfg_trans.batches = total_num_of_fft as u32; + // cfg_trans.are_inputs_on_device = true; + // cfg_trans.are_outputs_on_device = true; + + // let mut device_transpose_data: HostOrDeviceSlice<'_, F> = + // HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap(); + + // let _ = timed!( + // timing, + // "transpose", + // transpose_rev_batch( + // 0 as i32, + // device_transpose_data.as_mut_ptr(), + // device_output_data.as_mut_ptr(), + // output_domain_size, + // cfg_trans + // ) + // ); + + // let mt = timed!( + // timing, + // "Merkle tree with GPU data", + // MerkleTree::new_from_gpu_leaves( + // &device_transpose_data, + // 1 << output_domain_size, + // total_num_of_fft, + // cap_height + // ) + // ); + + // 
let mut coeffs_1d = vec![F::ZERO; total_num_input_elements]; + // device_data + // .copy_to_host(coeffs_1d.as_mut_slice(), total_num_input_elements) + // .unwrap(); + + // let chunk_size = 1 << log_n; + // let coeffs_batch: Vec> = coeffs_1d + // .chunks(chunk_size) + // .map(|chunk| PolynomialCoeffs { + // coeffs: chunk.to_vec(), + // }) + // .collect(); + + // drop(device_transpose_data); + // drop(device_output_data); + // drop(device_data); + + // assert_eq!(coeffs_batch.len(), values.len()); + + // Self { + // polynomials: coeffs_batch, + // merkle_tree: mt, + // degree_log: log2_strict(degree), + // rate_bits, + // blinding, + // } + // } + /// Creates a list polynomial commitment for the polynomials `polynomials`. pub fn from_coeffs_cpu( polynomials: Vec>, @@ -208,17 +363,13 @@ impl, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { + let pols = polynomials.len(); let degree = polynomials[0].len(); let log_n = log2_strict(degree); - #[cfg(any(test, doctest))] - init_gpu(); + if log_n + rate_bits > 1 && polynomials.len() > 0 { - if log_n + rate_bits > 1 - && polynomials.len() > 0 - && pols * (1 << (log_n + rate_bits)) < (1 << 31) - { let _num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") .parse() @@ -290,12 +441,17 @@ impl, C: GenericConfig, const D: usize> let mut cfg_lde = NTTConfig::default(); cfg_lde.batches = total_num_of_fft as u32; cfg_lde.extension_rate_bits = rate_bits as u32; - cfg_lde.are_inputs_on_device = false; + cfg_lde.are_inputs_on_device = true; cfg_lde.are_outputs_on_device = true; cfg_lde.with_coset = true; cfg_lde.is_multi_gpu = true; cfg_lde.salt_size = salt_size as u32; + let mut device_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_input_elements).unwrap(); + + let _ret = device_data.copy_from_host(&gpu_input); + let mut device_output_data: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(0 as 
i32, total_num_output_elements).unwrap(); if num_gpus == 1 { @@ -305,7 +461,7 @@ impl, C: GenericConfig, const D: usize> lde_batch( 0, device_output_data.as_mut_ptr(), - gpu_input.as_mut_ptr(), + device_data.as_mut_ptr(), log_n, cfg_lde.clone() ) @@ -316,12 +472,10 @@ impl, C: GenericConfig, const D: usize> "LDE on multi GPU", lde_batch_multi_gpu::( device_output_data.as_mut_ptr(), - gpu_input.as_mut_ptr(), + device_data.as_mut_ptr(), num_gpus, cfg_lde.clone(), log_n, - total_num_input_elements, - total_num_output_elements, ) ); } @@ -373,118 +527,10 @@ impl, C: GenericConfig, const D: usize> init_gpu(); let degree = polynomials[0].len(); - #[cfg(all(feature = "cuda", feature = "batch"))] - let log_n = log2_strict(degree) + rate_bits; - // If blinding, salt with two random elements to each leaf vector. let salt_size = if blinding { SALT_SIZE } else { 0 }; // println!("salt_size: {:?}", salt_size); - #[cfg(all(feature = "cuda", feature = "batch"))] - let num_gpus: usize = std::env::var("NUM_OF_GPUS") - .expect("NUM_OF_GPUS should be set") - .parse() - .unwrap(); - // let num_gpus: usize = 1; - #[cfg(all(feature = "cuda", feature = "batch"))] - println!("get num of gpus: {:?}", num_gpus); - #[cfg(all(feature = "cuda", feature = "batch"))] - let total_num_of_fft = polynomials.len(); - // println!("total_num_of_fft: {:?}", total_num_of_fft); - #[cfg(all(feature = "cuda", feature = "batch"))] - let per_device_batch = total_num_of_fft.div_ceil(num_gpus); - - #[cfg(all(feature = "cuda", feature = "batch"))] - let chunk_size = total_num_of_fft.div_ceil(num_gpus); - - #[cfg(all(feature = "cuda", feature = "batch"))] - if log_n > 10 && polynomials.len() > 0 { - println!("log_n: {:?}", log_n); - let start_lde = std::time::Instant::now(); - - // let poly_chunk = polynomials; - // let id = 0; - let ret = polynomials - .par_chunks(chunk_size) - .enumerate() - .flat_map(|(id, poly_chunk)| { - println!( - "invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}", - id, 
per_device_batch - ); - - let start = std::time::Instant::now(); - - let input_domain_size = 1 << log2_strict(degree); - let device_input_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc( - id as i32, - input_domain_size * polynomials.len(), - ) - .unwrap(); - let device_input_data = std::sync::RwLock::new(device_input_data); - - poly_chunk.par_iter().enumerate().for_each(|(i, p)| { - // println!("copy for index: {:?}", i); - let _guard = device_input_data.read().unwrap(); - let _ = _guard.copy_from_host_offset( - p.coeffs.as_slice(), - input_domain_size * i, - input_domain_size, - ); - }); - - println!("data transform elapsed: {:?}", start.elapsed()); - let mut cfg_lde = NTTConfig::default(); - cfg_lde.batches = per_device_batch as u32; - cfg_lde.extension_rate_bits = rate_bits as u32; - cfg_lde.are_inputs_on_device = true; - cfg_lde.are_outputs_on_device = true; - cfg_lde.with_coset = true; - println!( - "start cuda_malloc with elements: {:?}", - (1 << log_n) * per_device_batch - ); - let mut device_output_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch) - .unwrap(); - - let start = std::time::Instant::now(); - lde_batch::( - id, - device_output_data.as_mut_ptr(), - device_input_data.read().unwrap().as_ptr(), - log2_strict(degree), - cfg_lde, - ); - println!("real lde_batch elapsed: {:?}", start.elapsed()); - let start = std::time::Instant::now(); - let nums: Vec = (0..poly_chunk.len()).collect(); - let r = nums - .par_iter() - .map(|i| { - let mut host_data: Vec = vec![F::ZERO; 1 << log_n]; - let _ = device_output_data.copy_to_host_offset( - host_data.as_mut_slice(), - (1 << log_n) * i, - 1 << log_n, - ); - PolynomialValues::new(host_data).values - }) - .collect::>>(); - println!("collect data from gpu used: {:?}", start.elapsed()); - r - }) - .chain( - (0..salt_size) - .into_par_iter() - .map(|_| F::rand_vec(degree << rate_bits)), - ) - .collect(); - println!("real lde elapsed: {:?}", 
start_lde.elapsed()); - return ret; - } - let ret = polynomials .par_iter() .map(|p| { diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index d1ca38fb01..ebe0203b9c 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -10,11 +10,11 @@ use std::sync::Mutex; use std::time::Instant; #[cfg(feature = "cuda")] -use cryptography_cuda::device::memory::HostOrDeviceSlice; +use zeknox::device::memory::HostOrDeviceSlice; #[cfg(feature = "cuda")] -use cryptography_cuda::device::stream::CudaStream; +use zeknox::device::stream::CudaStream; #[cfg(feature = "cuda")] -use cryptography_cuda::{ +use zeknox::{ fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, }; use num::range; @@ -552,7 +552,7 @@ fn fill_digests_buf_cpu>( leaf_size: usize, cap_height: usize, ) { - use cryptography_cuda::fill_digests_buf_linear_cpu; + use zeknox::fill_digests_buf_linear_cpu; let leaves_count = (leaves.len() / leaf_size) as u64; let digests_count: u64 = digests_buf.len().try_into().unwrap(); diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index b9e6d3c3e1..dbf78d4a56 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -242,6 +242,8 @@ where ) ); + println!("NUM_PPZs: {:?}", partial_products_zs_and_lookup_commitment.polynomials.len()); + challenger.observe_cap::(&partial_products_zs_and_lookup_commitment.merkle_tree.cap); let alphas = challenger.get_n_challenges(num_challenges); diff --git a/plonky2/src/util/reducing.rs b/plonky2/src/util/reducing.rs index b99da32e6a..741a475e44 100644 --- a/plonky2/src/util/reducing.rs +++ b/plonky2/src/util/reducing.rs @@ -340,6 +340,7 @@ mod tests { builder.connect_extension(manual_reduce, circuit_reduce); let data = builder.build::(); + println!("STARTINGPROOF"); let proof = data.prove(pw)?; verify(proof, &data.verifier_only, &data.common) diff --git a/plonky2/src/util/test_utils.rs b/plonky2/src/util/test_utils.rs 
index c4d52c4c04..0dfcaaff57 100644 --- a/plonky2/src/util/test_utils.rs +++ b/plonky2/src/util/test_utils.rs @@ -1,6 +1,6 @@ #[cfg(feature = "cuda")] pub fn init_cuda() { - use cryptography_cuda::{get_number_of_gpus_rs, init_coset_rs, init_twiddle_factors_rs}; + use zeknox::{get_number_of_gpus_rs, init_coset_rs, init_twiddle_factors_rs}; use plonky2_field::goldilocks_field::GoldilocksField; use plonky2_field::types::{Field, PrimeField64}; diff --git a/starky/Cargo.toml b/starky/Cargo.toml index 9b64b9f7cf..8cef2dffd7 100644 --- a/starky/Cargo.toml +++ b/starky/Cargo.toml @@ -16,8 +16,8 @@ default = ["parallel", "std", "timing"] parallel = ["plonky2/parallel", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "plonky2/std"] timing = ["plonky2/timing"] -cuda = ["cryptography_cuda/cuda", "plonky2/cuda"] -no_cuda = ["cryptography_cuda/no_cuda", "plonky2/no_cuda"] +cuda = ["zeknox/cuda", "plonky2/cuda"] +no_cuda = ["zeknox/no_cuda", "plonky2/no_cuda"] [dependencies] ahash = { workspace = true } @@ -26,7 +26,7 @@ hashbrown = { workspace = true } itertools = { workspace = true } log = { workspace = true } num-bigint = { version = "0.4.3", default-features = false } -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } # Local dependencies diff --git a/u32/Cargo.toml b/u32/Cargo.toml index 5b47ef4255..b9c1b207a9 100644 --- a/u32/Cargo.toml +++ b/u32/Cargo.toml @@ -14,11 +14,11 @@ anyhow = { version = "1.0.40", default-features = false } itertools = { version = "0.10.0", default-features = false } num = { version = "0.4", default-features = false } plonky2 = { path = "../plonky2" } -cryptography_cuda = { workspace = true, optional = true } +zeknox = { workspace = true, optional = true } [dev-dependencies] rand = { version = "0.8.4", default-features = false, features = ["getrandom"] } [features] -cuda = ["cryptography_cuda/cuda", "plonky2/cuda"] -no_cuda = ["cryptography_cuda/no_cuda", "plonky2/no_cuda"] \ No newline 
at end of file +cuda = ["zeknox/cuda", "plonky2/cuda"] +no_cuda = ["zeknox/no_cuda", "plonky2/no_cuda"] \ No newline at end of file