From 4c58a5166fbd1da426b7e66d3bc1438e2fb8c322 Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Mon, 11 Mar 2024 18:15:19 +0800
Subject: [PATCH] cleanup to refer to local job primitives

---
 curve25519-dalek/src/montgomery.rs | 718 +++++++++++++----------------
 1 file changed, 331 insertions(+), 387 deletions(-)

diff --git a/curve25519-dalek/src/montgomery.rs b/curve25519-dalek/src/montgomery.rs
index d047f43d..7f901d1e 100644
--- a/curve25519-dalek/src/montgomery.rs
+++ b/curve25519-dalek/src/montgomery.rs
@@ -329,7 +329,7 @@ impl ProjectivePoint {
     pub fn as_affine(&self) -> MontgomeryPoint {
         //TODO: consider making this a seperate feature. Something like "panic_on_sw_eval" which would
         //be ameniable to upstreaming
-        #[cfg(all(not(test),curve25519_dalek_backend = "u32e_backend"))] // due to issue https://github.com/rust-lang/rust/issues/59168, you will have to manually comment this out when running a test on the full system and not just this crate.
+        #[cfg(all(not(test), curve25519_dalek_backend = "u32e_backend"))] // due to issue https://github.com/rust-lang/rust/issues/59168, you will have to manually comment this out when running a test on the full system and not just this crate.
         log::warn!("sw as_affine being used - check for build config errors!");
         let u = &self.U * &self.W.invert();
         MontgomeryPoint(u.as_bytes())
@@ -463,25 +463,23 @@ impl ProjectivePoint {
                 mul %31, %29, %21
                 fin  // finish execution
         );
-        let mut engine = engine_25519::Engine25519::new();
-        let mut job = engine_25519::Job {
-            id: None,
-            ucode: [0; 1024],
-            uc_len: mcode.len() as u32,
-            uc_start: 0,
-            window: Some(0),
-            rf: [0; engine_25519::RF_SIZE_IN_U32],
+
+        use crate::backend::serial::u32e::*;
+        ensure_engine();
+        let mut ucode_hw: &'static mut [u32] = unsafe {
+            core::slice::from_raw_parts_mut(ENGINE_MEM.unwrap().as_mut_ptr() as *mut u32, 1024)
+        };
+        let rf_hw: &mut [u32] = unsafe {
+            core::slice::from_raw_parts_mut(
+                (ENGINE_MEM.unwrap().as_mut_ptr() as usize + RF_U8_BASE) as *mut u32,
+                TOTAL_RF_SIZE_IN_U32,
+            )
         };
-        for (&src, dst) in mcode.iter().zip(job.ucode.iter_mut()) {
-            *dst = src as u32;
-        }
-        copy_to_rf(self.U.as_bytes(), 29, &mut job.rf);
-        copy_to_rf(self.W.as_bytes(), 30, &mut job.rf);
 
-        // start the run
-        let result_rf = engine.spawn_job(job).expect("couldn't run engine job");
+        copy_to_rf(self.U.as_bytes(), 29, rf_hw, 0);
+        copy_to_rf(self.W.as_bytes(), 30, rf_hw, 0);
 
-        MontgomeryPoint(copy_from_rf(31, &result_rf))
+        MontgomeryPoint(run_job(&mut ucode_hw, &rf_hw, &mcode, 0))
     }
 }
 
@@ -541,27 +539,6 @@ pub(crate) fn differential_add_and_double(
     Q.W = t17;  // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2
 }
 
-#[cfg(curve25519_dalek_backend = "u32e_backend")]
-fn copy_to_rf(bytes: [u8; 32], register: usize, rf: &mut [u32; engine_25519::RF_SIZE_IN_U32]) {
-    use core::convert::TryInto;
-    for (byte, rf_dst) in bytes.chunks_exact(4).zip(rf[register * 8..(register+1)*8].iter_mut()) {
-        *rf_dst = u32::from_le_bytes(byte.try_into().expect("chunks_exact failed us"));
-    }
-}
-
-#[cfg(curve25519_dalek_backend = "u32e_backend")]
-fn copy_from_rf(register: usize, rf: &[u32; engine_25519::RF_SIZE_IN_U32]) -> [u8; 32] {
-    let mut ret: [u8; 32] = [0; 32];
-
-    for (src, dst) in rf[register*8 .. (register+1)*8].iter().zip(ret.chunks_exact_mut(4).into_iter()) {
-        for (&src_byte, dst_byte) in src.to_le_bytes().iter().zip(dst.iter_mut()) {
-            *dst_byte = src_byte;
-        }
-    }
-
-    ret
-}
-
 #[allow(dead_code)] // absorbed into mul, but might be useful later on as a subroutine for something else
 #[cfg(curve25519_dalek_backend = "u32e_backend")]
 pub(crate) fn differential_add_and_double(
@@ -650,39 +627,36 @@ pub(crate) fn differential_add_and_double(
 
             fin  // finish execution
     );
-    let mut engine = engine_25519::Engine25519::new();
-
-    let mut job = engine_25519::Job {
-        id: None,
-        ucode: [0; 1024],
-        uc_len: mcode.len() as u32,
-        uc_start: 0,
-        window: Some(0),
-        rf: [0; engine_25519::RF_SIZE_IN_U32],
+    use crate::backend::serial::u32e::*;
+    ensure_engine();
+    let mut ucode_hw: &'static mut [u32] = unsafe {
+        core::slice::from_raw_parts_mut(ENGINE_MEM.unwrap().as_mut_ptr() as *mut u32, 1024)
+    };
+    let rf_hw: &mut [u32] = unsafe {
+        core::slice::from_raw_parts_mut(
+            (ENGINE_MEM.unwrap().as_mut_ptr() as usize + RF_U8_BASE) as *mut u32,
+            TOTAL_RF_SIZE_IN_U32,
+        )
     };
-
-    for (&src, dst) in mcode.iter().zip(job.ucode.iter_mut()) {
-        *dst = src;
-    }
 
     // P.U in %20
     // P.W in %21
     // Q.U in %22
     // Q.W in %23
     // affine_PmQ in %24
-    copy_to_rf(P.U.as_bytes(), 20, &mut job.rf);
-    copy_to_rf(P.W.as_bytes(), 21, &mut job.rf);
-    copy_to_rf(Q.U.as_bytes(), 22, &mut job.rf);
-    copy_to_rf(Q.W.as_bytes(), 23, &mut job.rf);
-    copy_to_rf(affine_PmQ.as_bytes(), 24, &mut job.rf);
+    copy_to_rf(P.U.as_bytes(), 20, rf_hw, 0);
+    copy_to_rf(P.W.as_bytes(), 21, rf_hw, 0);
+    copy_to_rf(Q.U.as_bytes(), 22, rf_hw, 0);
+    copy_to_rf(Q.W.as_bytes(), 23, rf_hw, 0);
+    copy_to_rf(affine_PmQ.as_bytes(), 24, rf_hw, 0);
 
     // start the run
-    let result_rf = engine.spawn_job(job).expect("couldn't run engine job");
+    run_job(&mut ucode_hw, &rf_hw, &mcode, 0);
 
-    P.U = FieldElement::from_bytes(&copy_from_rf(20, &result_rf));
-    P.W = FieldElement::from_bytes(&copy_from_rf(21, &result_rf));
-    Q.U = FieldElement::from_bytes(&copy_from_rf(22, &result_rf));
-    Q.W = FieldElement::from_bytes(&copy_from_rf(23, &result_rf));
+    P.U = FieldElement::from_bytes(&copy_from_rf(20, &rf_hw, 0));
+    P.W = FieldElement::from_bytes(&copy_from_rf(21, &rf_hw, 0));
+    Q.U = FieldElement::from_bytes(&copy_from_rf(22, &rf_hw, 0));
+    Q.W = FieldElement::from_bytes(&copy_from_rf(23, &rf_hw, 0));
 }
 
 define_mul_assign_variants!(LHS = MontgomeryPoint, RHS = Scalar);
@@ -705,10 +679,12 @@ impl Mul<&Scalar> for &MontgomeryPoint {
     /// Given `self` \\( = u\_0(P) \\), and a `Scalar` \\(n\\), return \\( u\_0([n]P) \\).
     #[cfg(curve25519_dalek_backend = "u32e_backend")]
     fn mul(self, scalar: &Scalar) -> MontgomeryPoint {
+        use crate::backend::serial::u32e::*;
+
         log::debug!("hw mont");
         // Algorithm 8 of Costello-Smith 2017
         let affine_u = FieldElement::from_bytes(&self.0);
-        let mut x0 = ProjectivePoint::identity();
+        let x0 = ProjectivePoint::identity();
         let x1 = ProjectivePoint {
             U: affine_u,
             W: FieldElement::ONE,
@@ -716,339 +692,307 @@ impl Mul<&Scalar> for &MontgomeryPoint {
 
         // for now, prefer to use the fully-accelerated version where this code is local to the server
         // instead of transmitting it every call with the data...gives about a 2x performance speedup
-        if false {
-            #[cfg(not(test))] // due to issue https://github.com/rust-lang/rust/issues/59168, you will have to manually comment this out when running a test on the full system and not just this crate.
-            log::warn!("wrong multiply being used!");
-
-            let mcode = assemble_engine25519!(
-                start:
-                    // P.U in %20
-                    // P.W in %21
-                    // Q.U in %22
-                    // Q.W in %23
-                    // affine_PmQ in %24
-                    // %30 is the TRD scratch register and cswap dummy
-                    // %29 is the subtraction temporary value register and k_t
-                    // x0.U in %25
-                    // x0.W in %26
-                    // x1.U in %27
-                    // x1.W in %28
-                    // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once)
-                    // %31 is the scalar
-                    // %18 is the swap variable
-                    psa %18, #0
-
-                    // for i in (0..255).rev()
-                mainloop:
-                    // let choice: u8 = (bits[i + 1] ^ bits[i]) as u8;
-                    // ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into());
-                    xbt %29, %31        // orignally[k_t = (k>>t) & 1] now[k_t = k[254]]
-                    shl %31, %31        // k = k<<1
-                    xor %18, %18, %29   // swap ^= k_t
-
-                    // cswap x0.U (%25), x1.U (%27)
-                    xor %30, %25, %27
-                    msk %30, %18, %30
-                    xor %25, %30, %25
-                    xor %27, %30, %27
-                    // cswap x0.W (%26), x1.W (%28)
-                    xor %30, %26, %28
-                    msk %30, %18, %30
-                    xor %26, %30, %26
-                    xor %28, %30, %28
-
-                    psa %18, %29  // swap = k_t
-
-                        // differential_add_and_double(&mut x0, &mut x1, &affine_u);
-                        psa %20, %25
-                        psa %21, %26
-                        psa %22, %27
-                        psa %23, %28
-                        // affine_u is already in %24
-
-                        // let t0 = &P.U + &P.W;
-                        add %0, %20, %21
-                        trd %30, %0
-                        sub %0, %0, %30
-                        // let t1 = &P.U - &P.W;
-                        sub %21, #3, %21    // negate &P.W using #FIELDPRIME (#3)
-                        add %1, %20, %21
-                        trd %30, %1
-                        sub %1, %1, %30
-                        // let t2 = &Q.U + &Q.W;
-                        add %2, %22, %23
-                        trd %30, %2
-                        sub %2, %2, %30
-                        // let t3 = &Q.U - &Q.W;
-                        sub %23, #3, %23
-                        add %3, %22, %23
-                        trd %30, %3
-                        sub %3, %3, %30
-                        // let t4 = t0.square();   // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2
-                        mul %4, %0, %0
-                        // let t5 = t1.square();   // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2
-                        mul %5, %1, %1
-                        // let t6 = &t4 - &t5;     // 4 U_P W_P
-                        sub %29, #3, %5
-                        add %6, %4, %29
-                        trd %30, %6
-                        sub %6, %6, %30
-                        // let t7 = &t0 * &t3;     // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q
-                        mul %7, %0, %3
-                        // let t8 = &t1 * &t2;     // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q
-                        mul %8, %1, %2
-                        // let t9  = &t7 + &t8;    // 2 (U_P U_Q - W_P W_Q)
-                        add %9, %7, %8
-                        trd %30, %9
-                        sub %9, %9, %30
-                        // let t10 = &t7 - &t8;    // 2 (W_P U_Q - U_P W_Q)
-                        sub %29, #3, %8
-                        add %10, %7, %29
-                        trd %30, %10
-                        sub %10, %10, %30
-                        // let t11 =  t9.square(); // 4 (U_P U_Q - W_P W_Q)^2
-                        mul %11, %9, %9
-                        // let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2
-                        mul %12, %10, %10
-                        // let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q
-                        mul %13, #4, %6   // #4 is A+2/4
-                        // let t14 = &t4 * &t5;    // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2
-                        mul %14, %4, %5
-                        // let t15 = &t13 + &t5;   // (U_P - W_P)^2 + (A + 2) U_P W_P
-                        add %15, %13, %5
-                        trd %30, %15
-                        sub %15, %15, %30
-                        // let t16 = &t6 * &t15;   // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P)
-                        mul %16, %6, %15
-                        // let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2
-                        mul %17, %24, %12    // affine_PmQ loaded into %24
-
-                        ///// these can be eliminated down the road, but included for 1:1 algorithm correspodence to reference in early testing
-                        // P.U = t14;  // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2
-                        psa %20, %14
-                        // P.W = t16;  // W_{P'} = (4 U_P W_P) ((U_P - W_P)^2 + ((A + 2)/4) 4 U_P W_P)
-                        psa %21, %16
-                        // let t18 = t11;               // W_D * 4 (U_P U_Q - W_P W_Q)^2
-                        // Q.U = t18;  // U_{Q'} = W_D * 4 (U_P U_Q - W_P W_Q)^2
-                        psa %22, %11   // collapsed two to save a register
-                        // Q.W = t17;  // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2
-                        psa %23, %17
-
-                        ///// 'return' arguments for next iteration, can be optimized out later
-                        psa %25, %20
-                        psa %26, %21
-                        psa %27, %22
-                        psa %28, %23
-
-                    brz end, %19     // if loop counter is 0, quit
-                    sub %19, %19, #1 // subtract one from the loop counter and run again
-                    brz mainloop, #0    // go back to the top
-                end:
-                    // ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8));
-                    // cswap x0.U (%25), x1.U (%27)
-                    xor %30, %25, %27
-                    msk %30, %18, %30
-                    xor %25, %30, %25
-                    xor %27, %30, %27
-                    // cswap x0.W (%26), x1.W (%28)
-                    xor %30, %26, %28
-                    msk %30, %18, %30
-                    xor %26, %30, %26
-                    xor %28, %30, %28
-
-                    // AFFINE SPLICE -- pass arguments to the affine block
-                    psa %29, %25
-                    psa %30, %26
-                    // W.invert() in %21
-                    // U in %29
-                    // W in %30
-                    // result in %31
-                    // loop counter in %28
-
-                    // from FieldElement.invert()
-                        // let (t19, t3) = self.pow22501();   // t19: 249..0 ; t3: 3,1,0
-                        // let t0  = self.square();           // 1         e_0 = 2^1
-                        mul %0, %30, %30  // self is W, e.g. %30
-                        // let t1  = t0.square().square();    // 3         e_1 = 2^3
-                        mul %1, %0, %0
-                        mul %1, %1, %1
-                        // let t2  = self * &t1;              // 3,0       e_2 = 2^3 + 2^0
-                        mul %2, %30, %1
-                        // let t3  = &t0 * &t2;               // 3,1,0
-                        mul %3, %0, %2
-                        // let t4  = t3.square();             // 4,2,1
-                        mul %4, %3, %3
-                        // let t5  = &t2 * &t4;               // 4,3,2,1,0
-                        mul %5, %2, %4
-
-                        // let t6  = t5.pow2k(5);             // 9,8,7,6,5
-                        psa %28, #5       // coincidentally, constant #5 is the number 5
-                        mul %6, %5, %5
-                    pow2k_5:
-                        sub %28, %28, #1  // %28 = %28 - 1
-                        brz pow2k_5_exit, %28
-                        mul %6, %6, %6
-                        brz pow2k_5, #0
-                    pow2k_5_exit:
-                        // let t7  = &t6 * &t5;               // 9,8,7,6,5,4,3,2,1,0
-                        mul %7, %6, %5
-
-                        // let t8  = t7.pow2k(10);            // 19..10
-                        psa %28, #6        // constant #6 is the number 10
-                        mul %8, %7, %7
-                    pow2k_10:
-                        sub %28, %28, #1
-                        brz pow2k_10_exit, %28
-                        mul %8, %8, %8
-                        brz pow2k_10, #0
-                    pow2k_10_exit:
-                        // let t9  = &t8 * &t7;               // 19..0
-                        mul %9, %8, %7
-
-                        // let t10 = t9.pow2k(20);            // 39..20
-                        psa %28, #7         // constant #7 is the number 20
-                        mul %10, %9, %9
-                    pow2k_20:
-                        sub %28, %28, #1
-                        brz pow2k_20_exit, %28
-                        mul %10, %10, %10
-                        brz pow2k_20, #0
-                    pow2k_20_exit:
-                        // let t11 = &t10 * &t9;              // 39..0
-                        mul %11, %10, %9
-
-                        // let t12 = t11.pow2k(10);           // 49..10
-                        psa %28, #6         // constant #6 is the number 10
-                        mul %12, %11, %11
-                    pow2k_10b:
-                        sub %28, %28, #1
-                        brz pow2k_10b_exit, %28
-                        mul %12, %12, %12
-                        brz pow2k_10b, #0
-                    pow2k_10b_exit:
-                        // let t13 = &t12 * &t7;              // 49..0
-                        mul %13, %12, %7
-
-                        // let t14 = t13.pow2k(50);           // 99..50
-                        psa %28, #8         // constant #8 is the number 50
-                        mul %14, %13, %13
-                    pow2k_50a:
-                        sub %28, %28, #1
-                        brz pow2k_50a_exit, %28
-                        mul %14, %14, %14
-                        brz pow2k_50a, #0
-                    pow2k_50a_exit:
-                        // let t15 = &t14 * &t13;             // 99..0
-                        mul %15, %14, %13
-
-                        // let t16 = t15.pow2k(100);          // 199..100
-                        psa %28, #9         // constant #9 is the number 100
-                        mul %16, %15, %15
-                    pow2k_100:
-                        sub %28, %28, #1
-                        brz pow2k_100_exit, %28
-                        mul %16, %16, %16
-                        brz pow2k_100, #0
-                    pow2k_100_exit:
-                        // let t17 = &t16 * &t15;             // 199..0
-                        mul %17, %16, %15
-
-                        // let t18 = t17.pow2k(50);           // 249..50
-                        psa %28, #8         // constant #8 is the number 50
-                        mul %18, %17, %17
-                    pow2k_50b:
-                        sub %28, %28, #1
-                        brz pow2k_50b_exit, %28
-                        mul %18, %18, %18
-                        brz pow2k_50b, #0
-                    pow2k_50b_exit:
-                        // let t19 = &t18 * &t13;             // 249..0
-                        mul %19, %18, %13
-                        //(t19, t3) // just a return value, values are already there, do nothing
-
-                        //let t20 = t19.pow2k(5);            // 254..5
-                        psa %28, #5
-                        mul %20, %19, %19
-                    pow2k_5_last:
-                        sub %28, %28, #1
-                        brz pow2k_5_last_exit, %28
-                        mul %20, %20, %20
-                        brz pow2k_5_last, #0
-                    pow2k_5_last_exit:
-
-                        //let t21 = &t20 * &t3;              // 254..5,3,1,0
-                        mul %21, %20, %3
-
-                    // u = &self.U * &self.W.invert()
-                    mul %31, %29, %21
-                    fin  // finish execution
-            );
-            let mut engine = engine_25519::Engine25519::new();
-            let mut job = engine_25519::Job {
-                id: None,
-                ucode: [0; 1024],
-                uc_len: mcode.len() as u32,
-                uc_start: 0,
-                window: Some(0),
-                rf: [0; engine_25519::RF_SIZE_IN_U32],
-            };
+        let mcode = assemble_engine25519!(
+            start:
+                // P.U in %20
+                // P.W in %21
+                // Q.U in %22
+                // Q.W in %23
+                // affine_PmQ in %24
+                // %30 is the TRD scratch register and cswap dummy
+                // %29 is the subtraction temporary value register and k_t
+                // x0.U in %25
+                // x0.W in %26
+                // x1.U in %27
+                // x1.W in %28
+                // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once)
+                // %31 is the scalar
+                // %18 is the swap variable
+                psa %18, #0
+
+                // for i in (0..255).rev()
+            mainloop:
+                // let choice: u8 = (bits[i + 1] ^ bits[i]) as u8;
+                // ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into());
+                xbt %29, %31        // orignally[k_t = (k>>t) & 1] now[k_t = k[254]]
+                shl %31, %31        // k = k<<1
+                xor %18, %18, %29   // swap ^= k_t
+
+                // cswap x0.U (%25), x1.U (%27)
+                xor %30, %25, %27
+                msk %30, %18, %30
+                xor %25, %30, %25
+                xor %27, %30, %27
+                // cswap x0.W (%26), x1.W (%28)
+                xor %30, %26, %28
+                msk %30, %18, %30
+                xor %26, %30, %26
+                xor %28, %30, %28
+
+                psa %18, %29  // swap = k_t
+
+                    // differential_add_and_double(&mut x0, &mut x1, &affine_u);
+                    psa %20, %25
+                    psa %21, %26
+                    psa %22, %27
+                    psa %23, %28
+                    // affine_u is already in %24
+
+                    // let t0 = &P.U + &P.W;
+                    add %0, %20, %21
+                    trd %30, %0
+                    sub %0, %0, %30
+                    // let t1 = &P.U - &P.W;
+                    sub %21, #3, %21    // negate &P.W using #FIELDPRIME (#3)
+                    add %1, %20, %21
+                    trd %30, %1
+                    sub %1, %1, %30
+                    // let t2 = &Q.U + &Q.W;
+                    add %2, %22, %23
+                    trd %30, %2
+                    sub %2, %2, %30
+                    // let t3 = &Q.U - &Q.W;
+                    sub %23, #3, %23
+                    add %3, %22, %23
+                    trd %30, %3
+                    sub %3, %3, %30
+                    // let t4 = t0.square();   // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2
+                    mul %4, %0, %0
+                    // let t5 = t1.square();   // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2
+                    mul %5, %1, %1
+                    // let t6 = &t4 - &t5;     // 4 U_P W_P
+                    sub %29, #3, %5
+                    add %6, %4, %29
+                    trd %30, %6
+                    sub %6, %6, %30
+                    // let t7 = &t0 * &t3;     // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q
+                    mul %7, %0, %3
+                    // let t8 = &t1 * &t2;     // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q
+                    mul %8, %1, %2
+                    // let t9  = &t7 + &t8;    // 2 (U_P U_Q - W_P W_Q)
+                    add %9, %7, %8
+                    trd %30, %9
+                    sub %9, %9, %30
+                    // let t10 = &t7 - &t8;    // 2 (W_P U_Q - U_P W_Q)
+                    sub %29, #3, %8
+                    add %10, %7, %29
+                    trd %30, %10
+                    sub %10, %10, %30
+                    // let t11 =  t9.square(); // 4 (U_P U_Q - W_P W_Q)^2
+                    mul %11, %9, %9
+                    // let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2
+                    mul %12, %10, %10
+                    // let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q
+                    mul %13, #4, %6   // #4 is A+2/4
+                    // let t14 = &t4 * &t5;    // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2
+                    mul %14, %4, %5
+                    // let t15 = &t13 + &t5;   // (U_P - W_P)^2 + (A + 2) U_P W_P
+                    add %15, %13, %5
+                    trd %30, %15
+                    sub %15, %15, %30
+                    // let t16 = &t6 * &t15;   // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P)
+                    mul %16, %6, %15
+                    // let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2
+                    mul %17, %24, %12    // affine_PmQ loaded into %24
+
+                    ///// these can be eliminated down the road, but included for 1:1 algorithm correspodence to reference in early testing
+                    // P.U = t14;  // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2
+                    psa %20, %14
+                    // P.W = t16;  // W_{P'} = (4 U_P W_P) ((U_P - W_P)^2 + ((A + 2)/4) 4 U_P W_P)
+                    psa %21, %16
+                    // let t18 = t11;               // W_D * 4 (U_P U_Q - W_P W_Q)^2
+                    // Q.U = t18;  // U_{Q'} = W_D * 4 (U_P U_Q - W_P W_Q)^2
+                    psa %22, %11   // collapsed two to save a register
+                    // Q.W = t17;  // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2
+                    psa %23, %17
+
+                    ///// 'return' arguments for next iteration, can be optimized out later
+                    psa %25, %20
+                    psa %26, %21
+                    psa %27, %22
+                    psa %28, %23
+
+                brz end, %19     // if loop counter is 0, quit
+                sub %19, %19, #1 // subtract one from the loop counter and run again
+                brz mainloop, #0    // go back to the top
+            end:
+                // ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8));
+                // cswap x0.U (%25), x1.U (%27)
+                xor %30, %25, %27
+                msk %30, %18, %30
+                xor %25, %30, %25
+                xor %27, %30, %27
+                // cswap x0.W (%26), x1.W (%28)
+                xor %30, %26, %28
+                msk %30, %18, %30
+                xor %26, %30, %26
+                xor %28, %30, %28
+
+                // AFFINE SPLICE -- pass arguments to the affine block
+                psa %29, %25
+                psa %30, %26
+                // W.invert() in %21
+                // U in %29
+                // W in %30
+                // result in %31
+                // loop counter in %28
 
-            for (&src, dst) in mcode.iter().zip(job.ucode.iter_mut()) {
-                *dst = src as u32;
-            }
-
-            copy_to_rf(x0.U.as_bytes(), 25, &mut job.rf);
-            copy_to_rf(x0.W.as_bytes(), 26, &mut job.rf);
-            copy_to_rf(x1.U.as_bytes(), 27, &mut job.rf);
-            copy_to_rf(x1.W.as_bytes(), 28, &mut job.rf);
-            copy_to_rf(affine_u.as_bytes(), 24, &mut job.rf);
-            copy_to_rf(scalar.bytes, 31, &mut job.rf);
-            // load the number 254 into the loop index register
-            copy_to_rf([
-                254, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            ], 19, &mut job.rf);
-
-            // start the run
-            let result_rf = engine.spawn_job(job).expect("couldn't run engine job");
-
-            if false { // unmerged affine path
-                x0.U = FieldElement::from_bytes(&copy_from_rf(25, &result_rf));
-                x0.W = FieldElement::from_bytes(&copy_from_rf(26, &result_rf));
-                
-                //Note: is seems this TODO has been handled as ProjectivePoint's as_affine already
-                //has an accelerated version. Should this be removed or is there further
-                //optimization work to be done. If so, what is that work?
-
-                // TODO: optimize this relatively innocuous looking call.
-                // this consumes about 100ms runtime -- need to implement this using
-                // curve25519 acceleration!
-                x0.as_affine()
-            } else {
-                MontgomeryPoint(copy_from_rf(31, &result_rf))
-            }
-        } else {
-            let mut engine = engine_25519::Engine25519::new();
-            let job = engine_25519::MontgomeryJob {
-                x0_u: x0.U.as_bytes(),
-                x0_w: x0.W.as_bytes(),
-                x1_u: x1.U.as_bytes(),
-                x1_w: x1.W.as_bytes(),
-                affine_u: affine_u.as_bytes(),
-                scalar: scalar.bytes,
-            };
+                // from FieldElement.invert()
+                    // let (t19, t3) = self.pow22501();   // t19: 249..0 ; t3: 3,1,0
+                    // let t0  = self.square();           // 1         e_0 = 2^1
+                    mul %0, %30, %30  // self is W, e.g. %30
+                    // let t1  = t0.square().square();    // 3         e_1 = 2^3
+                    mul %1, %0, %0
+                    mul %1, %1, %1
+                    // let t2  = self * &t1;              // 3,0       e_2 = 2^3 + 2^0
+                    mul %2, %30, %1
+                    // let t3  = &t0 * &t2;               // 3,1,0
+                    mul %3, %0, %2
+                    // let t4  = t3.square();             // 4,2,1
+                    mul %4, %3, %3
+                    // let t5  = &t2 * &t4;               // 4,3,2,1,0
+                    mul %5, %2, %4
 
-            MontgomeryPoint(engine.montgomery_job(job).expect("couldn't run montgomery multiply job"))
-        }
+                    // let t6  = t5.pow2k(5);             // 9,8,7,6,5
+                    psa %28, #5       // coincidentally, constant #5 is the number 5
+                    mul %6, %5, %5
+                pow2k_5:
+                    sub %28, %28, #1  // %28 = %28 - 1
+                    brz pow2k_5_exit, %28
+                    mul %6, %6, %6
+                    brz pow2k_5, #0
+                pow2k_5_exit:
+                    // let t7  = &t6 * &t5;               // 9,8,7,6,5,4,3,2,1,0
+                    mul %7, %6, %5
+
+                    // let t8  = t7.pow2k(10);            // 19..10
+                    psa %28, #6        // constant #6 is the number 10
+                    mul %8, %7, %7
+                pow2k_10:
+                    sub %28, %28, #1
+                    brz pow2k_10_exit, %28
+                    mul %8, %8, %8
+                    brz pow2k_10, #0
+                pow2k_10_exit:
+                    // let t9  = &t8 * &t7;               // 19..0
+                    mul %9, %8, %7
+
+                    // let t10 = t9.pow2k(20);            // 39..20
+                    psa %28, #7         // constant #7 is the number 20
+                    mul %10, %9, %9
+                pow2k_20:
+                    sub %28, %28, #1
+                    brz pow2k_20_exit, %28
+                    mul %10, %10, %10
+                    brz pow2k_20, #0
+                pow2k_20_exit:
+                    // let t11 = &t10 * &t9;              // 39..0
+                    mul %11, %10, %9
+
+                    // let t12 = t11.pow2k(10);           // 49..10
+                    psa %28, #6         // constant #6 is the number 10
+                    mul %12, %11, %11
+                pow2k_10b:
+                    sub %28, %28, #1
+                    brz pow2k_10b_exit, %28
+                    mul %12, %12, %12
+                    brz pow2k_10b, #0
+                pow2k_10b_exit:
+                    // let t13 = &t12 * &t7;              // 49..0
+                    mul %13, %12, %7
+
+                    // let t14 = t13.pow2k(50);           // 99..50
+                    psa %28, #8         // constant #8 is the number 50
+                    mul %14, %13, %13
+                pow2k_50a:
+                    sub %28, %28, #1
+                    brz pow2k_50a_exit, %28
+                    mul %14, %14, %14
+                    brz pow2k_50a, #0
+                pow2k_50a_exit:
+                    // let t15 = &t14 * &t13;             // 99..0
+                    mul %15, %14, %13
+
+                    // let t16 = t15.pow2k(100);          // 199..100
+                    psa %28, #9         // constant #9 is the number 100
+                    mul %16, %15, %15
+                pow2k_100:
+                    sub %28, %28, #1
+                    brz pow2k_100_exit, %28
+                    mul %16, %16, %16
+                    brz pow2k_100, #0
+                pow2k_100_exit:
+                    // let t17 = &t16 * &t15;             // 199..0
+                    mul %17, %16, %15
+
+                    // let t18 = t17.pow2k(50);           // 249..50
+                    psa %28, #8         // constant #8 is the number 50
+                    mul %18, %17, %17
+                pow2k_50b:
+                    sub %28, %28, #1
+                    brz pow2k_50b_exit, %28
+                    mul %18, %18, %18
+                    brz pow2k_50b, #0
+                pow2k_50b_exit:
+                    // let t19 = &t18 * &t13;             // 249..0
+                    mul %19, %18, %13
+                    //(t19, t3) // just a return value, values are already there, do nothing
+
+                    //let t20 = t19.pow2k(5);            // 254..5
+                    psa %28, #5
+                    mul %20, %19, %19
+                pow2k_5_last:
+                    sub %28, %28, #1
+                    brz pow2k_5_last_exit, %28
+                    mul %20, %20, %20
+                    brz pow2k_5_last, #0
+                pow2k_5_last_exit:
+
+                    //let t21 = &t20 * &t3;              // 254..5,3,1,0
+                    mul %21, %20, %3
+
+                // u = &self.U * &self.W.invert()
+                mul %31, %29, %21
+                fin  // finish execution
+        );
+
+        let window = 0;
+        ensure_engine();
+        let mut ucode_hw: &'static mut [u32] = unsafe {
+            core::slice::from_raw_parts_mut(ENGINE_MEM.unwrap().as_mut_ptr() as *mut u32, 1024)
+        };
+        let mut rf_hw: &mut [u32] = unsafe {
+            core::slice::from_raw_parts_mut(
+                (ENGINE_MEM.unwrap().as_mut_ptr() as usize + RF_U8_BASE) as *mut u32,
+                TOTAL_RF_SIZE_IN_U32,
+            )
+        };
+
+        copy_to_rf(x0.U.as_bytes(), 25, &mut rf_hw, window);
+        copy_to_rf(x0.W.as_bytes(), 26, &mut rf_hw, window);
+        copy_to_rf(x1.U.as_bytes(), 27, &mut rf_hw, window);
+        copy_to_rf(x1.W.as_bytes(), 28, &mut rf_hw, window);
+        copy_to_rf(affine_u.as_bytes(), 24, &mut rf_hw, window);
+        copy_to_rf(scalar.bytes, 31, &mut rf_hw, window);
+        copy_to_rf(
+            [
+                254, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                0x00, 0x00, 0x00, 0x00,
+            ],
+            19,
+            &mut rf_hw,
+            window,
+        ); // 254 as loop counter
+
+        MontgomeryPoint(run_job(&mut ucode_hw, &rf_hw, &mcode, window))
     }
 
     /// Given `self` \\( = u\_0(P) \\), and a `Scalar` \\(n\\), return \\( u\_0(\[n\]P) \\)
     #[cfg(not(curve25519_dalek_backend = "u32e_backend"))]
     fn mul(self, scalar: &Scalar) -> MontgomeryPoint {
         // TODO: consider feature "panic_on_sw_eval"
-        #[cfg(all(not(test),curve25519_dalek_backend = "u32e_backend"))] // due to issue https://github.com/rust-lang/rust/issues/59168, you will have to manually comment this out when running a test on the full system and not just this crate.
+        #[cfg(all(not(test), curve25519_dalek_backend = "u32e_backend"))] // due to issue https://github.com/rust-lang/rust/issues/59168, you will have to manually comment this out when running a test on the full system and not just this crate.
         log::warn!("sw montgomery multiply being used - check for build config errors!");
         // We multiply by the integer representation of the given Scalar. By scalar invariant #1,
         // the MSB is 0, so we can skip it.