From bc866f8f0e038178ded159b0ca264cde895568ea Mon Sep 17 00:00:00 2001
From: bkmartinjr
Date: Fri, 15 Nov 2024 14:57:59 -0800
Subject: [PATCH] incorporate more PR fb

---
 apis/python/src/tiledbsoma/_fastercsx.py     |  6 ++--
 apis/python/src/tiledbsoma/fastercsx.cc      | 30 ++++++++----------
 apis/python/src/tiledbsoma/io/conversions.py |  2 +-
 apis/python/tests/test_fastercsx.py          |  7 ----
 apis/python/tests/test_libfastercsx.py       | 34 ++++++++++----------
 libtiledbsoma/src/utils/fastercsx.h          | 24 +++++++-------
 6 files changed, 49 insertions(+), 54 deletions(-)

diff --git a/apis/python/src/tiledbsoma/_fastercsx.py b/apis/python/src/tiledbsoma/_fastercsx.py
index d7bdd3fb3b..0bf19ccf29 100644
--- a/apis/python/src/tiledbsoma/_fastercsx.py
+++ b/apis/python/src/tiledbsoma/_fastercsx.py
@@ -16,7 +16,7 @@
 from typing_extensions import TypeAlias
 
 from .options._soma_tiledb_context import SOMATileDBContext
-from .pytiledbsoma.fastercsx import compress_coo, copy_to_dense, sort_indices
+from .pytiledbsoma.fastercsx import compress_coo, copy_csx_to_dense, sort_csx_indices
 
 NDArrayIndex: TypeAlias = npt.NDArray[np.integer[Any]]
 NDArrayNumber: TypeAlias = npt.NDArray[Union[np.integer[Any], np.floating[Any]]]
@@ -87,7 +87,7 @@ def from_ijd(
             context.native_context, (n_major, n_minor), i, j, d, indptr, indices, data
         )
         if make_sorted:
-            sort_indices(context.native_context, indptr, indices, data)
+            sort_csx_indices(context.native_context, indptr, indices, data)
         return CompressedMatrix(
             indptr, indices, data, shape, format, make_sorted, context
         )
@@ -200,7 +200,7 @@ def to_numpy(self, index: slice | None = None) -> NDArrayNumber:
             else (self.shape[0], n_major)
         )
         out = np.zeros(math.prod(out_shape), dtype=self.data.dtype)
-        copy_to_dense(
+        copy_csx_to_dense(
             self.context.native_context,
             major_idx_start,
             major_idx_end,
diff --git a/apis/python/src/tiledbsoma/fastercsx.cc b/apis/python/src/tiledbsoma/fastercsx.cc
index 29999396de..d3add6613e 100644
--- a/apis/python/src/tiledbsoma/fastercsx.cc
+++ b/apis/python/src/tiledbsoma/fastercsx.cc
@@ -237,7 +237,7 @@ void compress_coo_validate_args_(
 3. sum(Ai.size())==sum(Aj.size())==sum(Ad.size())==Bj.size()==Bd.size()
 (this is nnz).
 4. num chunks/items in Ai/Aj/Ad is same size and type
-5. ensure B* are writable
+5. ensure B* are writeable
 6. Ensure each element in A* tuples are same type
 
 etc...
@@ -286,7 +286,7 @@
         throw std::length_error("Pointer array size does not match n_rows.");
 
     if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     if (Ai.size() > 0) {
         if (!Ai[0].dtype().is(Aj[0].dtype()))
@@ -298,7 +298,7 @@
 /**
  * @brief python binding for coo_compress, which converts COO chunked array (I,
  * J, D) into CSX arrays (P, J, D). The output arrays are NOT sorted on the
- * minor dimension - see `sort_indices` if that is required.
+ * minor dimension -- see `sort_csx_indices` if that is required.
  */
 void compress_coo(
     std::shared_ptr<SOMAContext> ctx,
@@ -382,10 +382,10 @@ void compress_coo(
 }
 
 /**
- * @brief python binding for sort_indices, which sorts minor dimension of a
+ * @brief Python binding for sort_csx_indices, which sorts minor dimension of a
  * compressed matrix
  */
-void sort_indices(
+void sort_csx_indices(
     std::shared_ptr<SOMAContext> ctx,
     py::array Bp,
     py::array Bj,
@@ -395,7 +395,7 @@
     if (Bp.ndim() != 1 || Bj.ndim() != 1 || Bd.ndim() != 1)
         throw std::length_error("All arrays must be 1D");
     if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     // Get dispatch types
     CsxIndexType csx_major_index_type = lookup_dtype_(
@@ -427,7 +427,7 @@
         make_mutable_casted_span_>(Bd);
 
     py::gil_scoped_release release;
-    return fastercsx::sort_indices(
+    return fastercsx::sort_csx_indices(
         ctx->thread_pool().get(),
         n_row,
         nnz,
@@ -443,7 +443,7 @@
 /**
  * @brief Python binding for parallel slice+to_dense operation.
  */
-void copy_to_dense(
+void copy_csx_to_dense(
     std::shared_ptr<SOMAContext> ctx,
     const uint64_t major_idx_start,
     const int64_t major_idx_end,
@@ -469,7 +469,7 @@
     if (n_major != Bp.size() - 1)
         throw std::length_error("n_rows does not match Bp.size()");
     if (!out.writeable())
-        throw std::invalid_argument("out must be writable");
+        throw std::invalid_argument("out must be writeable");
 
     if (out.dtype().num() != Bd.dtype().num())
         throw pybind11::type_error("out dtype must match Bd dtype");
@@ -498,7 +498,7 @@
         make_mutable_casted_span_>(out);
 
     py::gil_scoped_release release;
-    return fastercsx::copy_to_dense(
+    return fastercsx::copy_csx_to_dense(
         ctx->thread_pool().get(),
         major_idx_start,
         major_idx_end,
@@ -538,7 +538,7 @@ void count_rows(
         throw std::length_error("Pointer array size does not match n_rows.");
 
     if (!Bp.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     CooIndexType coo_index_type = lookup_dtype_(
         coo_index_type_dispatch, Ai[0].dtype(), "COO index (row, col) arrays");
@@ -583,8 +583,8 @@ void load_fastercsx(py::module& m) {
         "Create CSX elements");
 
     fastercsx_m.def(
-        "sort_indices",
-        sort_indices,
+        "sort_csx_indices",
+        sort_csx_indices,
         py::arg("ctx").noconvert(),
         py::arg("Bp").noconvert(),
         py::arg("Bj").noconvert(),
@@ -592,8 +592,8 @@ void load_fastercsx(py::module& m) {
         py::arg("Bd").noconvert(),
         "Sort minor axis indices and data");
 
     fastercsx_m.def(
-        "copy_to_dense",
-        copy_to_dense,
+        "copy_csx_to_dense",
+        copy_csx_to_dense,
         py::arg("ctx").noconvert(),
         py::arg("major_idx_start"),
         py::arg("major_idx_end"),
diff --git a/apis/python/src/tiledbsoma/io/conversions.py b/apis/python/src/tiledbsoma/io/conversions.py
index 393aefa81e..107a83855b 100644
--- a/apis/python/src/tiledbsoma/io/conversions.py
+++ b/apis/python/src/tiledbsoma/io/conversions.py
@@ -83,7 +83,7 @@ def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
 def csr_from_coo_table(
     tbl: pa.Table, num_rows: int, num_cols: int, context: SOMATileDBContext
 ) -> sp.csr_matrix:
-    """Given an Arrow Table containing COO data, return a ``scipy.sparse.csr_matrx``."""
+    """Given an Arrow Table containing COO data, return a ``scipy.sparse.csr_matrix``."""
     s = CompressedMatrix.from_soma(
         tbl, (num_rows, num_cols), "csr", True, context
     ).to_scipy()
diff --git a/apis/python/tests/test_fastercsx.py b/apis/python/tests/test_fastercsx.py
index c2c3587bdf..8d44f055d4 100644
--- a/apis/python/tests/test_fastercsx.py
+++ b/apis/python/tests/test_fastercsx.py
@@ -361,10 +361,3 @@ def test_bad_shapes(context: soma.SOMATileDBContext, rng: np.random.Generator) -> None:
         fastercsx.CompressedMatrix.from_ijd(
             sp.row, sp.col, sp.data, shp, "csr", True, context
         )
-
-
-"""
-other tests to consider
-1. duplicates
-
-"""
diff --git a/apis/python/tests/test_libfastercsx.py b/apis/python/tests/test_libfastercsx.py
index 822cd9b87f..87f5a58aa7 100644
--- a/apis/python/tests/test_libfastercsx.py
+++ b/apis/python/tests/test_libfastercsx.py
@@ -68,7 +68,7 @@ def test_construction(
     )
 
     if sp.nnz >= np.iinfo(csr_major_index_dtype).max:
-        # only occur if we mess up the test params
+        # only occurs if we mess up the test params
        pytest.skip(reason="NNZ is too large for index type.")
 
     indptr = np.empty((sp.shape[0] + 1), dtype=csr_major_index_dtype)
@@ -85,7 +85,7 @@
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -144,7 +144,7 @@ def test_partitioning(
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -164,7 +164,7 @@
 def test_multichunk(
     nchunks: int, shape: tuple[int, int], density: float, context: clib.SOMAContext
 ) -> None:
-    """check that multi-chunk COO input functions correctly"""
+    """Check that multi-chunk COO input functions correctly"""
     rng = np.random.default_rng()
 
     sp = sparse.random(
@@ -194,7 +194,7 @@
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -203,7 +203,7 @@
     assert np.array_equal(data, csr.data)
 
 
-def test_sort_indices_bad_args(
+def test_sort_csx_indices_bad_args(
     rng: np.random.Generator, context: clib.SOMAContext
 ) -> None:
     sp = sparse.random(10, 100, density=0.5, dtype=np.float32, random_state=rng).tocsr()
@@ -211,37 +211,37 @@
     p, j, d = sp.indptr, sp.indices, sp.data
 
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(None, p, j, d)
+        fastercsx.sort_csx_indices(None, p, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(0, p, j, d)
+        fastercsx.sort_csx_indices(0, p, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, None, j, d)
+        fastercsx.sort_csx_indices(context, None, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, p, None, d)
+        fastercsx.sort_csx_indices(context, p, None, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, p, j, None)
+        fastercsx.sort_csx_indices(context, p, j, None)
 
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p[:-1], j, d)
+        fastercsx.sort_csx_indices(context, p[:-1], j, d)
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p, j[1:], d)
+        fastercsx.sort_csx_indices(context, p, j[1:], d)
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p, j, d[1:])
+        fastercsx.sort_csx_indices(context, p, j, d[1:])
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, np.zeros_like(p), j, d)
+        fastercsx.sort_csx_indices(context, np.zeros_like(p), j, d)
 
     with pytest.raises(OverflowError):
         pbad = p.copy()
         pbad[1] = sp.nnz + 1000000
-        fastercsx.sort_indices(context, pbad, j, d)
+        fastercsx.sort_csx_indices(context, pbad, j, d)
 
     with pytest.raises(OverflowError):
         pbad = p.copy()
         pbad[1] = -1
-        fastercsx.sort_indices(context, pbad, j, d)
+        fastercsx.sort_csx_indices(context, pbad, j, d)
 
 
 def test_compress_coo_bad_args(
diff --git a/libtiledbsoma/src/utils/fastercsx.h b/libtiledbsoma/src/utils/fastercsx.h
index c5d5e4e8fc..8a7415fa86 100644
--- a/libtiledbsoma/src/utils/fastercsx.h
+++ b/libtiledbsoma/src/utils/fastercsx.h
@@ -88,8 +88,9 @@ size_t sum_over_size_(const std::vector>& v) noexcept {
  * greedy n-way linear partition, which works well enough for the common case
  * where most inputs are of similar sizes (which is typically the case).
  *
- * TODO: If we sort (descending) first, it will be LPT which would be better in
- * edge cases where there are a lot of variable sized input vectors.
+ * TODO: If we sort (descending) first, it will be classic LPT (Largest Processing
+ * Time) partitioning which would be better in edge cases where there are a lot
+ * of variable sized input vectors.
  */
 template
 struct Partition {
@@ -140,7 +141,7 @@ std::vector> partition_views_(
 }
 
 /**
- * @brief Count occurrences of all values present in Ai - used as the basis
+ * @brief Count occurrences of all values present in Ai -- used as the basis
  * for constructing the index pointer array.
  */
 template
@@ -326,7 +327,7 @@ void compress_coo(
     assert(Bj.size() == nnz);
     assert(Bd.size() == nnz);
 
-    // get major axis counts. Important side effect: range checks the Ai
+    // Get major axis counts. Important side effect: range checks the Ai
     // values.
     count_rows_(tp, n_row, nnz, Ai, Bp);
     sum_rows_to_pointers_(Bp);
@@ -412,10 +413,10 @@ bool index_lt_(
 }
 
 /**
- * @brief Inplace sort of minor axis, used to canonicalize the CSx ordering.
+ * @brief In-place sort of minor axis, used to canonicalize the CSx ordering.
  */
 template
-void sort_indices(
+void sort_csx_indices(
     ThreadPool* const tp,
     uint64_t n_row,
     uint64_t nnz,
@@ -460,7 +461,7 @@
  *
  */
 template
-void copy_to_dense(
+void copy_csx_to_dense(
     ThreadPool* const tp,
     uint64_t major_start,
     uint64_t major_end,
@@ -502,14 +503,15 @@
         });
         assert(status.ok());
     } else {
-        // parallelizing this is less beneficial than the CSR case as it
+        // Parallelizing this is less beneficial than the CSR case as it
         // partitions by columns, but writes in C order. This does not align
         // with CPU cache lines, and is therefore quite a bit slower than
         // the speedup achieved by the (above) CSR loop.
        //
-        // We could write a Fortran-ordered output array, but then it would
-        // typically need to be converted back to C order, which is even
-        // worse as you move (even) more data once it is dense.
+        // We could write a Fortran-ordered ("column major") output array, but
+        // then it would typically need to be converted back to C order ("row
+        // major"), which is even worse as you move (even) more data once it is
+        // dense.
        //
        // If there is ever a requirement for use of F-ordered data, we
        // could add the output order as an option.
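
A quick usage sketch of the renamed API, for reviewers: this is a minimal example built only from calls visible in this diff and its tests (`CompressedMatrix.from_ijd`, `to_scipy`, `to_numpy`). It assumes the internal module path `tiledbsoma._fastercsx` stays as shown above; being a private module, the import path and argument handling may change between releases.

    import numpy as np
    import scipy.sparse as sparse
    import tiledbsoma
    from tiledbsoma._fastercsx import CompressedMatrix

    context = tiledbsoma.SOMATileDBContext()
    rng = np.random.default_rng()
    coo = sparse.random(
        100, 50, density=0.1, dtype=np.float32, random_state=rng
    ).tocoo()

    # from_ijd() invokes the compress_coo() binding and, because
    # make_sorted=True, the renamed sort_csx_indices() binding to
    # canonicalize the minor-axis ordering.
    cm = CompressedMatrix.from_ijd(
        coo.row, coo.col, coo.data, coo.shape, "csr", True, context
    )

    # to_scipy() returns a canonical scipy.sparse.csr_matrix.
    csr = cm.to_scipy()
    assert (csr != coo.tocsr()).nnz == 0

    # to_numpy() exercises the renamed copy_csx_to_dense() binding.
    dense = cm.to_numpy()

The rename pattern throughout (sort_indices -> sort_csx_indices, copy_to_dense -> copy_csx_to_dense) keeps the binding names explicit about operating on compressed (CSX) arrays, which is what motivates the otherwise mechanical churn in the tests above.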