From bc866f8f0e038178ded159b0ca264cde895568ea Mon Sep 17 00:00:00 2001
From: bkmartinjr
Date: Fri, 15 Nov 2024 14:57:59 -0800
Subject: [PATCH] incorporate more PR fb

---
 apis/python/src/tiledbsoma/_fastercsx.py     |  6 ++--
 apis/python/src/tiledbsoma/fastercsx.cc      | 30 ++++++++----------
 apis/python/src/tiledbsoma/io/conversions.py |  2 +-
 apis/python/tests/test_fastercsx.py          |  7 ----
 apis/python/tests/test_libfastercsx.py       | 34 ++++++++++----------
 libtiledbsoma/src/utils/fastercsx.h          | 24 +++++++-------
 6 files changed, 49 insertions(+), 54 deletions(-)

diff --git a/apis/python/src/tiledbsoma/_fastercsx.py b/apis/python/src/tiledbsoma/_fastercsx.py
index d7bdd3fb3b..0bf19ccf29 100644
--- a/apis/python/src/tiledbsoma/_fastercsx.py
+++ b/apis/python/src/tiledbsoma/_fastercsx.py
@@ -16,7 +16,7 @@
 from typing_extensions import TypeAlias
 
 from .options._soma_tiledb_context import SOMATileDBContext
-from .pytiledbsoma.fastercsx import compress_coo, copy_to_dense, sort_indices
+from .pytiledbsoma.fastercsx import compress_coo, copy_csx_to_dense, sort_csx_indices
 
 NDArrayIndex: TypeAlias = npt.NDArray[np.integer[Any]]
 NDArrayNumber: TypeAlias = npt.NDArray[Union[np.integer[Any], np.floating[Any]]]
@@ -87,7 +87,7 @@ def from_ijd(
             context.native_context, (n_major, n_minor), i, j, d, indptr, indices, data
         )
         if make_sorted:
-            sort_indices(context.native_context, indptr, indices, data)
+            sort_csx_indices(context.native_context, indptr, indices, data)
         return CompressedMatrix(
             indptr, indices, data, shape, format, make_sorted, context
         )
@@ -200,7 +200,7 @@ def to_numpy(self, index: slice | None = None) -> NDArrayNumber:
             else (self.shape[0], n_major)
         )
         out = np.zeros(math.prod(out_shape), dtype=self.data.dtype)
-        copy_to_dense(
+        copy_csx_to_dense(
             self.context.native_context,
             major_idx_start,
             major_idx_end,
diff --git a/apis/python/src/tiledbsoma/fastercsx.cc b/apis/python/src/tiledbsoma/fastercsx.cc
index 29999396de..d3add6613e 100644
--- a/apis/python/src/tiledbsoma/fastercsx.cc
+++ b/apis/python/src/tiledbsoma/fastercsx.cc
@@ -237,7 +237,7 @@ void compress_coo_validate_args_(
 3. sum(Ai.size())==sum(Aj.size())==sum(Ad.size())==Bj.size()==Bd.size()
 (this is nnz).
 4. num chunks/items in Ai/Aj/Ad is same size and type
-5. ensure B* are writable
+5. ensure B* are writeable
 6. Ensure each element in A* tuples are same type
 
 etc...
@@ -286,7 +286,7 @@
         throw std::length_error("Pointer array size does not match n_rows.");
 
     if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     if (Ai.size() > 0) {
         if (!Ai[0].dtype().is(Aj[0].dtype()))
@@ -298,7 +298,7 @@
 /**
  * @brief python binding for coo_compress, which converts COO chunked array (I,
  * J, D) into CSX arrays (P, J, D). The output arrays are NOT sorted on the
- * minor dimension - see `sort_indices` if that is required.
+ * minor dimension -- see `sort_csx_indices` if that is required.
  */
 void compress_coo(
     std::shared_ptr<SOMAContext> ctx,
@@ -382,10 +382,10 @@ void compress_coo(
 }
 
 /**
- * @brief python binding for sort_indices, which sorts minor dimension of a
+ * @brief Python binding for sort_csx_indices, which sorts minor dimension of a
  * compressed matrix
  */
-void sort_indices(
+void sort_csx_indices(
     std::shared_ptr<SOMAContext> ctx,
     py::array Bp,
     py::array Bj,
@@ -395,7 +395,7 @@
     if (Bp.ndim() != 1 || Bj.ndim() != 1 || Bd.ndim() != 1)
         throw std::length_error("All arrays must be 1D");
     if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     // Get dispatch types
     CsxIndexType csx_major_index_type = lookup_dtype_(
@@ -427,7 +427,7 @@
         make_mutable_casted_span_>(Bd);
 
     py::gil_scoped_release release;
-    return fastercsx::sort_indices(
+    return fastercsx::sort_csx_indices(
         ctx->thread_pool().get(),
         n_row,
         nnz,
@@ -443,7 +443,7 @@
 /**
  * @brief Python binding for parallel slice+to_dense operation.
  */
-void copy_to_dense(
+void copy_csx_to_dense(
     std::shared_ptr<SOMAContext> ctx,
     const uint64_t major_idx_start,
     const int64_t major_idx_end,
@@ -469,7 +469,7 @@
     if (n_major != Bp.size() - 1)
         throw std::length_error("n_rows does not match Bp.size()");
     if (!out.writeable())
-        throw std::invalid_argument("out must be writable");
+        throw std::invalid_argument("out must be writeable");
 
     if (out.dtype().num() != Bd.dtype().num())
         throw pybind11::type_error("out dtype must match Bd dtype");
@@ -498,7 +498,7 @@
         make_mutable_casted_span_>(out);
 
     py::gil_scoped_release release;
-    return fastercsx::copy_to_dense(
+    return fastercsx::copy_csx_to_dense(
         ctx->thread_pool().get(),
         major_idx_start,
         major_idx_end,
@@ -538,7 +538,7 @@ void count_rows(
         throw std::length_error("Pointer array size does not match n_rows.");
 
     if (!Bp.writeable())
-        throw std::invalid_argument("Output arrays must be writable.");
+        throw std::invalid_argument("Output arrays must be writeable.");
 
     CooIndexType coo_index_type = lookup_dtype_(
         coo_index_type_dispatch, Ai[0].dtype(), "COO index (row, col) arrays");
@@ -583,8 +583,8 @@ void load_fastercsx(py::module& m) {
         "Create CSX elements");
 
     fastercsx_m.def(
-        "sort_indices",
-        sort_indices,
+        "sort_csx_indices",
+        sort_csx_indices,
         py::arg("ctx").noconvert(),
         py::arg("Bp").noconvert(),
         py::arg("Bj").noconvert(),
@@ -592,8 +592,8 @@ void load_fastercsx(py::module& m) {
         py::arg("Bd").noconvert(),
         "Sort minor axis indices and data");
 
     fastercsx_m.def(
-        "copy_to_dense",
-        copy_to_dense,
+        "copy_csx_to_dense",
+        copy_csx_to_dense,
         py::arg("ctx").noconvert(),
         py::arg("major_idx_start"),
         py::arg("major_idx_end"),
diff --git a/apis/python/src/tiledbsoma/io/conversions.py b/apis/python/src/tiledbsoma/io/conversions.py
index 393aefa81e..107a83855b 100644
--- a/apis/python/src/tiledbsoma/io/conversions.py
+++ b/apis/python/src/tiledbsoma/io/conversions.py
@@ -83,7 +83,7 @@ def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
 def csr_from_coo_table(
     tbl: pa.Table, num_rows: int, num_cols: int, context: SOMATileDBContext
 ) -> sp.csr_matrix:
-    """Given an Arrow Table containing COO data, return a ``scipy.sparse.csr_matrx``."""
+    """Given an Arrow Table containing COO data, return a ``scipy.sparse.csr_matrix``."""
     s = CompressedMatrix.from_soma(
         tbl, (num_rows, num_cols), "csr", True, context
     ).to_scipy()
diff --git a/apis/python/tests/test_fastercsx.py b/apis/python/tests/test_fastercsx.py
index c2c3587bdf..8d44f055d4 100644
--- a/apis/python/tests/test_fastercsx.py
+++ b/apis/python/tests/test_fastercsx.py
@@ -361,10 +361,3 @@ def test_bad_shapes(context: soma.SOMATileDBContext, rng: np.random.Generator) -> None:
         fastercsx.CompressedMatrix.from_ijd(
             sp.row, sp.col, sp.data, shp, "csr", True, context
         )
-
-
-"""
-other tests to consider
-1. duplicates
-
-"""
diff --git a/apis/python/tests/test_libfastercsx.py b/apis/python/tests/test_libfastercsx.py
index 822cd9b87f..87f5a58aa7 100644
--- a/apis/python/tests/test_libfastercsx.py
+++ b/apis/python/tests/test_libfastercsx.py
@@ -68,7 +68,7 @@ def test_construction(
     )
 
     if sp.nnz >= np.iinfo(csr_major_index_dtype).max:
-        # only occur if we mess up the test params
+        # only occurs if we mess up the test params
        pytest.skip(reason="NNZ is too large for index type.")
 
     indptr = np.empty((sp.shape[0] + 1), dtype=csr_major_index_dtype)
@@ -85,7 +85,7 @@
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -144,7 +144,7 @@ def test_partitioning(
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -164,7 +164,7 @@
 def test_multichunk(
     nchunks: int, shape: tuple[int, int], density: float, context: clib.SOMAContext
 ) -> None:
-    """check that multi-chunk COO input functions correctly"""
+    """Check that multi-chunk COO input functions correctly"""
     rng = np.random.default_rng()
 
     sp = sparse.random(
@@ -194,7 +194,7 @@
         indices,
         data,
     )
-    fastercsx.sort_indices(context, indptr, indices, data)
+    fastercsx.sort_csx_indices(context, indptr, indices, data)
 
     # Verify equality with SciPy constructed CSR
     csr = sp.tocsr()
@@ -203,7 +203,7 @@
     assert np.array_equal(data, csr.data)
 
 
-def test_sort_indices_bad_args(
+def test_sort_csx_indices_bad_args(
     rng: np.random.Generator, context: clib.SOMAContext
 ) -> None:
     sp = sparse.random(10, 100, density=0.5, dtype=np.float32, random_state=rng).tocsr()
@@ -211,37 +211,37 @@
     p, j, d = sp.indptr, sp.indices, sp.data
 
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(None, p, j, d)
+        fastercsx.sort_csx_indices(None, p, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(0, p, j, d)
+        fastercsx.sort_csx_indices(0, p, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, None, j, d)
+        fastercsx.sort_csx_indices(context, None, j, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, p, None, d)
+        fastercsx.sort_csx_indices(context, p, None, d)
     with pytest.raises(TypeError):
-        fastercsx.sort_indices(context, p, j, None)
+        fastercsx.sort_csx_indices(context, p, j, None)
 
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p[:-1], j, d)
+        fastercsx.sort_csx_indices(context, p[:-1], j, d)
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p, j[1:], d)
+        fastercsx.sort_csx_indices(context, p, j[1:], d)
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, p, j, d[1:])
+        fastercsx.sort_csx_indices(context, p, j, d[1:])
     with pytest.raises(ValueError):
-        fastercsx.sort_indices(context, np.zeros_like(p), j, d)
+        fastercsx.sort_csx_indices(context, np.zeros_like(p), j, d)
 
     with pytest.raises(OverflowError):
         pbad = p.copy()
         pbad[1] = sp.nnz + 1000000
-        fastercsx.sort_indices(context, pbad, j, d)
+        fastercsx.sort_csx_indices(context, pbad, j, d)
 
     with pytest.raises(OverflowError):
         pbad = p.copy()
         pbad[1] = -1
-        fastercsx.sort_indices(context, pbad, j, d)
+        fastercsx.sort_csx_indices(context, pbad, j, d)
 
 
 def test_compress_coo_bad_args(
diff --git a/libtiledbsoma/src/utils/fastercsx.h b/libtiledbsoma/src/utils/fastercsx.h
index c5d5e4e8fc..8a7415fa86 100644
--- a/libtiledbsoma/src/utils/fastercsx.h
+++ b/libtiledbsoma/src/utils/fastercsx.h
@@ -88,8 +88,9 @@ size_t sum_over_size_(const std::vector>& v) noexcept {
  * greedy n-way linear partition, which works well enough for the common case
  * where most inputs are of similar sizes (which is typically the case).
  *
- * TODO: If we sort (descending) first, it will be LPT which would be better in
- * edge cases where there are a lot of variable sized input vectors.
+ * TODO: If we sort (descending) first, it will be classic LPT (Largest Processing
+ * Time) partitioning which would be better in edge cases where there are a lot
+ * of variable sized input vectors.
  */
 template
 struct Partition {
@@ -140,7 +141,7 @@ std::vector> partition_views_(
 }
 
 /**
- * @brief Count occurrences of all values present in Ai - used as the basis
+ * @brief Count occurrences of all values present in Ai -- used as the basis
  * for constructing the index pointer array.
  */
 template
@@ -326,7 +327,7 @@ void compress_coo(
     assert(Bj.size() == nnz);
     assert(Bd.size() == nnz);
 
-    // get major axis counts. Important side effect: range checks the Ai
+    // Get major axis counts. Important side effect: range checks the Ai
     // values.
     count_rows_(tp, n_row, nnz, Ai, Bp);
     sum_rows_to_pointers_(Bp);
@@ -412,10 +413,10 @@ bool index_lt_(
 }
 
 /**
- * @brief Inplace sort of minor axis, used to canonicalize the CSx ordering.
+ * @brief In-place sort of minor axis, used to canonicalize the CSx ordering.
  */
 template
-void sort_indices(
+void sort_csx_indices(
     ThreadPool* const tp,
     uint64_t n_row,
     uint64_t nnz,
@@ -460,7 +461,7 @@
  *
  */
 template
-void copy_to_dense(
+void copy_csx_to_dense(
     ThreadPool* const tp,
     uint64_t major_start,
     uint64_t major_end,
@@ -502,14 +503,15 @@
         });
         assert(status.ok());
     } else {
-        // parallelizing this is less beneficial than the CSR case as it
+        // Parallelizing this is less beneficial than the CSR case as it
         // partitions by columns, but writes in C order. This does not align
         // with CPU cache lines, and is therefore quite a bit slower than
         // the speedup achieved by the (above) CSR loop.
        //
-        // We could write a Fortran-ordered output array, but then it would
-        // typically need to be converted back to C order, which is even
-        // worse as you move (even) more data once it is dense.
+        // We could write a Fortran-ordered ("column major") output array, but
+        // then it would typically need to be converted back to C order ("row
+        // major"), which is even worse as you move (even) more data once it is
+        // dense.
        //
        // If there is ever a requirement for use of F-ordered data, we
        // could add the output order as an option.
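
A quick usage sketch of the renamed API, for reviewers: this is a minimal example built only from calls visible in this diff and its tests (`CompressedMatrix.from_ijd`, `to_scipy`, `to_numpy`). It assumes the internal module path `tiledbsoma._fastercsx` stays as shown above; being a private module, the import path and argument handling may change between releases.

    import numpy as np
    import scipy.sparse as sparse
    import tiledbsoma
    from tiledbsoma._fastercsx import CompressedMatrix

    context = tiledbsoma.SOMATileDBContext()
    rng = np.random.default_rng()
    coo = sparse.random(
        100, 50, density=0.1, dtype=np.float32, random_state=rng
    ).tocoo()

    # from_ijd() invokes the compress_coo() binding and, because
    # make_sorted=True, the renamed sort_csx_indices() binding to
    # canonicalize the minor-axis ordering.
    cm = CompressedMatrix.from_ijd(
        coo.row, coo.col, coo.data, coo.shape, "csr", True, context
    )

    # to_scipy() returns a canonical scipy.sparse.csr_matrix.
    csr = cm.to_scipy()
    assert (csr != coo.tocsr()).nnz == 0

    # to_numpy() exercises the renamed copy_csx_to_dense() binding.
    dense = cm.to_numpy()

The rename pattern throughout (sort_indices -> sort_csx_indices, copy_to_dense -> copy_csx_to_dense) keeps the binding names explicit about operating on compressed (CSX) arrays, which is what motivates the otherwise mechanical churn in the tests above.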