From 6cc0210ed3549cea74a390bdff72f26cbbab63a2 Mon Sep 17 00:00:00 2001 From: "Garth N. Wells" Date: Wed, 6 Nov 2024 09:03:17 +0000 Subject: [PATCH] Demo updates for LU solver bugs (#3499) * Docker update * Type fixes in pyvista demo * Change solver type * More solver switches * Demo update * Run all tests * Test update * Try mumps option * Demo update * Demo update * Update * Update * mypy fix * Update * Try superlu * Lint * Demo update * Demo update * Updates * Update * Update * Update * Demo update * Update * Update * Lint * mypy fixex * Tidy up * Lint * Formatting fixes * Logic fix * Formatting fixes --- .github/workflows/ccpp.yml | 22 ++--- cpp/dolfinx/common/Scatterer.h | 24 ++--- cpp/dolfinx/io/xdmf_utils.cpp | 3 +- cpp/dolfinx/io/xdmf_utils.h | 1 - cpp/dolfinx/la/MatrixCSR.h | 88 ++++++++++--------- cpp/dolfinx/la/SparsityPattern.h | 11 +-- cpp/dolfinx/mesh/Geometry.h | 8 +- cpp/dolfinx/mesh/Mesh.h | 4 +- cpp/dolfinx/mesh/topologycomputation.cpp | 7 +- cpp/dolfinx/mesh/topologycomputation.h | 14 +-- cpp/dolfinx/refinement/dolfinx_refinement.h | 2 +- docker/Dockerfile.test-env | 8 ++ python/demo/demo_axis.py | 9 +- python/demo/demo_cahn-hilliard.py | 9 +- python/demo/demo_half_loaded_waveguide.py | 2 +- python/demo/demo_pml.py | 11 +-- python/demo/demo_pyvista.py | 18 ++-- python/demo/demo_stokes.py | 40 ++++----- python/dolfinx/fem/petsc.py | 50 ++++++----- python/dolfinx/wrappers/io.cpp | 9 +- .../unit/fem/test_petsc_solver_wrappers.py | 4 +- 21 files changed, 190 insertions(+), 154 deletions(-) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index eb0ef698094..b35ecf84e69 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -33,7 +33,7 @@ jobs: name: Lint steps: - uses: actions/checkout@v4 - + - name: Set up Python uses: actions/setup-python@v5 with: @@ -85,7 +85,7 @@ jobs: name: Build and test steps: - uses: actions/checkout@v4 - + - name: Install dependencies run: | sudo apt-get update @@ -209,8 +209,8 @@ jobs: ctest -V -R demo -R serial ctest -V -R demo -R mpi_2 - - name: Install Python build dependencies - run: pip install -r python/build-requirements.txt + - name: Install Python build dependencies + run: pip install -r python/build-requirements.txt - name: Build Python interface run: | @@ -282,15 +282,15 @@ jobs: uses: actions/upload-artifact@v4 with: name: docs-cpp-doxygen - path: cpp/doc/html - retention-days: 2 + path: cpp/doc/html + retention-days: 2 - name: Upload C++ Sphinx documentation artifact uses: actions/upload-artifact@v4 with: name: docs-cpp-sphinx - path: cpp/doc/build/html - retention-days: 2 - + path: cpp/doc/build/html + retention-days: 2 + - name: Build Python interface documentation run: | cd python/doc @@ -299,8 +299,8 @@ jobs: uses: actions/upload-artifact@v4 with: name: docs-python - path: python/doc/build/html - retention-days: 2 + path: python/doc/build/html + retention-days: 2 - name: Checkout FEniCS/docs if: ${{ github.repository == 'FEniCS/dolfinx' && ( github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v') ) }} diff --git a/cpp/dolfinx/common/Scatterer.h b/cpp/dolfinx/common/Scatterer.h index f3d297388bc..52bcd720fb0 100644 --- a/cpp/dolfinx/common/Scatterer.h +++ b/cpp/dolfinx/common/Scatterer.h @@ -145,7 +145,8 @@ class Scatterer // Scale sizes and displacements by block size { - auto rescale = [](auto& x, int bs) { + auto rescale = [](auto& x, int bs) + { std::ranges::transform(x, x.begin(), [bs](auto e) { return e *= bs; }); }; rescale(_sizes_local, bs); @@ -459,8 +460,8 @@ class 
Scatterer /// @note This function is intended for advanced usage, and in /// particular when using CUDA/device-aware MPI. /// - /// @tparam T The data type to send - /// @tparam F The pack function + /// @tparam T Data type to send. + /// @tparam F Pack function. /// @param[in] remote_data Received data associated with the ghost /// indices. The order follows the order of the ghost indices in the /// IndexMap used to create the scatterer. The size equal to the @@ -493,14 +494,15 @@ class Scatterer type); } - /// @brief End the reverse scatter communication, and unpack the received - /// local buffer into local data. + /// @brief End the reverse scatter communication, and unpack the + /// received local buffer into local data. /// /// This function must be called after Scatterer::scatter_rev_begin. /// The buffers passed to Scatterer::scatter_rev_begin must not be /// modified until after the function has been called. - /// @param[in] local_buffer Working buffer. Same buffer should be used in - /// Scatterer::scatter_rev_begin. + /// + /// @param[in] local_buffer Working buffer. Same buffer should be used + /// in Scatterer::scatter_rev_begin. /// @param[out] local_data All data associated with owned indices. /// Size is `size_local()` from the IndexMap used to create the /// scatterer, multiplied by the block size. The data for each index @@ -557,7 +559,7 @@ class Scatterer } /// @brief Size of buffer for local data (owned and shared) used in - /// forward and reverse communication + /// forward and reverse communication. /// @return The required buffer size std::int32_t local_buffer_size() const noexcept { return _local_inds.size(); } @@ -569,9 +571,9 @@ class Scatterer return _remote_inds.size(); } - /// Return a vector of local indices (owned) used to pack/unpack local data. - /// These indices are grouped by neighbor process (process for which an index - /// is a ghost). + /// Return a vector of local indices (owned) used to pack/unpack local + /// data. These indices are grouped by neighbor process (process for + /// which an index is a ghost). const std::vector& local_indices() const noexcept { return _local_inds; diff --git a/cpp/dolfinx/io/xdmf_utils.cpp b/cpp/dolfinx/io/xdmf_utils.cpp index 3cedcc06d8c..d52f5fd23f7 100644 --- a/cpp/dolfinx/io/xdmf_utils.cpp +++ b/cpp/dolfinx/io/xdmf_utils.cpp @@ -403,7 +403,8 @@ xdmf_utils::distribute_entity_data( std::vector> dest_to_index; std::ranges::transform( indices, std::back_inserter(dest_to_index), - [size, num_nodes](auto n) { + [size, num_nodes](auto n) + { return std::pair(dolfinx::MPI::index_owner(size, n, num_nodes), n); }); std::ranges::sort(dest_to_index); diff --git a/cpp/dolfinx/io/xdmf_utils.h b/cpp/dolfinx/io/xdmf_utils.h index c18eed98e80..b7573321dbf 100644 --- a/cpp/dolfinx/io/xdmf_utils.h +++ b/cpp/dolfinx/io/xdmf_utils.h @@ -26,7 +26,6 @@ namespace dolfinx { - namespace fem { template diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index bd1181532f6..36d3038b155 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -70,17 +70,18 @@ class MatrixCSR /// @brief Insertion functor for setting values in a matrix. It is /// typically used in finite element assembly functions. /// - /// Create a function to set values in a MatrixCSR. The function signature is - /// `int mat_set_fn(std::span data)`. The rows - /// and columns use process local indexing, and the given rows and - /// columns must pre-exist in the sparsity pattern of the matrix. 
- /// Insertion into "ghost" rows (in the ghost region of the row - /// `IndexMap`) is permitted, so long as there are correct entries - /// in the sparsity pattern. + /// Create a function to set values in a MatrixCSR. The function + /// signature is `int mat_set_fn(std::span + /// data)`. The rows and columns use process local indexing, and the + /// given rows and columns must pre-exist in the sparsity pattern of + /// the matrix. Insertion into "ghost" rows (in the ghost region of + /// the row `IndexMap`) is permitted, so long as there are correct + /// entries in the sparsity pattern. /// - /// @note Using rows or columns which are not in the sparsity will result - /// in undefined behaviour (or an assert failure in Debug mode). + /// @note Using rows or columns which are not in the sparsity will + /// result in undefined behaviour (or an assert failure in Debug + /// mode). /// /// @note Matrix block size may be (1, 1) or (BS0, BS1) /// @note Data block size may be (1, 1) or (BS0, BS1) @@ -111,17 +112,18 @@ class MatrixCSR /// @brief Insertion functor for adding values to a matrix. It is /// typically used in finite element assembly functions. /// - /// Create a function to add values to a MatrixCSR. The function signature is - /// `int mat_add_fn(std::span data)`. The rows - /// and columns use process local indexing, and the given rows and - /// columns must pre-exist in the sparsity pattern of the matrix. - /// Insertion into "ghost" rows (in the ghost region of the row - /// `IndexMap`) is permitted, so long as there are correct entries - /// in the sparsity pattern. + /// Create a function to add values to a MatrixCSR. The function + /// signature is `int mat_add_fn(std::span + /// data)`. The rows and columns use process local indexing, and the + /// given rows and columns must pre-exist in the sparsity pattern of + /// the matrix. Insertion into "ghost" rows (in the ghost region of + /// the row `IndexMap`) is permitted, so long as there are correct + /// entries in the sparsity pattern. /// - /// @note Using rows or columns which are not in the sparsity will result - /// in undefined behaviour (or an assert failure in Debug mode). + /// @note Using rows or columns which are not in the sparsity will + /// result in undefined behaviour (or an assert failure in Debug + /// mode). /// /// @note Matrix block size may be (1, 1) or (BS0, BS1) /// @note Data block size may be (1, 1) or (BS0, BS1) @@ -152,16 +154,14 @@ class MatrixCSR /// @brief Create a distributed matrix. /// /// The structure of the matrix depends entirely on the input - /// `SparsityPattern`, which must be finalized. - /// The matrix storage is distributed Compressed Sparse Row: - /// the matrix is distributed by row across processes, and on each - /// process, there is a list of column indices and matrix entries - /// for each row stored. This exactly matches the layout of the - /// `SparsityPattern`. - /// There is some overlap of matrix rows between processes to allow for - /// independent Finite Element assembly, after which, the ghost rows - /// should be sent to the row owning processes by calling - /// `scatter_rev()`. + /// `SparsityPattern`, which must be finalized. The matrix storage is + /// distributed Compressed Sparse Row: the matrix is distributed by + /// row across processes, and on each process, there is a list of + /// column indices and matrix entries for each row stored. This + /// exactly matches the layout of the `SparsityPattern`. 
There is some + /// overlap of matrix rows between processes to allow for independent + /// Finite Element assembly, after which, the ghost rows should be + /// sent to the row owning processes by calling `scatter_rev()`. /// /// @note The block size of the matrix is given by the block size of /// the input `SparsityPattern`. @@ -192,8 +192,9 @@ class MatrixCSR /// @note All indices are local to the calling MPI rank and entries /// cannot be set in ghost rows. /// @note This should be called after `scatter_rev`. Using before - /// `scatter_rev` will set the values correctly, but incoming values may - /// get added to them during a subsequent reverse scatter operation. + /// `scatter_rev` will set the values correctly, but incoming values + /// may get added to them during a subsequent reverse scatter + /// operation. /// @tparam BS0 Data row block size /// @tparam BS1 Data column block size /// @param[in] x The `m` by `n` dense block of values (row-major) to @@ -216,8 +217,8 @@ class MatrixCSR } else if (_bs[0] == 1 and _bs[1] == 1) { - // Set blocked data in a regular CSR matrix (_bs[0]=1, _bs[1]=1) with - // correct sparsity + // Set blocked data in a regular CSR matrix (_bs[0]=1, _bs[1]=1) + // with correct sparsity impl::insert_blocked_csr(_data, _cols, _row_ptr, x, rows, cols, set_fn, num_rows); } @@ -278,20 +279,23 @@ class MatrixCSR /// Number of local rows including ghost rows std::int32_t num_all_rows() const { return _row_ptr.size() - 1; } - /// Copy to a dense matrix + /// @brief Copy to a dense matrix. /// @note This function is typically used for debugging and not used - /// in production + /// in production. /// @note Ghost rows are also returned, and these can be truncated /// manually by using num_owned_rows() if required. - /// @note If the block size is greater than 1, the entries are expanded. + /// @note If the block size is greater than 1, the entries are + /// expanded. /// @return Dense copy of the part of the matrix on the calling rank. /// Storage is row-major. std::vector to_dense() const; /// @brief Transfer ghost row data to the owning ranks accumulating /// received values on the owned rows, and zeroing any existing data - /// in ghost rows. This process is analogous to `scatter_rev` for `Vector` - /// except that the values are always accumulated on the owning process. + /// in ghost rows. + /// + /// This process is analogous to `scatter_rev` for `Vector` except + /// that the values are always accumulated on the owning process. void scatter_rev() { scatter_rev_begin(); @@ -303,8 +307,8 @@ class MatrixCSR /// @note Calls to this function must be followed by /// MatrixCSR::scatter_rev_end(). Between the two calls matrix values /// must not be changed. - /// @note This function does not change the matrix data. Data update only - /// occurs with `scatter_rev_end()`. + /// @note This function does not change the matrix data. Data update + /// only occurs with `scatter_rev_end()`. void scatter_rev_begin(); /// @brief End transfer of ghost row data to owning ranks. 
@@ -637,7 +641,7 @@ MatrixCSR::to_dense() const for (int i0 = 0; i0 < _bs[0]; ++i0) for (int i1 = 0; i1 < _bs[1]; ++i1) { - std::array local_col {_cols[j]}; + std::array local_col{_cols[j]}; std::array global_col{0}; _index_maps[1]->local_to_global(local_col, global_col); A[(r * _bs[1] + i0) * ncols * _bs[0] + global_col[0] * _bs[1] + i1] diff --git a/cpp/dolfinx/la/SparsityPattern.h b/cpp/dolfinx/la/SparsityPattern.h index 726455a0c39..0b561849806 100644 --- a/cpp/dolfinx/la/SparsityPattern.h +++ b/cpp/dolfinx/la/SparsityPattern.h @@ -74,11 +74,11 @@ class SparsityPattern /// @brief Insert non-zero locations using local (process-wise) /// indices. /// - /// This routine inserts non-zero locations at the outer product of rows and - /// cols into the sparsity pattern, i.e. adds the matrix entries at - /// A[row[i], col[j]] for all i, j. + /// This routine inserts non-zero locations at the outer product of + /// rows and cols into the sparsity pattern, i.e. adds the matrix + /// entries at `A[row[i], col[j]] for all i, j`. /// - /// @param[in] rows list of the local row indices + /// @param[in] rows list of the local row indices /// @param[in] cols list of the local column indices void insert(std::span rows, std::span cols); @@ -88,7 +88,8 @@ class SparsityPattern /// must exist in the row IndexMap. void insert_diagonal(std::span rows); - /// @brief Finalize sparsity pattern and communicate off-process entries + /// @brief Finalize sparsity pattern and communicate off-process + /// entries void finalize(); /// @brief Index map for given dimension dimension. Returns the index diff --git a/cpp/dolfinx/mesh/Geometry.h b/cpp/dolfinx/mesh/Geometry.h index 30784a72858..2fd4145a7fb 100644 --- a/cpp/dolfinx/mesh/Geometry.h +++ b/cpp/dolfinx/mesh/Geometry.h @@ -165,14 +165,14 @@ class Geometry /// @brief Access geometry degrees-of-freedom data (const version). /// /// @return The flattened row-major geometry data, where the shape is - /// (num_points, 3) + /// `(num_points, 3)`. std::span x() const { return _x; } /// @brief Access geometry degrees-of-freedom data (non-const /// version). /// /// @return The flattened row-major geometry data, where the shape is - /// (num_points, 3) + /// `(num_points, 3)`. std::span x() { return _x; } /// @brief The element that describes the geometry map. @@ -231,8 +231,8 @@ template Geometry(std::shared_ptr, U, const std::vector>>&, - V, int, - W) -> Geometry>; + V, int, W) + -> Geometry>; /// @endcond /// @brief Build Geometry from input data. 
diff --git a/cpp/dolfinx/mesh/Mesh.h b/cpp/dolfinx/mesh/Mesh.h index b29195e01ec..f9786ea4392 100644 --- a/cpp/dolfinx/mesh/Mesh.h +++ b/cpp/dolfinx/mesh/Mesh.h @@ -103,8 +103,8 @@ class Mesh /// @cond /// Template type deduction template -Mesh(MPI_Comm, std::shared_ptr, - V) -> Mesh>; +Mesh(MPI_Comm, std::shared_ptr, V) + -> Mesh>; /// @endcond } // namespace dolfinx::mesh diff --git a/cpp/dolfinx/mesh/topologycomputation.cpp b/cpp/dolfinx/mesh/topologycomputation.cpp index edefdbd1e26..006435cc42a 100644 --- a/cpp/dolfinx/mesh/topologycomputation.cpp +++ b/cpp/dolfinx/mesh/topologycomputation.cpp @@ -538,10 +538,9 @@ compute_entities_by_key_matching( std::vector perm(global_vertices.size()); std::iota(perm.begin(), perm.end(), 0); - std::ranges::sort(perm, - [&global_vertices](std::size_t i0, std::size_t i1) { - return global_vertices[i0] < global_vertices[i1]; - }); + std::ranges::sort( + perm, [&global_vertices](std::size_t i0, std::size_t i1) + { return global_vertices[i0] < global_vertices[i1]; }); // For quadrilaterals, the vertex opposite the lowest vertex should // be last if (entity_type == mesh::CellType::quadrilateral) diff --git a/cpp/dolfinx/mesh/topologycomputation.h b/cpp/dolfinx/mesh/topologycomputation.h index a5e8a98f2f7..7fa9c35050f 100644 --- a/cpp/dolfinx/mesh/topologycomputation.h +++ b/cpp/dolfinx/mesh/topologycomputation.h @@ -29,11 +29,12 @@ namespace dolfinx::mesh class Topology; /// @brief Compute mesh entities of given topological dimension by -/// computing entity-to-vertex connectivity `(dim, 0)`, and cell-to-entity -/// connectivity `(tdim, dim)`. +/// computing entity-to-vertex connectivity `(dim, 0)`, and +/// cell-to-entity connectivity `(tdim, dim)`. +/// +/// Computed entities are oriented such that their local (to the +/// process) orientation agrees with their global orientation /// -/// Computed entities are oriented such that their -/// local (to the process) orientation agrees with their global orientation /// @param[in] comm MPI Communicator /// @param[in] topology Mesh topology /// @param[in] dim The dimension of the entities to create @@ -49,8 +50,9 @@ std::tuple>>, std::shared_ptr, std::vector> compute_entities(MPI_Comm comm, const Topology& topology, int dim, int index); -/// @brief Compute connectivity (d0 -> d1) for given pair of entity types, given -/// by topological dimension and index, as found in `Topology::entity_types()` +/// @brief Compute connectivity (d0 -> d1) for given pair of entity +/// types, given by topological dimension and index, as found in +/// `Topology::entity_types()` /// @param[in] topology The topology /// @param[in] d0 The dimension and index of the entities /// @param[in] d1 The dimension and index of the incident entities diff --git a/cpp/dolfinx/refinement/dolfinx_refinement.h b/cpp/dolfinx/refinement/dolfinx_refinement.h index b4e1bea6f16..46588b267b4 100644 --- a/cpp/dolfinx/refinement/dolfinx_refinement.h +++ b/cpp/dolfinx/refinement/dolfinx_refinement.h @@ -10,5 +10,5 @@ namespace dolfinx::refinement // DOLFINx refinement interface -#include #include +#include diff --git a/docker/Dockerfile.test-env b/docker/Dockerfile.test-env index 72f7f26f379..1d9147950ed 100644 --- a/docker/Dockerfile.test-env +++ b/docker/Dockerfile.test-env @@ -207,6 +207,7 @@ RUN apt-get -qq update && \ --with-fortran-bindings=no \ --with-shared-libraries \ --download-metis \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ @@ -226,9 +227,12 @@ RUN apt-get -qq update && \ 
--with-fortran-bindings=no \ --with-shared-libraries \ --download-metis \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ + --download-superlu \ + --download-superlu_dist \ --with-scalar-type=complex \ --with-precision=single && \ make PETSC_ARCH=linux-gnu-complex64-32 ${MAKEFLAGS} all && \ @@ -244,6 +248,7 @@ RUN apt-get -qq update && \ --with-shared-libraries \ --download-hypre \ --download-metis \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ @@ -266,6 +271,7 @@ RUN apt-get -qq update && \ --with-shared-libraries \ --download-hypre \ --download-metis \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ @@ -286,6 +292,7 @@ RUN apt-get -qq update && \ --with-fortran-bindings=no \ --with-shared-libraries \ --download-hypre \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ @@ -305,6 +312,7 @@ RUN apt-get -qq update && \ --with-fortran-bindings=no \ --with-shared-libraries \ --download-hypre \ + --download-mumps-avoid-mpi-in-place \ --download-mumps \ --download-ptscotch \ --download-scalapack \ diff --git a/python/demo/demo_axis.py b/python/demo/demo_axis.py index 8ebb079d093..c00c326ff82 100644 --- a/python/demo/demo_axis.py +++ b/python/demo/demo_axis.py @@ -649,12 +649,13 @@ def create_eps_mu(pml, rho, eps_bkg, mu_bkg): ) a, L = ufl.lhs(F), ufl.rhs(F) sys = PETSc.Sys() # type: ignore - if sys.hasExternalPackage("superlu_dist"): # type: ignore - mat_factor_backend = "superlu_dist" - elif sys.hasExternalPackage("mumps"): # type: ignore + use_superlu = PETSc.IntType == np.int64 + if sys.hasExternalPackage("mumps") and not use_superlu: # type: ignore mat_factor_backend = "mumps" + elif sys.hasExternalPackage("superlu_dist"): # type: ignore + mat_factor_backend = "superlu_dist" else: - if msh.comm > 1: + if msh.comm.size > 1: raise RuntimeError("This demo requires a parallel linear algebra backend.") else: mat_factor_backend = "petsc" diff --git a/python/demo/demo_cahn-hilliard.py b/python/demo/demo_cahn-hilliard.py index 846218670f8..3cfed7d48e8 100644 --- a/python/demo/demo_cahn-hilliard.py +++ b/python/demo/demo_cahn-hilliard.py @@ -278,11 +278,12 @@ opts[f"{option_prefix}ksp_type"] = "preonly" opts[f"{option_prefix}pc_type"] = "lu" sys = PETSc.Sys() # type: ignore -# For factorisation prefer superlu_dist, then MUMPS, then default -if sys.hasExternalPackage("superlu_dist"): - opts[f"{option_prefix}pc_factor_mat_solver_type"] = "superlu_dist" -elif sys.hasExternalPackage("mumps"): +# For factorisation prefer MUMPS, then superlu_dist, then default +use_superlu = PETSc.IntType == np.int64 # or PETSc.ScalarType == np.complex64 +if sys.hasExternalPackage("mumps") and not use_superlu: opts[f"{option_prefix}pc_factor_mat_solver_type"] = "mumps" +elif sys.hasExternalPackage("superlu_dist"): + opts[f"{option_prefix}pc_factor_mat_solver_type"] = "superlu_dist" ksp.setFromOptions() # - diff --git a/python/demo/demo_half_loaded_waveguide.py b/python/demo/demo_half_loaded_waveguide.py index a332f3d33aa..4833e15da20 100644 --- a/python/demo/demo_half_loaded_waveguide.py +++ b/python/demo/demo_half_loaded_waveguide.py @@ -52,7 +52,7 @@ print("This demo requires DOLFINx to be compiled with PETSc enabled.") exit(0) if PETSc.IntType == np.int64 and MPI.COMM_WORLD.size > 1: - print("This solver fails with PETSc and 64-bit integers becaude of memory errors in MUMPS.") + print("This solver 
fails with PETSc and 64-bit integers because of memory errors in MUMPS.") # Note: when PETSc.IntType == np.int32, superlu_dist is used # rather than MUMPS and does not trigger memory failures. exit(0) diff --git a/python/demo/demo_pml.py b/python/demo/demo_pml.py index e878e809ff3..3a358952c72 100644 --- a/python/demo/demo_pml.py +++ b/python/demo/demo_pml.py @@ -695,15 +695,16 @@ def create_eps_mu( a, L = ufl.lhs(F), ufl.rhs(F) -# For factorisation prefer superlu_dist, then MUMPS, then default +# For factorisation prefer MUMPS, then superlu_dist, then default sys = PETSc.Sys() # type: ignore -if sys.hasExternalPackage("superlu_dist"): # type: ignore - mat_factor_backend = "superlu_dist" -elif sys.hasExternalPackage("mumps"): # type: ignore +use_superlu = PETSc.IntType == np.int64 +if sys.hasExternalPackage("mumps") and not use_superlu: # type: ignore mat_factor_backend = "mumps" +elif sys.hasExternalPackage("superlu_dist"): # type: ignore + mat_factor_backend = "superlu_dist" else: if msh.comm > 1: - raise RuntimeError("This demo requires a parallel linear algebra backend.") + raise RuntimeError("This demo requires a parallel LU solver.") else: mat_factor_backend = "petsc" diff --git a/python/demo/demo_pyvista.py b/python/demo/demo_pyvista.py index 29ec0549295..c1b0cd81dc8 100644 --- a/python/demo/demo_pyvista.py +++ b/python/demo/demo_pyvista.py @@ -53,7 +53,9 @@ def plot_scalar(): # We start by creating a unit square mesh and interpolating a # function into a degree 1 Lagrange space - msh = create_unit_square(MPI.COMM_WORLD, 12, 12, cell_type=CellType.quadrilateral) + msh = create_unit_square( + MPI.COMM_WORLD, 12, 12, cell_type=CellType.quadrilateral, dtype=np.float64 + ) V = functionspace(msh, ("Lagrange", 1)) u = Function(V, dtype=np.float64) u.interpolate(lambda x: np.sin(np.pi * x[0]) * np.sin(2 * x[1] * np.pi)) @@ -110,7 +112,9 @@ def plot_scalar(): def plot_meshtags(): # Create a mesh - msh = create_unit_square(MPI.COMM_WORLD, 25, 25, cell_type=CellType.quadrilateral) + msh = create_unit_square( + MPI.COMM_WORLD, 25, 25, cell_type=CellType.quadrilateral, dtype=np.float64 + ) # Create a geometric indicator function def in_circle(x): @@ -164,7 +168,9 @@ def in_circle(x): def plot_higher_order(): # Create a mesh - msh = create_unit_square(MPI.COMM_WORLD, 12, 12, cell_type=CellType.quadrilateral) + msh = create_unit_square( + MPI.COMM_WORLD, 12, 12, cell_type=CellType.quadrilateral, dtype=np.float64 + ) # Define a geometric indicator function def in_circle(x): @@ -233,7 +239,9 @@ def in_circle(x): def plot_nedelec(): - msh = create_unit_cube(MPI.COMM_WORLD, 4, 3, 5, cell_type=CellType.tetrahedron) + msh = create_unit_cube( + MPI.COMM_WORLD, 4, 3, 5, cell_type=CellType.tetrahedron, dtype=np.float64 + ) # We create a pyvista plotter plotter = pyvista.Plotter() @@ -292,7 +300,7 @@ def plot_nedelec(): def plot_streamlines(): - msh = create_unit_cube(MPI.COMM_WORLD, 4, 4, 4, CellType.hexahedron) + msh = create_unit_cube(MPI.COMM_WORLD, 4, 4, 4, CellType.hexahedron, dtype=np.float64) gdim = msh.geometry.dim V = functionspace(msh, ("Discontinuous Lagrange", 2, (gdim,))) u = Function(V, dtype=np.float64) diff --git a/python/demo/demo_stokes.py b/python/demo/demo_stokes.py index ab14e9b3437..d6e366cfde2 100644 --- a/python/demo/demo_stokes.py +++ b/python/demo/demo_stokes.py @@ -118,10 +118,6 @@ from dolfinx.mesh import CellType, create_rectangle, locate_entities_boundary from ufl import div, dx, grad, inner -opts = PETSc.Options() -opts["mat_superlu_dist_iterrefine"] = True - - # We create a 
{py:class}`Mesh `, define functions for # locating geometrically subsets of the boundary, and define a function # for the velocity on the lid: @@ -442,18 +438,15 @@ def block_direct_solver(): # handle pressure nullspace pc = ksp.getPC() pc.setType("lu") - pc.setFactorSolverType("superlu_dist") - try: + sys = PETSc.Sys() # type: ignore + use_superlu = PETSc.IntType == np.int64 + if sys.hasExternalPackage("mumps") and not use_superlu: + pc.setFactorSolverType("mumps") pc.setFactorSetUpSolverType() - except PETSc.Error as e: - if e.ierr == 92: - print("The required PETSc solver/preconditioner is not available. Exiting.") - print(e) - exit(0) - else: - raise e - # pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1) # For pressure nullspace - # pc.getFactorMatrix().setMumpsIcntl(icntl=25, ival=0) # For pressure nullspace + pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1) + pc.getFactorMatrix().setMumpsIcntl(icntl=25, ival=0) + else: + pc.setFactorSolverType("superlu_dist") # Create a block vector (x) to store the full solution, and solve x = A.createVecLeft() @@ -522,7 +515,7 @@ def mixed_direct(): # Set Dirichlet boundary condition values in the RHS for bc in bcs: - bc.set(b) + bc.set(b.array_w) # Create and configure solver ksp = PETSc.KSP().create(msh.comm) @@ -532,12 +525,15 @@ def mixed_direct(): # Configure MUMPS to handle pressure nullspace pc = ksp.getPC() pc.setType("lu") - # pc.setFactorSolverType("mumps") - # pc.setFactorSetUpSolverType() - # pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1) - # pc.getFactorMatrix().setMumpsIcntl(icntl=25, ival=0) - - pc.setFactorSolverType("superlu_dist") + sys = PETSc.Sys() # type: ignore + use_superlu = PETSc.IntType == np.int64 + if sys.hasExternalPackage("mumps") and not use_superlu: + pc.setFactorSolverType("mumps") + pc.setFactorSetUpSolverType() + pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1) + pc.getFactorMatrix().setMumpsIcntl(icntl=25, ival=0) + else: + pc.setFactorSolverType("superlu_dist") # Compute the solution U = Function(W) diff --git a/python/dolfinx/fem/petsc.py b/python/dolfinx/fem/petsc.py index 02e3d84a469..9c387d38b33 100644 --- a/python/dolfinx/fem/petsc.py +++ b/python/dolfinx/fem/petsc.py @@ -160,10 +160,12 @@ def create_matrix(a: Form, mat_type=None) -> PETSc.Mat: """Create a PETSc matrix that is compatible with a bilinear form. Note: - Due to subtle issues in the interaction between petsc4py memory management - and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()`` - is called on the returned object once the object is no longer required. Note that - ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator. + Due to subtle issues in the interaction between petsc4py memory + management and the Python garbage collector, it is recommended + that the method ``PETSc.Mat.destroy()`` is called on the + returned object once the object is no longer required. Note that + ``PETSc.Mat.destroy()`` is collective over the object's MPI + communicator. Args: a: A bilinear form. @@ -182,10 +184,12 @@ def create_matrix_block(a: list[list[Form]]) -> PETSc.Mat: """Create a PETSc matrix that is compatible with a rectangular array of bilinear forms. Note: - Due to subtle issues in the interaction between petsc4py memory management - and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()`` - is called on the returned object once the object is no longer required. Note that - ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator. 
+ Due to subtle issues in the interaction between petsc4py memory + management and the Python garbage collector, it is recommended + that the method ``PETSc.Mat.destroy()`` is called on the + returned object once the object is no longer required. Note that + ``PETSc.Mat.destroy()`` is collective over the object's MPI + communicator. Args: a: Rectangular array of bilinear forms. @@ -202,10 +206,12 @@ def create_matrix_nest(a: list[list[Form]]) -> PETSc.Mat: """Create a PETSc matrix (``MatNest``) that is compatible with an array of bilinear forms. Note: - Due to subtle issues in the interaction between petsc4py memory management - and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()`` - is called on the returned object once the object is no longer required. Note that - ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator. + Due to subtle issues in the interaction between petsc4py memory + management and the Python garbage collector, it is recommended + that the method ``PETSc.Mat.destroy()`` is called on the + returned object once the object is no longer required. Note that + ``PETSc.Mat.destroy()`` is collective over the object's MPI + communicator. Args: a: Rectangular array of bilinear forms. @@ -990,10 +996,12 @@ def discrete_gradient(space0: _FunctionSpace, space1: _FunctionSpace) -> PETSc.M Piola map. Note: - Due to subtle issues in the interaction between petsc4py memory management - and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()`` - is called on the returned object once the object is no longer required. Note that - ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator. + Due to subtle issues in the interaction between petsc4py memory + management and the Python garbage collector, it is recommended + that the method ``PETSc.Mat.destroy()`` is called on the + returned object once the object is no longer required. Note that + ``PETSc.Mat.destroy()`` is collective over the object's MPI + communicator. Args: space0: H1 space to interpolate the gradient from. @@ -1009,10 +1017,12 @@ def interpolation_matrix(space0: _FunctionSpace, space1: _FunctionSpace) -> PETS """Assemble an interpolation operator matrix. Note: - Due to subtle issues in the interaction between petsc4py memory management - and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()`` - is called on the returned object once the object is no longer required. Note that - ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator. + Due to subtle issues in the interaction between petsc4py memory + management and the Python garbage collector, it is recommended + that the method ``PETSc.Mat.destroy()`` is called on the + returned object once the object is no longer required. Note that + ``PETSc.Mat.destroy()`` is collective over the object's MPI + communicator. Args: space0: Space to interpolate from. 
diff --git a/python/dolfinx/wrappers/io.cpp b/python/dolfinx/wrappers/io.cpp index ed7ae36816b..c5dced3e3a5 100644 --- a/python/dolfinx/wrappers/io.cpp +++ b/python/dolfinx/wrappers/io.cpp @@ -115,7 +115,8 @@ void declare_vtx_writer(nb::module_& m, std::string type) [](dolfinx::io::VTXWriter* self, MPICommWrapper comm, std::filesystem::path filename, std::shared_ptr> mesh, - std::string engine) { + std::string engine) + { new (self) dolfinx::io::VTXWriter(comm.get(), filename, mesh, engine); }, @@ -155,7 +156,8 @@ void declare_vtx_writer(nb::module_& m, std::string type) [](dolfinx::io::FidesWriter* self, MPICommWrapper comm, std::filesystem::path filename, std::shared_ptr> mesh, - std::string engine) { + std::string engine) + { new (self) dolfinx::io::FidesWriter(comm.get(), filename, mesh, engine); }, @@ -277,7 +279,8 @@ void io(nb::module_& m) "__init__", [](dolfinx::io::XDMFFile* x, MPICommWrapper comm, std::filesystem::path filename, std::string file_mode, - dolfinx::io::XDMFFile::Encoding encoding) { + dolfinx::io::XDMFFile::Encoding encoding) + { new (x) dolfinx::io::XDMFFile(comm.get(), filename, file_mode, encoding); }, diff --git a/python/test/unit/fem/test_petsc_solver_wrappers.py b/python/test/unit/fem/test_petsc_solver_wrappers.py index 3dd9088aac7..3737c81430e 100644 --- a/python/test/unit/fem/test_petsc_solver_wrappers.py +++ b/python/test/unit/fem/test_petsc_solver_wrappers.py @@ -39,10 +39,10 @@ def test_compare_solvers(self, mode): sys = PETSc.Sys() if MPI.COMM_WORLD.size == 1: factor_type = "petsc" - elif sys.hasExternalPackage("superlu_dist"): - factor_type = "superlu_dist" elif sys.hasExternalPackage("mumps"): factor_type = "mumps" + elif sys.hasExternalPackage("superlu_dist"): + factor_type = "superlu_dist" else: pytest.skip("No external solvers available in parallel")
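
Note on the solver-selection pattern: the demo updates above (demo_axis.py, demo_cahn-hilliard.py, demo_pml.py, demo_stokes.py and the unit test) all switch to the same LU factorisation preference: MUMPS first, unless PETSc is built with 64-bit integer indices (where MUMPS hits memory errors), then superlu_dist, with PETSc's built-in LU only as a serial fallback. A minimal standalone sketch of that logic, assuming petsc4py and mpi4py are available (the helper name is illustrative, not part of the patch):

    import numpy as np
    from mpi4py import MPI
    from petsc4py import PETSc

    def preferred_lu_backend(comm: MPI.Comm) -> str:
        """Pick an LU factorisation backend, following the preference order
        used in the updated demos."""
        sys = PETSc.Sys()
        # MUMPS shows memory errors with 64-bit PETSc indices, so avoid it there
        use_superlu = PETSc.IntType == np.int64
        if sys.hasExternalPackage("mumps") and not use_superlu:
            return "mumps"
        elif sys.hasExternalPackage("superlu_dist"):
            return "superlu_dist"
        elif comm.size == 1:
            return "petsc"  # built-in LU is adequate in serial
        else:
            raise RuntimeError("A parallel LU solver (MUMPS or superlu_dist) is required.")

The returned string is what the demos pass to pc.setFactorSolverType(...) or set as "pc_factor_mat_solver_type" in the PETSc options database.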