From d956d3db1f4c8a8b751267a5e2d30cb5e2744838 Mon Sep 17 00:00:00 2001
From: Erdal Mutlu <erdalmutlu.su@gmail.com>
Date: Fri, 28 Jun 2024 15:36:46 -0700
Subject: [PATCH] updated local tensor implementation

---
 src/tamm/CMakeLists.txt         |   1 +
 src/tamm/local_tensor.hpp       | 354 ++++++++++++++++++++++++++++++++
 src/tamm/tamm.hpp               |   1 +
 src/tamm/tensor.hpp             | 188 -----------------
 tests/tamm/Test_LocalTensor.cpp | 235 +++++++++++++++++++--
 5 files changed, 576 insertions(+), 203 deletions(-)
 create mode 100644 src/tamm/local_tensor.hpp
diff --git a/src/tamm/CMakeLists.txt b/src/tamm/CMakeLists.txt
index 31def5a65..b33863116 100644
--- a/src/tamm/CMakeLists.txt
+++ b/src/tamm/CMakeLists.txt
@@ -45,6 +45,7 @@ set(TAMM_INCLUDES
     range.hpp
     ops.hpp
     scheduler.hpp
+    local_tensor.hpp
     tensor.hpp
     tensor_impl.hpp
     tensor_base.hpp
diff --git a/src/tamm/local_tensor.hpp b/src/tamm/local_tensor.hpp
new file mode 100644
index 000000000..0b8506d56
--- /dev/null
+++ b/src/tamm/local_tensor.hpp
@@ -0,0 +1,354 @@
+#pragma once
+
+#include "tamm/tensor.hpp"
+
+namespace tamm {
+
+// template<typename T>
+// class LabeledTensor;
+
+/// @brief Tensor whose index spaces are each tiled with a tile as large as the dimension
+/// @tparam T Data type of the tensor elements
+template<typename T>
+class LocalTensor: public Tensor<T> { // move to another hpp
+public:
+  LocalTensor()                              = default;
+  LocalTensor(LocalTensor&&)                 = default;
+  LocalTensor(const LocalTensor&)            = default;
+  LocalTensor& operator=(LocalTensor&&)      = default;
+  LocalTensor& operator=(const LocalTensor&) = default;
+  ~LocalTensor()                             = default;
+
+  // LocalTensor(Tensor<T> dist_tensor): dist_tensor_(dist_tensor) { construct_local_tensor(); }
+  /// @brief Sizes each mode by max_num_indices() of the given tiled index spaces
+  LocalTensor(std::initializer_list<TiledIndexSpace> tiss):
+    Tensor<T>(construct_local_tis_vec(TiledIndexSpaceVec(tiss))) {}
+  /// @brief Same as the initializer-list overload, taking the index spaces as a vector
+  LocalTensor(std::vector<TiledIndexSpace> tiss): Tensor<T>(construct_local_tis_vec(tiss)) {}
+  /// @brief Sizes each mode by max_num_indices() of the index space behind each label
+  LocalTensor(std::initializer_list<TiledIndexLabel> tis_labels):
+    Tensor<T>(construct_local_tis_vec(IndexLabelVec(tis_labels))) {}
+  /// @brief Creates one mode per entry of dim_sizes, holding that many indices
+  LocalTensor(std::initializer_list<size_t> dim_sizes):
+    Tensor<T>(construct_tis_vec(std::vector<size_t>(dim_sizes))) {}
+  /// @brief Same as the initializer-list overload, taking the sizes as a vector
+  LocalTensor(std::vector<size_t> dim_sizes): Tensor<T>(construct_tis_vec(dim_sizes)) {}
+
+  /// @brief Builds a LabeledTensor from this tensor and the given arguments
+  /// @tparam ...Args Types of the arguments forwarded to the LabeledTensor
+  /// @param ...rest Arguments perfectly forwarded after *this
+  /// @return LabeledTensor<T> brace-initialized from *this and rest
+  template<class... Args>
+  LabeledTensor<T> operator()(Args&&... rest) const {
+    return LabeledTensor<T>{*this, std::forward<Args>(rest)...};
+  }
+
+  // void write_back_to_dist() { fill_distributed_tensor(); }
+
+  /// @brief Assigns val to the whole tensor by executing `(*this)() = val` on a Scheduler
+  /// @param val Value assigned through the tensor's own execution context
+  void init(T val) {
+    EXPECTS_STR(this->is_allocated(), "LocalTensor has to be allocated");
+
+    auto ec = this->execution_context();
+    Scheduler{*ec}((*this)() = val).execute();
+  }
+
+  /// @brief Writes val to the element at the given multi-dimensional position
+  /// @param indices One index per mode, in mode order (taken by reference to avoid a copy)
+  /// @param val Value stored at that position
+  void set(const std::vector<size_t>& indices, T val) {
+    EXPECTS_STR(this->is_allocated(), "LocalTensor has to be allocated");
+    EXPECTS_STR(indices.size() == this->num_modes(),
+                "Number of indices must match the number of dimensions");
+    const size_t linearIndex = compute_linear_index(indices);
+
+    this->access_local_buf()[linearIndex] = val;
+  }
+
+  /// @brief Reads the element stored at the given multi-dimensional position
+  /// @param indices One index per mode, in mode order
+  /// @return Copy of the element at that position
+  T get(const std::vector<size_t>& indices) const {
+    EXPECTS_STR(this->is_allocated(), "LocalTensor has to be allocated");
+    EXPECTS_STR(indices.size() == this->num_modes(),
+                "Number of indices must match the number of dimensions");
+    const size_t linearIndex = compute_linear_index(indices);
+    return this->access_local_buf()[linearIndex];
+  }
+
+  /// @brief Reads the element addressed by indices passed as separate arguments
+  /// @tparam ...Args Types of the index arguments, each converted to size_t
+  /// @param ...args One index per mode, in mode order
+  /// @return Copy of the element at that position
+  template<typename... Args>
+  T get(Args... args) const {
+    // const, so get(vec) on a non-const tensor picks the vector overload, not this template
+    const std::vector<size_t> indices{static_cast<size_t>(args)...};
+    EXPECTS_STR(this->is_allocated(), "LocalTensor has to be allocated");
+    EXPECTS_STR(indices.size() == this->num_modes(),
+                "Number of indices must match the number of dimensions");
+
+    return this->access_local_buf()[compute_linear_index(indices)];
+  }
+
+  /// @brief Resizes the tensor to sizes passed as separate arguments, one per mode
+  /// @param ...args New size of each mode; handed to resize(const std::vector<size_t>&)
+  template<typename... Args>
+  void resize(Args... args) {
+    std::vector<size_t> new_sizes;
+    unpack(new_sizes, args...);
+    EXPECTS_STR(new_sizes.size() == (*this).num_modes(),
+                "Number of new sizes must match the number of dimensions");
+    resize(std::vector<size_t>{new_sizes});
+  }
+
+  /// @brief Changes the size of every mode, carrying data over via block() or copy_to_bigger()
+  /// @param new_sizes Requested size of each mode; every entry must be non-zero
+  void resize(const std::vector<size_t>& new_sizes) {
+    EXPECTS_STR(this->is_allocated(), "LocalTensor has to be allocated!");
+    EXPECTS_STR(this->num_modes() == new_sizes.size(),
+                "Number of new sizes must match the number of dimensions.");
+
+    for(const auto requested: new_sizes) {
+      EXPECTS_STR(requested != 0, "New size should be larger than 0.");
+    }
+
+    // Same shape requested: keep the current allocation untouched.
+    if(this->dim_sizes() == new_sizes) return;
+
+    LocalTensor<T> replacement;
+    if(!isWithinOldDimensions(new_sizes)) {
+      // Some mode grows: allocate the new shape and copy the current data into it.
+      replacement = LocalTensor<T>{new_sizes};
+      replacement.allocate(this->execution_context());
+      this->copy_to_bigger(replacement);
+    }
+    else {
+      // No mode grows: take the block that starts at the origin.
+      const std::vector<size_t> origin(new_sizes.size(), 0);
+      replacement = this->block(origin, new_sizes);
+    }
+
+    // Keep a handle to the previous tensor so it can be deallocated after the switch.
+    auto previous = *this;
+    *this         = replacement;
+    previous.deallocate();
+  }
+
+  // /// @brief
+  // /// @param sbuf
+  // /// @param block_dims
+  // /// @param block_offset
+  // /// @param copy_to_local
+  // void patch_copy_local(std::vector<T>& sbuf, const std::vector<size_t>& block_dims,
+  //                       const std::vector<size_t>& block_offset, bool copy_to_local) {
+  //   auto num_dims = local_tensor_.num_modes();
+  //   // Compute the total number of elements to copy
+  //   size_t total_elements = 1;
+  //   for(size_t dim: block_dims) { total_elements *= dim; }
+
+  //   // Initialize indices to the starting offset
+  //   std::vector<size_t> indices(block_offset);
+
+  //   for(size_t c = 0; c < total_elements; ++c) {
+  //     // Access the tensor element at the current indices
+  //     if(copy_to_local) (*this)(indices) = sbuf[c];
+  //     else sbuf[c] = (*this)(indices);
+
+  //     // Increment indices
+  //     for(int dim = num_dims - 1; dim >= 0; --dim) {
+  //       if(++indices[dim] < block_offset[dim] + block_dims[dim]) { break; }
+  //       indices[dim] = block_offset[dim];
+  //     }
+  //   }
+  // }
+
+  /// @brief Copies the elements of this tensor to the same positions of another local tensor
+  ///        Only the region common to both shapes is copied.
+  /// @param bigger_tensor Allocated destination with the same number of modes as this tensor
+  void copy_to_bigger(LocalTensor& bigger_tensor) const {
+    const auto src_dims = (*this).dim_sizes();
+    const auto dst_dims = bigger_tensor.dim_sizes();
+    EXPECTS_STR(src_dims.size() == dst_dims.size(),
+                "Both tensors must have the same number of dimensions");
+
+    // Copy only the region present in both tensors. resize() also ends up here when one mode
+    // grows while another shrinks; walking the full source extent would then address
+    // positions the destination does not have.
+    std::vector<size_t> extents(src_dims.size());
+    for(size_t i = 0; i < extents.size(); ++i) {
+      extents[i] = src_dims[i] < dst_dims[i] ? src_dims[i] : dst_dims[i];
+      if(extents[i] == 0) return; // empty overlap, nothing to copy
+    }
+
+    // Odometer over the overlap with the last mode advancing fastest. A tensor without modes
+    // performs one copy and stops instead of looping forever.
+    std::vector<size_t> indices(extents.size(), 0);
+    while(true) {
+      bigger_tensor.set(indices, (*this).get(indices));
+
+      // Reset every mode that wrapped around and carry into the one before it.
+      size_t mode = indices.size();
+      while(mode > 0 && ++indices[mode - 1] == extents[mode - 1]) { indices[--mode] = 0; }
+      if(mode == 0) return; // the first mode wrapped: every position was visited
+    }
+  }
+
+  /// @brief Copies a rectangular region of this tensor into a newly allocated LocalTensor
+  /// @param start_offsets Position of the first element of the region, one entry per mode
+  /// @param span_sizes Number of elements taken along each mode; entries must be non-zero
+  /// @return Tensor of shape span_sizes, allocated on this tensor's execution context
+  LocalTensor<T> block(const std::vector<size_t>& start_offsets,
+                       const std::vector<size_t>& span_sizes) const {
+    EXPECTS_STR((*this).is_allocated(), "LocalTensor has to be allocated!");
+    auto num_dims = (*this).num_modes();
+    EXPECTS_STR(num_dims == start_offsets.size(),
+                "Number of start offsets should match the number of dimensions.");
+    EXPECTS_STR(num_dims == span_sizes.size(),
+                "Number of span sizes should match the number of dimensions.");
+
+    // Reject empty regions and regions reaching past the end of a mode, which would make the
+    // copy loop leave the buffers. The subtraction keeps offset + span from wrapping around.
+    const auto dims = (*this).dim_sizes();
+    for(size_t i = 0; i < num_dims; ++i) {
+      EXPECTS_STR(span_sizes[i] != 0, "Span size should be larger than 0.");
+      EXPECTS_STR(start_offsets[i] <= dims[i] && span_sizes[i] <= dims[i] - start_offsets[i],
+                  "Block should be within the dimensions of the tensor.");
+    }
+
+    // Create a local tensor for the block
+    LocalTensor<T> blockTensor{span_sizes};
+    blockTensor.allocate(this->execution_context());
+
+    // Walk the block like an odometer (first mode advances fastest), keeping the position
+    // inside the block and the matching position in the source in step.
+    std::vector<size_t> indices(num_dims, 0);
+    std::vector<size_t> source_indices = start_offsets;
+
+    bool done = false;
+    while(!done) {
+      blockTensor.set(indices, (*this).get(source_indices));
+      done = true; // stays true only if every mode wrapped around
+      for(size_t i = 0; i < num_dims; ++i) {
+        if(++indices[i] < span_sizes[i]) {
+          ++source_indices[i];
+          done = false;
+          break;
+        }
+        indices[i]        = 0;
+        source_indices[i] = start_offsets[i];
+      }
+    }
+    return blockTensor;
+  }
+
+  /// @brief Two-dimensional convenience form of block(start_offsets, span_sizes)
+  /// @param x_offset Offset of the block along the first mode
+  /// @param y_offset Offset of the block along the second mode
+  /// @param x_span Number of elements taken along the first mode
+  /// @param y_span Number of elements taken along the second mode
+  /// @return The tensor produced by the vector-based block() for these offsets and spans
+  LocalTensor<T> block(size_t x_offset, size_t y_offset, size_t x_span, size_t y_span) const {
+    EXPECTS_STR(this->num_modes() == 2, "This block method only works for 2-D tensors!");
+    const std::vector<size_t> offsets{x_offset, y_offset};
+    const std::vector<size_t> spans{x_span, y_span};
+    return block(offsets, spans);
+  }
+
+  /// @brief Lists the size of every mode of this tensor
+  /// @return max_num_indices() of each tiled index space, in mode order
+  std::vector<size_t> dim_sizes() const {
+    const auto&         spaces = this->tiled_index_spaces();
+    std::vector<size_t> extents;
+    extents.reserve(spaces.size());
+    for(size_t mode = 0; mode < spaces.size(); ++mode) {
+      extents.push_back(spaces[mode].max_num_indices());
+    }
+    return extents;
+  }
+
+private:
+  /// @brief Builds the index spaces of a LocalTensor sized like the given tiled index spaces
+  /// @param tiss Index spaces whose max_num_indices() provide the dimension sizes
+  /// @return One TiledIndexSpace per entry of tiss, as produced by construct_tis_vec
+  static TiledIndexSpaceVec construct_local_tis_vec(const std::vector<TiledIndexSpace>& tiss) {
+    // static: the constructors call this while initializing the Tensor<T> base, i.e. before
+    // the base is constructed, where calling a non-static member is undefined behavior.
+    std::vector<size_t> dim_sizes;
+    for(const auto& tis: tiss) { dim_sizes.push_back(tis.max_num_indices()); }
+    return construct_tis_vec(dim_sizes);
+  }
+
+  /// @brief Builds the index spaces of a LocalTensor sized like the spaces behind the labels
+  /// @param tis_labels Labels whose tiled_index_space().max_num_indices() provide the sizes
+  /// @return One TiledIndexSpace per label, as produced by construct_tis_vec
+  static TiledIndexSpaceVec
+  construct_local_tis_vec(const std::vector<TiledIndexLabel>& tis_labels) {
+    std::vector<size_t> dim_sizes;
+    for(const auto& tis_label: tis_labels) {
+      dim_sizes.push_back(tis_label.tiled_index_space().max_num_indices());
+    }
+
+    return construct_tis_vec(dim_sizes);
+  }
+
+  /// @brief Creates one index space per dimension, tiled with a tile as large as the dimension
+  /// @param dim_sizes Number of indices of each dimension
+  /// @return TiledIndexSpace{IndexSpace{range(size)}, size} for every entry of dim_sizes
+  static TiledIndexSpaceVec construct_tis_vec(const std::vector<size_t>& dim_sizes) {
+    TiledIndexSpaceVec local_tis_vec;
+    for(const auto& dim_size: dim_sizes) {
+      local_tis_vec.push_back(TiledIndexSpace{IndexSpace{range(dim_size)}, dim_size});
+    }
+
+    return local_tis_vec;
+  }
+
+  /// @brief Computes the position of an element in the flat local buffer, checking its bounds
+  /// @param indices One index per mode; each must be smaller than the size of its mode
+  /// @return The linear position to the local memory manager
+  size_t compute_linear_index(const std::vector<size_t>& indices) const {
+    const auto                num_modes = this->num_modes();
+    const std::vector<size_t> dims      = (*this).dim_sizes();
+    size_t                    index     = 0;
+
+    // Row-major offset (last mode fastest) in Horner form; each index is range-checked first.
+    for(size_t i = 0; i < num_modes; ++i) {
+      EXPECTS_STR(indices[i] < dims[i], "Index is out of bounds for its dimension");
+      index = index * dims[i] + indices[i];
+    }
+
+    return index;
+  }
+
+  /// @brief Checks that no entry exceeds the current size of the corresponding mode
+  /// @param indices Values compared entry-wise against dim_sizes()
+  /// @return true when indices[i] <= dim_sizes()[i] for every i, false otherwise
+  bool isWithinOldDimensions(const std::vector<size_t>& indices) const {
+    const auto current = this->dim_sizes();
+    size_t     pos     = 0;
+    while(pos < indices.size() && indices[pos] <= current[pos]) { ++pos; }
+
+    // The scan stops early at the first entry that is larger than its mode.
+    return pos == indices.size();
+  }
+
+  /// @brief Single-index case of the unpacking used by the variadic get() and resize()
+  /// @param indices A reference to the vector of indices
+  /// @param index The only (or last) index of the argument list
+  void unpack(std::vector<size_t>& indices, size_t index) { indices.push_back(index); }
+
+  /// @brief Appends a variadic list of indices to a vector, in argument order
+  /// @tparam ...Args Types of the trailing arguments, each convertible to size_t
+  /// @param indices A reference to the vector of indices
+  /// @param next First index of the list
+  /// @param ...rest Remaining indices, appended after next
+  template<typename... Args>
+  void unpack(std::vector<size_t>& indices, size_t next, Args... rest) {
+    indices.push_back(next);
+    (indices.push_back(rest), ...);
+  }
+};
+
+} // namespace tamm
diff --git a/src/tamm/tamm.hpp b/src/tamm/tamm.hpp
index c24093916..27635bf70 100644
--- a/src/tamm/tamm.hpp
+++ b/src/tamm/tamm.hpp
@@ -19,6 +19,7 @@
 #include "tamm/execution_context.hpp"
 #include "tamm/index_space.hpp"
 #include "tamm/labeled_tensor.hpp"
+#include "tamm/local_tensor.hpp"
 #include "tamm/ops.hpp"
 #include "tamm/rmm_memory_pool.hpp"
 #include "tamm/scheduler.hpp"
diff --git a/src/tamm/tensor.hpp b/src/tamm/tensor.hpp
index 494edfda0..b824ebf52 100644
--- a/src/tamm/tensor.hpp
+++ b/src/tamm/tensor.hpp
@@ -662,192 +662,4 @@ class IndexedTensor: public std::pair<Tensor<T>, IndexVector> {
 template<typename T>
 IndexedTensor(Tensor<T>, IndexVector) -> IndexedTensor<T>;
 
-/// @brief Creates a local copy of the distributed tensor
-/// @tparam T Data type for the tensor being made local
-template<typename T>
-class LocalTensor {
-public:
-  LocalTensor()                              = default;
-  LocalTensor(LocalTensor&&)                 = default;
-  LocalTensor(const LocalTensor&)            = default;
-  LocalTensor& operator=(LocalTensor&&)      = default;
-  LocalTensor& operator=(const LocalTensor&) = default;
-  ~LocalTensor()                             = default;
-
-  LocalTensor(Tensor<T> dist_tensor): dist_tensor_(dist_tensor) { construct_local_tensor(); }
-
-  /// @brief Overload for the parenthesis operation that gets a variadic template input for the
-  /// accessing indices
-  /// @tparam ...Args Variadic template for the indices to be access
-  /// @param ...args Input indices for accessing
-  /// @return A mutable reference to the value for the corresponding index in the local memory
-  /// region
-  template<typename... Args>
-  T& operator()(Args... args) {
-    std::vector<size_t> indices;
-    unpack(indices, args...);
-    EXPECTS_STR(indices.size() == local_tensor_.num_modes(),
-                "Number of indices must match the number of dimensions");
-    size_t linearIndex = compute_linear_index(indices);
-    return local_tensor_.access_local_buf()[linearIndex];
-  }
-
-  /// @brief Overload for the parenthesis operation that gets a variadic template input for the
-  /// accessing indices
-  /// @tparam ...Args Variadic template for the indices to be access
-  /// @param ...args Input indices for accessing
-  /// @return
-  template<typename... Args>
-  const T& operator()(Args... args) const {
-    std::vector<size_t> indices;
-    unpack(indices, args...);
-    EXPECTS_STR(indices.size() == local_tensor_.num_modes(),
-                "Number of indices must match the number of dimensions");
-    size_t linearIndex = compute_linear_index(indices);
-    return local_tensor_.access_local_buf()[linearIndex];
-  }
-
-  /// @brief Overload for the parenthesis operation that gets an index vector for the access
-  /// @param indices Vector of indices to be access
-  /// @return A mutable reference to the value for the corresponding index in the local memory
-  /// region
-  T& operator()(const std::vector<size_t>& indices) {
-    EXPECTS_STR(indices.size() == local_tensor_.num_modes(),
-                "Number of indices must match the number of dimensions");
-    size_t linearIndex = compute_linear_index(indices);
-    return local_tensor_.access_local_buf()[linearIndex];
-  }
-
-  /// @brief Overload for the parenthesis operation that gets an index vector for the access
-  /// @param indices Vector of indices to be access
-  /// @return An immutable reference to the value for the corresponding index in the local memory
-  /// region
-  template<typename... Args>
-  const T& operator()(const std::vector<size_t>& indices) const {
-    EXPECTS_STR(indices.size() == local_tensor_.num_modes(),
-                "Number of indices must match the number of dimensions");
-    size_t linearIndex = compute_linear_index(indices);
-    return local_tensor_.access_local_buf()[linearIndex];
-  }
-
-  void write_back_to_dist() { fill_distributed_tensor(); }
-
-private:
-  /// @brief reference to the source distributed tensor
-  Tensor<T> dist_tensor_;
-  /// @brief reference to the local tensor created from the source tensor
-  Tensor<T> local_tensor_;
-
-  /// @brief Method for constructing the local copy of the source tensor using local memory manager
-  ///        The construction start with constructing new tiled index spaces from the original
-  ///        tensor and constructing a new local tensor that uses local memory manager
-  void construct_local_tensor() {
-    auto               tiss = dist_tensor_.tiled_index_spaces();
-    TiledIndexSpaceVec local_tiss;
-    for(const auto& tis: tiss) {
-      local_tiss.push_back(TiledIndexSpace{tis.index_space(), tis.index_space().max_num_indices()});
-    }
-
-    EXPECTS(dist_tensor_.is_allocated());
-    auto ec = dist_tensor_.execution_context();
-
-    ExecutionContext local_ec{ec->pg(), DistributionKind::nw, MemoryManagerKind::local};
-
-    local_tensor_ = Tensor<T>{local_tiss};
-    local_tensor_.allocate(ec);
-    fill_local_tensor();
-  }
-
-  /// @brief Method for constructing the linearized index for a given location on the local tensor
-  /// @param indices The index for the corresponding location wanted to be accessed
-  /// @return The linear position to the local memory manager
-  size_t compute_linear_index(const std::vector<size_t>& indices) const {
-    auto                num_modes = local_tensor_.num_modes();
-    std::vector<size_t> dims;
-    for(auto tis: local_tensor_.tiled_index_spaces()) { dims.push_back(tis.max_num_indices()); }
-    size_t index  = 0;
-    size_t stride = 1;
-
-    for(size_t i = 0; i < num_modes; ++i) {
-      index += indices[num_modes - 1 - i] * stride;
-      stride *= dims[num_modes - 1 - i];
-    }
-
-    return index;
-  }
-
-  /// @brief Method for filling the local tensor data with the original distributed tensor.
-  ///        We first construct a loop nest and to a get on all blocks that are then written to the
-  ///        corresponding place in the new local tensor
-  void fill_local_tensor() {
-    for(const auto& blockid: dist_tensor_.loop_nest()) {
-      const tamm::TAMM_SIZE size = dist_tensor_.block_size(blockid);
-      std::vector<T>        buf(size);
-      dist_tensor_.get(blockid, buf);
-      auto block_dims   = dist_tensor_.block_dims(blockid);
-      auto block_offset = dist_tensor_.block_offsets(blockid);
-      patch_copy_local(buf, block_dims, block_offset, true);
-    }
-  }
-
-  /// @brief Method for filling the original distributed tensor data with the local tensor.
-  ///        We first construct a loop nest and to a get on all blocks that are then written to the
-  ///        corresponding place in the distributed tensor
-  void fill_distributed_tensor() {
-    for(const auto& blockid: dist_tensor_.loop_nest()) {
-      const tamm::TAMM_SIZE size = dist_tensor_.block_size(blockid);
-      std::vector<T>        buf(size);
-      dist_tensor_.get(blockid, buf);
-      auto block_dims   = dist_tensor_.block_dims(blockid);
-      auto block_offset = dist_tensor_.block_offsets(blockid);
-      patch_copy_local(buf, block_dims, block_offset, false);
-      dist_tensor_.put(blockid, buf);
-    }
-  }
-
-  /// @brief A helper method that copy a block of that to a corresponding patch in the local copy
-  /// @param sbuf Block data that wants to be copied
-  /// @param block_dims Block dimensions to find the accurate location in the linearized local
-  /// tensor
-  /// @param block_offset The offsets of the input data from the original multidimensional tensor
-  void patch_copy_local(std::vector<T>& sbuf, const std::vector<size_t>& block_dims,
-                        const std::vector<size_t>& block_offset, bool copy_to_local) {
-    auto num_dims = local_tensor_.num_modes();
-    // Compute the total number of elements to copy
-    size_t total_elements = 1;
-    for(size_t dim: block_dims) { total_elements *= dim; }
-
-    // Initialize indices to the starting offset
-    std::vector<size_t> indices(block_offset);
-
-    for(size_t c = 0; c < total_elements; ++c) {
-      // Access the tensor element at the current indices
-      if(copy_to_local) (*this)(indices) = sbuf[c];
-      else sbuf[c] = (*this)(indices);
-
-      // Increment indices
-      for(int dim = num_dims - 1; dim >= 0; --dim) {
-        if(++indices[dim] < block_offset[dim] + block_dims[dim]) { break; }
-        indices[dim] = block_offset[dim];
-      }
-    }
-  }
-
-  /// @brief Helper method that will unpack the variadic template for operator()
-  /// @param indices A reference to the vector of indices
-  /// @param index The last index that is provided to the operator()
-  void unpack(std::vector<size_t>& indices, size_t index) { indices.push_back(index); }
-
-  /// @brief Helper method that will unpack the variadic template for operator()
-  /// @tparam ...Args The variadic template from the arguments to the operator()
-  /// @param indices A reference to the vector of indices
-  /// @param next Unpacked index for the operator()
-  /// @param ...rest The rest of the variadic template that will be unpacked in the recursive calls
-  template<typename... Args>
-  void unpack(std::vector<size_t>& indices, size_t next, Args... rest) {
-    indices.push_back(next);
-    unpack(indices, rest...);
-  }
-};
-
 } // namespace tamm
diff --git a/tests/tamm/Test_LocalTensor.cpp b/tests/tamm/Test_LocalTensor.cpp
index 619bcd719..267823062 100644
--- a/tests/tamm/Test_LocalTensor.cpp
+++ b/tests/tamm/Test_LocalTensor.cpp
@@ -4,8 +4,56 @@
 
 using namespace tamm;
 
+bool check_local_tis_sizes(const TiledIndexSpace& l_tis, size_t expected_size) {
+  if(l_tis.max_num_indices() != expected_size || l_tis.tile_size(0) != expected_size) return false;
+  return l_tis.input_tile_size() == expected_size; // true only when all three sizes agree
+}
+
 template<typename T>
-void test_local_tensor(Scheduler& sch, size_t N, Tile tilesize) {
+bool check_local_tensor_sizes(const LocalTensor<T>&      l_tensor,
+                              const std::vector<size_t>& expected_sizes) {
+  EXPECTS_STR(l_tensor.num_modes() == expected_sizes.size(),
+              "Expected sizes should be same as the dimensions of the input LocalTensor.");
+
+  // Every mode's index space must report the expected size; stop at the first mismatch.
+  const auto spaces = l_tensor.tiled_index_spaces();
+  size_t     mode   = 0;
+  for(const auto& space: spaces) {
+    if(!check_local_tis_sizes(space, expected_sizes.at(mode))) { return false; }
+    ++mode;
+  }
+
+  return true;
+}
+
+/// @brief Checks that every element of an allocated LocalTensor equals value
+/// @param l_tensor Tensor whose flat local buffer is scanned
+/// @param value Value every element is compared against
+/// @return true when all elements compare equal to value, false at the first mismatch
+template<typename T>
+bool check_local_tensor_values(const LocalTensor<T>& l_tensor, T value) {
+  EXPECTS_STR(l_tensor.is_allocated(), "LocalTensor should be allocated to check the values.");
+
+  // size_t (not int) so the element count cannot overflow and matches the loop index type.
+  size_t num_elements = 1;
+  for(auto tis_sz: l_tensor.dim_sizes()) { num_elements *= tis_sz; }
+
+  // Scan the flat local buffer; the first mismatch decides the result.
+  auto* local_buf = l_tensor.access_local_buf();
+  for(size_t i = 0; i < num_elements; i++) {
+    if(local_buf[i] != value) { return false; }
+  }
+  return true;
+}
+
+template<typename T>
+void test_local_tensor_constructors(Scheduler& sch, size_t N, Tile tilesize) {
+  // LocalTensor construction
+  // - TIS list
+  // - TIS vec
+  // - Labels
+  // - Sizes
+
   TiledIndexSpace tis1{IndexSpace{range(N)}, tilesize};
 
   auto [i, j, k, l, m] = tis1.labels<5>("all");
@@ -14,25 +62,178 @@ void test_local_tensor(Scheduler& sch, size_t N, Tile tilesize) {
   Tensor<T> B{k, l};
   Tensor<T> C{i, j, l};
 
-  sch.allocate(A, B, C)(A() = 1.0)(B() = 2.0)(C() = 3.0).execute();
+  sch.allocate(A, B, C).execute();
+  EXPECTS_STR(A.is_allocated() && B.is_allocated() && C.is_allocated(),
+              "All distributed tensors should be able to allocate!");
+
+  ExecutionContext local_ec{sch.ec().pg(), DistributionKind::nw, MemoryManagerKind::local};
+  Scheduler        sch_local{local_ec};
+
+  LocalTensor<T> local_A{tis1, tis1, tis1};
+  LocalTensor<T> local_B{B.tiled_index_spaces()};
+  LocalTensor<T> local_C{i, j, l};
+  LocalTensor<T> local_D{N, N, N};
+  LocalTensor<T> local_E{10, 10, 10};
+
+  sch_local.allocate(local_A, local_B, local_C, local_D, local_E).execute();
+
+  EXPECTS_STR(local_A.is_allocated() && local_B.is_allocated() && local_C.is_allocated() &&
+                local_D.is_allocated() && local_E.is_allocated(),
+              "All local tensors should be able to allocate!");
+
+  EXPECTS_STR(check_local_tensor_sizes(local_A, {N, N, N}), "Local_A is not correctly created!");
+  EXPECTS_STR(check_local_tensor_sizes(local_B, {N, N}), "Local_B is not correctly created!");
+  EXPECTS_STR(check_local_tensor_sizes(local_C, {N, N, N}), "Local_C is not correctly created!");
+  EXPECTS_STR(check_local_tensor_sizes(local_D, {N, N, N}), "Local_D is not correctly created!");
+  EXPECTS_STR(check_local_tensor_sizes(local_E, {10, 10, 10}), "Local_E is not correctly created!");
+}
+
+template<typename T>
+void test_local_tensor_block(ExecutionContext& ec, size_t N) {
+  // Checks LocalTensor::block() on a 3-D tensor and, through the 2-D overload, on a matrix.
+  // Both blocks start at the origin with a span of 4 per mode, so N is presumed to be >= 4.
+  // TODO: also cover non-zero offsets and spans of size 0, which are not exercised here.
+
+  ExecutionContext local_ec{ec.pg(), DistributionKind::nw, MemoryManagerKind::local};
+  Scheduler        sch_local{local_ec};
+
+  LocalTensor<T> local_A{N, N, N};
+  LocalTensor<T> local_B{N, N};
+
+  sch_local.allocate(local_A, local_B)(local_A() = 42.0)(local_B() = 21.0).execute();
+
+  auto local_C = local_A.block({0, 0, 0}, {4, 4, 4});
+  auto local_D = local_B.block(0, 0, 4, 4);
+
+  EXPECTS_STR(check_local_tensor_sizes(local_C, {4, 4, 4}), "Local_C is not correctly created!");
+  EXPECTS_STR(check_local_tensor_sizes(local_D, {4, 4}), "Local_D is not correctly created!");
+
+  EXPECTS_STR(check_local_tensor_values(local_C, 42.0), "Local_C doesn't have correct values!");
+  EXPECTS_STR(check_local_tensor_values(local_D, 21.0), "Local_D doesn't have correct values!");
+}
+
+template<typename T>
+void test_local_tensor_resize(ExecutionContext& ec, size_t N) {
+  // Resize
+  // - Smaller
+  // - Larger
+  // - Same size
+  // - all 0 size
+  // - change dim?
+
+  ExecutionContext local_ec{ec.pg(), DistributionKind::nw, MemoryManagerKind::local};
+  Scheduler        sch_local{local_ec};
 
-  LocalTensor A_local{A};
-  LocalTensor B_local{B};
-  LocalTensor C_local{C};
+  LocalTensor<T> local_A{N, N, N};
+  LocalTensor<T> local_B{N, N};
 
-  std::cout << "A_local" << std::endl;
-  for(size_t i_idx = 0; i_idx < N; i_idx++) {
-    for(size_t j_idx = 0; j_idx < N; j_idx++) {
-      for(size_t k_idx = 0; k_idx < N; k_idx++) {
-        std::cout << A_local(i_idx, j_idx, k_idx) << "\t";
-        A_local(i_idx, j_idx, k_idx) = 42.0;
+  sch_local.allocate(local_A, local_B)(local_A() = 42.0)(local_B() = 21.0).execute();
+
+  local_A.resize(5, 5, 5);
+  EXPECTS_STR(check_local_tensor_sizes(local_A, {5, 5, 5}), "Local_A is not correctly created!");
+  EXPECTS_STR(check_local_tensor_values(local_A, 42.0), "Local_A doesn't have correct values!");
+
+  auto* tensor_ptr = local_A.base_ptr();
+  local_A.resize(5, 5, 5);
+  auto* tensor_resize_ptr = local_A.base_ptr();
+
+  EXPECTS_STR(tensor_ptr == tensor_resize_ptr,
+              "Resize into same size should return the old tensor!");
+
+  local_A.resize(N, N, N);
+  EXPECTS_STR(check_local_tensor_sizes(local_A, {N, N, N}), "Local_A is not correctly created!");
+  EXPECTS_STR(check_local_tensor_values(local_A.block({0, 0, 0}, {5, 5, 5}), 42.0),
+              "Local_A doesn't have correct values!");
+
+  // local_A.resize(0,0,0);
+
+  // local_A.resize(5,5);
+}
+
+template<typename T>
+void test_local_tensor_accessor(ExecutionContext& ec, size_t N) {
+  // Set/Get
+  // - Single access
+  // - Looped access
+
+  ExecutionContext local_ec{ec.pg(), DistributionKind::nw, MemoryManagerKind::local};
+  Scheduler        sch_local{local_ec};
+
+  LocalTensor<T> local_A{N, N, N};
+  LocalTensor<T> local_B{N, N};
+
+  sch_local.allocate(local_A, local_B)(local_A() = 42.0)(local_B() = 21.0).execute();
+
+  EXPECTS_STR(local_A.get(0, 0, 0) == 42.0, "The get value doesn't match the expected value.");
+
+  local_A.set({0, 0, 0}, 1.0);
+  EXPECTS_STR(local_A.get(0, 0, 0) == 1.0, "The get value doesn't match the expected value.");
+  local_A.set({0, 0, 0}, 42.0);
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      for(size_t k = 0; k < N; k++) {
+        EXPECTS_STR(local_A.get(i, j, k) == 42.0,
+                    "The get value doesn't match the expected value.");
+        local_A.set({i, j, k}, local_B.get(i, j));
+        EXPECTS_STR(local_A.get(i, j, k) == 21.0,
+                    "The get value doesn't match the expected value.");
       }
     }
-    std::cout << std::endl;
   }
-  A_local.write_back_to_dist();
+}
+
+template<typename T>
+void test_local_tensor(Scheduler& sch, size_t N, Tile tilesize) {
+  TiledIndexSpace tis1{IndexSpace{range(N)}, tilesize};
+
+  auto [i, j, k, l, m] = tis1.labels<5>("all");
+
+  Tensor<T> A{i, j, k};
+  Tensor<T> B{k, l};
+  Tensor<T> C{i, j, l};
+
+  sch.allocate(A, B, C)(A() = 1.0)(B() = 2.0)(C() = 3.0).execute();
+
+  ExecutionContext local_ec{sch.ec().pg(), DistributionKind::nw, MemoryManagerKind::local};
+
+  Scheduler sch_local{local_ec};
+
+  LocalTensor<T> new_local1{i, j, k};
+  LocalTensor<T> new_local2{tis1, tis1, tis1};
+  LocalTensor<T> new_local3{N, N, N};
+  LocalTensor<T> new_local4{A.tiled_index_spaces()};
+
+  sch_local
+    .allocate(new_local1, new_local2, new_local3, new_local4)(new_local1() = 42.0)(
+      new_local2() = 21.0)(new_local3() = 1.0)(new_local4() = 2.0)
+
+    // .deallocate()
+    .execute();
+
+  // std::cout << "A_local" << std::endl;
+  new_local3.init(42.0);
+
+  std::cout << "value at 5,5,5 - " << new_local3.get(5, 5, 5) << std::endl;
+  new_local3.set({5, 5, 5}, 1.0);       // memset val
+  auto val = new_local3.get({5, 5, 5}); // memset val
+
+  std::cout << "new value at 5,5,5 - " << new_local3.get(5, 5, 5) << std::endl;
+  std::cout << "new_local4* before resize - " << new_local4.base_ptr() << std::endl;
+  new_local4.resize(N, N, N); // vector.resize()? eigen.resize()?
+
+  std::cout << "new_local4* after resize - " << new_local4.base_ptr() << std::endl;
+  std::cout << "----------------------------------------------------" << std::endl;
+  std::cout << "new_local4* before resize - " << new_local4.base_ptr() << std::endl;
+  new_local4.resize(N + 5, N + 5, N + 5); // vector.resize()? eigen.resize()?
+  std::cout << "new_local4* after resize - " << new_local4.base_ptr() << std::endl;
+  auto new_local5 = new_local3.block({5, 5, 5}, {4, 4, 4});
 
-  print_tensor(A);
+  print_tensor(new_local1);
+  print_tensor(new_local2);
+  print_tensor(new_local3);
+  print_tensor(new_local4);
+  print_tensor(new_local5);
 }
 
 int main(int argc, char* argv[]) {
@@ -66,7 +267,11 @@ int main(int argc, char* argv[]) {
     std::cout << std::endl << std::endl;
   }
 
-  test_local_tensor<double>(sch, is_size, tile_size);
+  // test_local_tensor<double>(sch, is_size, tile_size);
+  test_local_tensor_constructors<double>(sch, is_size, tile_size);
+  test_local_tensor_block<double>(ec, is_size);
+  test_local_tensor_resize<double>(ec, is_size);
+  test_local_tensor_accessor<double>(ec, is_size);
 
   tamm::finalize();