diff --git a/MANIFEST.in b/MANIFEST.in
index 46a874d6f7..3969afe5d8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,6 @@
 recursive-include aeon *.py
-recursive-include aeon/benchmarking/example_results *.csv
 recursive-include aeon/datasets *.csv *.arff *.txt *.ts *.tsv *.tsf
+recursive-include aeon/testing/example_results_files *.csv
 include aeon/registry/README.md
 include .coveragerc
 include conftest.py
diff --git a/aeon/benchmarking/tests/test_results_loaders.py b/aeon/benchmarking/tests/test_results_loaders.py
index dcc271df09..b6deaf1d82 100644
--- a/aeon/benchmarking/tests/test_results_loaders.py
+++ b/aeon/benchmarking/tests/test_results_loaders.py
@@ -7,6 +7,7 @@
 import pytest
 from pytest import raises

+import aeon
 from aeon.benchmarking.results_loaders import (
     CONNECTION_ERRORS,
     NAME_ALIASES,
@@ -72,8 +73,10 @@ def test_get_available_estimators():
 cls = ["HIVECOTEV2", "FreshPRINCE", "InceptionTime"]
 data = ["Chinatown", "ItalyPowerDemand", "Tools"]

-test_path = os.path.dirname(__file__)
-data_path = os.path.join(test_path, "../example_results/")
+data_path = os.path.join(
+    os.path.dirname(aeon.__file__),
+    "testing/example_results_files/",
+)


 @pytest.mark.skipif(
diff --git a/aeon/classification/compose/tests/test_pipeline.py b/aeon/classification/compose/tests/test_pipeline.py
index 84d02308f6..9dea384eb5 100644
--- a/aeon/classification/compose/tests/test_pipeline.py
+++ b/aeon/classification/compose/tests/test_pipeline.py
@@ -4,6 +4,7 @@

 import numpy as np
 import pytest
+from numpy.testing import assert_array_almost_equal
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import StandardScaler

@@ -16,7 +17,6 @@
     make_example_3d_numpy_list,
 )
 from aeon.testing.mock_estimators import MockCollectionTransformer
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
 from aeon.transformations.collection import (
     AutocorrelationFunctionTransformer,
     HOG1DTransformer,
@@ -61,7 +61,7 @@ def test_classifier_pipeline(transformers):
         X_test = t.transform(X_test)
     c.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, c.predict(X_test))
+    assert_array_almost_equal(y_pred, c.predict(X_test))


 @pytest.mark.parametrize(
@@ -99,7 +99,7 @@ def test_sklearn_classifier_pipeline(transformers):
         X_test = t.transform(X_test)
     c.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, c.predict(X_test))
+    assert_array_almost_equal(y_pred, c.predict(X_test))


 def test_unequal_tag_inference():
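Editor's note: throughout these test modules the private `_assert_array_almost_equal` wrapper is replaced with `numpy.testing.assert_array_almost_equal`, which it merely delegated to. A minimal sketch of the public call (the array values here are illustrative only):

    import numpy as np
    from numpy.testing import assert_array_almost_equal

    # Two prediction vectors that agree to the default 6 decimal places.
    y_a = np.array([0.0, 1.0, 1.0000001])
    y_b = np.array([0.0, 1.0, 1.0])
    assert_array_almost_equal(y_a, y_b)

    # decimal loosens the tolerance; err_msg is prepended to any failure
    # report, which is how the estimator checks label the failing method.
    assert_array_almost_equal(y_a, y_b, decimal=2, err_msg="predict output differs")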
diff --git a/aeon/clustering/compose/tests/test_pipeline.py b/aeon/clustering/compose/tests/test_pipeline.py
index 0c2d26bdc0..73f751944b 100644
--- a/aeon/clustering/compose/tests/test_pipeline.py
+++ b/aeon/clustering/compose/tests/test_pipeline.py
@@ -4,6 +4,7 @@

 import numpy as np
 import pytest
+from numpy.testing import assert_array_almost_equal
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler

@@ -14,7 +15,6 @@
     make_example_3d_numpy_list,
 )
 from aeon.testing.mock_estimators import MockCollectionTransformer
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
 from aeon.transformations.collection import (
     AutocorrelationFunctionTransformer,
     HOG1DTransformer,
@@ -60,7 +60,7 @@ def test_clusterer_pipeline(transformers):
         X_test = t.transform(X_test)
     c.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, c.predict(X_test))
+    assert_array_almost_equal(y_pred, c.predict(X_test))


 @pytest.mark.parametrize(
@@ -98,7 +98,7 @@ def test_sklearn_clusterer_pipeline(transformers):
         X_test = t.transform(X_test)
     c.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, c.predict(X_test))
+    assert_array_almost_equal(y_pred, c.predict(X_test))


 def test_unequal_tag_inference():
diff --git a/aeon/performance_metrics/tests/test_stats.py b/aeon/performance_metrics/tests/test_stats.py
index 44560c7691..012cbdbeb8 100644
--- a/aeon/performance_metrics/tests/test_stats.py
+++ b/aeon/performance_metrics/tests/test_stats.py
@@ -13,7 +13,7 @@

 data_path = os.path.join(
     os.path.dirname(aeon.__file__),
-    "benchmarking/example_results/",
+    "testing/example_results_files/",
 )
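Editor's note: both test modules now resolve the example result files relative to the installed `aeon` package rather than relative to the test module, so the same lookup works from any working directory. A small sketch of the pattern, assuming `aeon` is importable:

    import os

    import aeon

    # Locate a data directory shipped inside the package, independent of
    # the current working directory or the calling module's location.
    data_path = os.path.join(
        os.path.dirname(aeon.__file__),
        "testing/example_results_files/",
    )
    print(os.path.isdir(data_path))  # True for an installed or editable aeon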
diff --git a/aeon/regression/compose/tests/test_pipeline.py b/aeon/regression/compose/tests/test_pipeline.py
index 6b93be0976..edafa9eecc 100644
--- a/aeon/regression/compose/tests/test_pipeline.py
+++ b/aeon/regression/compose/tests/test_pipeline.py
@@ -4,6 +4,7 @@

 import numpy as np
 import pytest
+from numpy.testing import assert_array_almost_equal
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.preprocessing import StandardScaler

@@ -16,7 +17,6 @@
     make_example_3d_numpy_list,
 )
 from aeon.testing.mock_estimators import MockCollectionTransformer
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
 from aeon.transformations.collection import (
     AutocorrelationFunctionTransformer,
     HOG1DTransformer,
@@ -61,7 +61,7 @@ def test_regressor_pipeline(transformers):
         X_test = t.transform(X_test)
     r.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, r.predict(X_test))
+    assert_array_almost_equal(y_pred, r.predict(X_test))


 @pytest.mark.parametrize(
@@ -99,7 +99,7 @@ def test_sklearn_regressor_pipeline(transformers):
         X_test = t.transform(X_test)
     r.fit(X_train, y_train)

-    _assert_array_almost_equal(y_pred, r.predict(X_test))
+    assert_array_almost_equal(y_pred, r.predict(X_test))


 def test_unequal_tag_inference():
diff --git a/aeon/segmentation/_eagglo.py b/aeon/segmentation/_eagglo.py
index d482c47d1b..67605b7761 100644
--- a/aeon/segmentation/_eagglo.py
+++ b/aeon/segmentation/_eagglo.py
@@ -72,13 +72,11 @@ class EAggloSegmenter(BaseSegmenter):

     Examples
     --------
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
+    >>> from aeon.testing.data_generation import make_example_dataframe_series
     >>> from aeon.segmentation import EAggloSegmenter
-    >>> X = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4],
-    ...     random_state = 10)
+    >>> X = make_example_dataframe_series(n_channels=2, random_state=10)
     >>> model = EAggloSegmenter()
-    >>> model.fit_predict(X, axis=0)
-    array([0, 0, 0, 1, 1, 1, 1])
+    >>> y = model.fit_predict(X, axis=0)
     """

     _tags = {
diff --git a/aeon/segmentation/_ggs.py b/aeon/segmentation/_ggs.py
index 6fef577346..d8bdd21d71 100644
--- a/aeon/segmentation/_ggs.py
+++ b/aeon/segmentation/_ggs.py
@@ -435,12 +435,10 @@ class GreedyGaussianSegmenter(BaseSegmenter):

     Examples
     --------
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
+    >>> from aeon.testing.data_generation import make_example_dataframe_series
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> from aeon.segmentation import GreedyGaussianSegmenter
-    >>> X = piecewise_normal_multivariate(lengths=[10, 10, 10, 10],
-    ...     means=[[0.0, 1.0], [11.0, 10.0], [5.0, 3.0], [2.0, 2.0]],
-    ...     variances=0.5)
+    >>> X = make_example_dataframe_series(n_channels=2, random_state=10)
     >>> X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
     >>> ggs = GreedyGaussianSegmenter(k_max=3, max_shuffles=5)
     >>> y = ggs.fit_predict(X_scaled, axis=0)
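Editor's note: the segmenter docstrings above now build their example input with `make_example_dataframe_series`. A runnable sketch of the updated pattern, mirroring the new doctests:

    from sklearn.preprocessing import MinMaxScaler

    from aeon.segmentation import GreedyGaussianSegmenter
    from aeon.testing.data_generation import make_example_dataframe_series

    # A reproducible two-channel example series as a pandas DataFrame.
    X = make_example_dataframe_series(n_channels=2, random_state=10)

    # Scale to [0, 1] and segment, exactly as in the updated doctest.
    X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
    ggs = GreedyGaussianSegmenter(k_max=3, max_shuffles=5)
    y = ggs.fit_predict(X_scaled, axis=0)  # one segment label per time point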
diff --git a/aeon/segmentation/_igts.py b/aeon/segmentation/_igts.py
index 20d632435c..b59bbcfbbf 100644
--- a/aeon/segmentation/_igts.py
+++ b/aeon/segmentation/_igts.py
@@ -152,19 +152,6 @@ class _IGTS:
     "Information gain-based metric for recognizing transitions in human activities.",
     Pervasive and Mobile Computing, 38, 92-109, (2017).
     https://www.sciencedirect.com/science/article/abs/pii/S1574119217300081
-
-    Examples
-    --------
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
-    >>> from sklearn.preprocessing import MinMaxScaler
-    >>> from aeon.segmentation import InformationGainSegmenter
-    >>> X = piecewise_normal_multivariate(lengths=[10, 10, 10, 10],
-    ...     means=[[0.0, 1.0], [11.0, 10.0], [5.0, 3.0], [2.0, 2.0]],
-    ...     variances=0.5)
-    >>> X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
-    >>> igts = InformationGainSegmenter(k_max=3, step=2)
-    >>> y = igts.fit_predict(X_scaled, axis=0)
-
     """

     # init attributes
@@ -256,7 +243,8 @@ def find_change_points(self, X: npt.ArrayLike) -> list[int]:
         current_change_points = self.identity(X)

         for k in range(self.k_max):
-            ig_max = 0
+            best_candidate = -1
+            ig_max = -1
             # find a point which maximizes score
             for candidate in self.get_candidates(n_samples, current_change_points):
                 try_change_points = {candidate}
@@ -335,12 +323,10 @@ class InformationGainSegmenter(BaseSegmenter):

     Examples
     --------
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
+    >>> from aeon.testing.data_generation import make_example_dataframe_series
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> from aeon.segmentation import InformationGainSegmenter
-    >>> X = piecewise_normal_multivariate(lengths=[10, 10, 10, 10],
-    ...     means=[[0.0, 1.0], [11.0, 10.0], [5.0, 3.0], [2.0, 2.0]],
-    ...     variances=0.5)
+    >>> X = make_example_dataframe_series(n_channels=2, random_state=10)
     >>> X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
     >>> igts = InformationGainSegmenter(k_max=3, step=2)
     >>> y = igts.fit_predict(X_scaled, axis=0)
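Editor's note: the `find_change_points` hunk above also fixes a selection bug: with `ig_max = 0`, a candidate whose information gain was not strictly positive could never be selected. Initialising both the best candidate and the best gain to `-1` sentinels avoids that. A minimal sketch of the pattern with hypothetical names (not the aeon implementation):

    def _argmax_with_sentinel(candidates, score):
        """Pick the best candidate even when every score is <= 0.

        Starting best_score at 0 (the old behaviour) silently rejects
        candidates with non-positive scores; a -1 sentinel does not,
        assuming scores (like information gain) are never below -1.
        """
        best_candidate = -1
        best_score = -1.0
        for candidate in candidates:
            s = score(candidate)
            if s > best_score:
                best_candidate, best_score = candidate, s
        return best_candidate, best_score


    # Every gain is 0.0: the old initialisation would select no candidate.
    print(_argmax_with_sentinel([3, 7, 11], lambda c: 0.0))  # (3, 0.0)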
diff --git a/aeon/testing/utils/_cicd_numba_caching.py b/aeon/testing/_cicd_numba_caching.py
similarity index 100%
rename from aeon/testing/utils/_cicd_numba_caching.py
rename to aeon/testing/_cicd_numba_caching.py
diff --git a/aeon/testing/data_generation/__init__.py b/aeon/testing/data_generation/__init__.py
index 67f91f5359..4cd1824071 100644
--- a/aeon/testing/data_generation/__init__.py
+++ b/aeon/testing/data_generation/__init__.py
@@ -14,15 +14,6 @@
     "make_example_2d_numpy_series",
     "make_example_pandas_series",
     "make_example_dataframe_series",
-    # other
-    "piecewise_normal_multivariate",
-    "piecewise_normal",
-    "piecewise_multinomial",
-    "piecewise_poisson",
-    "labels_with_repeats",
-    "label_piecewise_normal",
-    "_make_hierarchical",
-    "_bottom_hier_datagen",
 ]


@@ -41,15 +32,3 @@
     make_example_dataframe_series,
     make_example_pandas_series,
 )
-from aeon.testing.data_generation.hierarchical import (
-    _bottom_hier_datagen,
-    _make_hierarchical,
-)
-from aeon.testing.data_generation.segmentation import (
-    label_piecewise_normal,
-    labels_with_repeats,
-    piecewise_multinomial,
-    piecewise_normal,
-    piecewise_normal_multivariate,
-    piecewise_poisson,
-)
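Editor's note: with the piecewise generators removed from `aeon.testing.data_generation`, code that still needs piecewise-normal test data can reproduce it in a few lines of plain numpy. A hedged stand-in, not an aeon API:

    import numpy as np


    def piecewise_normal(means, lengths, std_dev=1.0, random_state=None):
        """Concatenate normal segments; a stand-in for the removed generator."""
        rng = np.random.default_rng(random_state)
        return np.concatenate(
            [rng.normal(loc=m, scale=std_dev, size=n) for m, n in zip(means, lengths)]
        )


    X = piecewise_normal([1.0, 5.0, 2.0], lengths=[20, 30, 10], random_state=0)
    print(X.shape)  # (60,)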
diff --git a/aeon/testing/data_generation/hierarchical.py b/aeon/testing/data_generation/hierarchical.py
deleted file mode 100644
index e889cf7634..0000000000
--- a/aeon/testing/data_generation/hierarchical.py
+++ /dev/null
@@ -1,246 +0,0 @@
-"""Hierarchical Data Generators."""
-
-from typing import Optional
-
-__maintainer__ = []
-
-from itertools import product
-from typing import Union
-
-import numpy as np
-import pandas as pd
-from sklearn.utils import check_random_state
-
-from aeon.datasets import load_airline
-
-
-def _make_index(n_timepoints, index_type=None):
-    """Make indices for unit testing."""
-    if index_type == "period":
-        start = "2000-01"
-        freq = "M"
-        return pd.period_range(start=start, periods=n_timepoints, freq=freq)
-
-    elif index_type == "datetime" or index_type is None:
-        start = "2000-01-01"
-        freq = "D"
-        return pd.date_range(start=start, periods=n_timepoints, freq=freq)
-
-    elif index_type == "range":
-        start = 3  # check non-zero based indices
-        return pd.RangeIndex(start=start, stop=start + n_timepoints)
-
-    elif index_type == "int":
-        start = 3
-        return pd.Index(np.arange(start, start + n_timepoints), dtype=int)
-
-    else:
-        raise ValueError(f"index_class: {index_type} is not supported")
-
-
-def _make_hierarchical(
-    hierarchy_levels: tuple = (2, 4),
-    max_timepoints: int = 12,
-    min_timepoints: int = 12,
-    same_cutoff: bool = True,
-    n_columns: int = 1,
-    all_positive: bool = True,
-    index_type: Optional[str] = None,
-    random_state: Optional[Union[int, np.random.RandomState]] = None,
-    add_nan: bool = False,
-) -> pd.DataFrame:
-    """Generate hierarchical multiindex type for testing.
-
-    Parameters
-    ----------
-    hierarchy_levels : Tuple, optional
-        the number of groups at each hierarchy level, by default (2, 4)
-    max_timepoints : int, optional
-        maximum time points a series can have, by default 12
-    min_timepoints : int, optional
-        minimum time points a series can have, by default 12
-    same_cutoff : bool, optional
-        If it's True all series will end at the same date, by default True
-    n_columns : int, optional
-        number of columns in the output dataframe, by default 1
-    all_positive : bool, optional
-        If True the time series will be positive, by default True
-    index_type : str, optional
-        type of index, by default None
-        Supported types are "period", "datetime", "range" or "int".
-        If it's not provided, "datetime" is selected.
-    random_state : int, np.random.RandomState or None
-        Controls the randomness of the estimator, by default None
-    add_nan : bool, optional
-        If it's true the series will contain NaNs, by default False
-
-    Returns
-    -------
-    pd.DataFrame
-        hierarchical dataframe
-    """
-    levels = [
-        [f"h{i}_{j}" for j in range(hierarchy_levels[i])]
-        for i in range(len(hierarchy_levels))
-    ]
-    level_names = [f"h{i}" for i in range(len(hierarchy_levels))]
-    rng = check_random_state(random_state)
-    if min_timepoints == max_timepoints:
-        time_index = _make_index(max_timepoints, index_type)
-        index = pd.MultiIndex.from_product(
-            levels + [time_index], names=level_names + ["time"]
-        )
-    else:
-        df_list = []
-        for levels_tuple in product(*levels):
-            n_timepoints = rng.randint(low=min_timepoints, high=max_timepoints)
-            if same_cutoff:
-                time_index = _make_index(max_timepoints, index_type)[-n_timepoints:]
-            else:
-                time_index = _make_index(n_timepoints, index_type)
-            d = dict(zip(level_names, levels_tuple))
-            d["time"] = time_index
-            df_list.append(pd.DataFrame(d))
-        index = pd.MultiIndex.from_frame(
-            pd.concat(df_list), names=level_names + ["time"]
-        )
-
-    total_time_points = len(index)
-    data = rng.normal(size=(total_time_points, n_columns))
-    if add_nan:
-        # add some nan values
-        data[int(len(data) / 2)] = np.nan
-        data[0] = np.nan
-        data[-1] = np.nan
-    if all_positive:
-        data -= np.min(data, axis=0) - 1
-    df = pd.DataFrame(
-        data=data, index=index, columns=[f"c{i}" for i in range(n_columns)]
-    )
-
-    return df
-
-
-def _bottom_hier_datagen(
-    no_levels=3,
-    no_bottom_nodes=6,
-    intercept_max=20,
-    coef_1_max=20,
-    coef_2_max=0.1,
-    random_seed=None,
-):
-    """Hierarchical data generator using the flights dataset.
-
-    This function generates bottom level, i.e. not aggregated, time-series
-    from the flights dataset.
-
-    Each series is generated from the flights dataset using a linear model,
-    y = c0 + c1x + c2x^(c3), where the coefficients, intercept, and exponent
-    are randomly sampled for each series. The coefficients and intercept are
-    sampled between np.arange(0, *_max, 0.01) to keep the values positive. The
-    exponent is sampled from [0.5, 1, 1.5, 2].
-
-    Parameters
-    ----------
-    no_levels : int, optional
-        The number of levels not considering the time-index, by default 3
-    no_bottom_nodes : int, optional
-        Number of time series, i.e. bottom nodes, to generate, by default 6.
-    *_max : int, optional
-        Maximum possible value of the coefficient or intercept value.
-    random_seed : int, optional
-        Random seed for reproducibility.
-
-    Returns
-    -------
-    pd.DataFrame with multiindex
-    """
-    if no_levels > no_bottom_nodes:
-        raise ValueError("no_levels should be less than no_bottom_nodes")
-
-    rng = np.random.default_rng(random_seed)
-
-    base_ts = load_airline(return_array=False)
-    df = pd.DataFrame(base_ts, index=base_ts.index)
-    df.index.rename(None, inplace=True)
-
-    if no_levels == 0:
-        df.columns = ["passengers"]
-        df.index.rename("timepoints", inplace=True)
-        return df
-    else:
-        df.columns = ["l1_node01"]
-
-        intercept = np.arange(0, intercept_max, 0.01)
-        coef_1 = np.arange(0, coef_1_max, 0.01)
-        coef_2 = np.arange(0, coef_2_max, 0.01)
-        power_2 = [0.5, 1, 1.5, 2]
-
-        # create structure of hierarchy
-        node_lookup = pd.DataFrame(
-            ["l1_node" + f"{x:02d}" for x in range(1, no_bottom_nodes + 1)]
-        )
-        node_lookup.columns = ["l1_agg"]
-
-        if no_levels >= 2:
-            # create index from bottom up, sampling node names
-            for i in range(2, no_levels + 1):
-                name = f"l{i}_agg"
-                name_groupby = f"l{i - 1}_agg"
-                node_lookup[name] = node_lookup.groupby([name_groupby])[
-                    "l1_agg"
-                ].transform(
-                    lambda x: "l"
-                    + str(i)  # noqa: B023
-                    + "_node"
-                    + "{:02d}".format(
-                        _sample_node(node_lookup.index, i, rng)  # noqa: B023
-                    )
-                )
-
-        node_lookup = node_lookup.set_index("l1_agg", drop=True)
-
-        # now define the series for each level by sampling coefficients etc.
-        for i in range(2, no_bottom_nodes + 1):
-            df["l1_node" + f"{i:02d}"] = (
-                rng.choice(intercept, size=1)
-                + rng.choice(coef_1, size=1) * df["l1_node01"]
-                + (
-                    rng.choice(coef_2, size=1)
-                    * (df["l1_node01"] ** rng.choice(power_2, size=1))
-                )
-            )
-
-        df = (
-            df.melt(ignore_index=False)
-            .reset_index(drop=False)
-            .rename(
-                columns={
-                    "variable": "l1_agg",
-                    "index": "timepoints",
-                    "value": "passengers",
-                }
-            )
-        )
-
-        df = pd.merge(left=df, right=node_lookup.reset_index(), on="l1_agg")
-        df = df[df.columns.sort_values(ascending=True)]
-
-        df_newindex = ["l" + str(x) + "_agg" for x in range(1, no_levels + 1)][::-1]
-        df_newindex.append("timepoints")
-
-        df = df.set_index(df_newindex)
-        df.sort_index(inplace=True)
-
-        return df
-
-
-def _sample_node(index_table, level, sampler):
-    """Sample a number of nodes depending on the size of hierarchy and level."""
-    nodes = np.arange(1, np.floor(len(index_table) / level) + 1, 1)
-    # return a single sample of them
-    sample_nodes = int(sampler.choice(nodes, size=1))
-
-    return sample_nodes
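Editor's note: the deleted `_make_hierarchical` reduced to building a `pd.MultiIndex` over the hierarchy levels plus a time index and filling it with normal draws. A compact sketch of that core, should a local replacement ever be needed:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    levels = [["h0_0", "h0_1"], ["h1_0", "h1_1", "h1_2", "h1_3"]]
    time_index = pd.date_range("2000-01-01", periods=12, freq="D")

    # Cartesian product of hierarchy levels and time points, as in the old helper.
    index = pd.MultiIndex.from_product(levels + [time_index], names=["h0", "h1", "time"])
    df = pd.DataFrame(rng.normal(size=(len(index), 1)), index=index, columns=["c0"])
    print(df.shape)  # (96, 1): 2 * 4 * 12 rows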
diff --git a/aeon/testing/data_generation/segmentation.py b/aeon/testing/data_generation/segmentation.py
deleted file mode 100644
index b8ccd22aff..0000000000
--- a/aeon/testing/data_generation/segmentation.py
+++ /dev/null
@@ -1,388 +0,0 @@
-"""Synthetic data generating functions."""
-
-from typing import Optional, Union
-
-import numpy as np
-import numpy.typing as npt
-from sklearn.utils.validation import check_random_state
-
-
-def piecewise_normal_multivariate(
-    means: npt.ArrayLike,
-    lengths: npt.ArrayLike,
-    variances: Union[npt.ArrayLike, float] = 1.0,
-    covariances: Optional[npt.ArrayLike] = None,
-    random_state: Optional[Union[int, np.random.RandomState]] = None,
-) -> npt.ArrayLike:
-    """
-    Generate multivariate series from segments.
-
-    Each segment has length specified in ``lengths`` and data sampled from a
-    multivariate normal distribution with a mean from ``means`` and covariance
-    from ``covariances`` (either specified or built from ``variances`` when
-    unspecified).
-
-    Parameters
-    ----------
-    lengths : array_like
-        Lengths of the segments to be generated of shape (n_segments,)
-    means : array_like
-        Means of the segments to be generated, as an array of shape
-        (n_segments, n_series)
-    variances : float or array_like (default=1.0)
-        Variance of the segments to be generated
-    covariances : array_like (default=None)
-        Covariances of segments to be generated of shape
-        (n_segments, n_series, n_series)
-        If None, this will be constructed from variances by assuming independence
-        of random variables, i.e. variance as diagonal elements of covariance matrix
-    random_state : int or np.random.RandomState
-        Either a random seed or ``RandomState`` instance
-
-    Returns
-    -------
-    data : array_like
-        Multivariate time series as ``np.array`` of shape (sum(lengths), n_series)
-
-    Examples
-    --------
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
-    >>> piecewise_normal_multivariate(means=[[1, 1], [2, 2], [3, 3]],\
-        lengths=[2, 3, 1], random_state=2)
-    array([[ 0.58324215,  0.94373317],
-           [-1.1361961 ,  2.64027081],
-           [ 0.20656441,  1.15825263],
-           [ 2.50288142,  0.75471191],
-           [ 0.94204778,  1.09099239],
-           [ 3.55145404,  5.29220801]])
-
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
-    >>> piecewise_normal_multivariate(means=[[1, 1], [2, 2], [3, 3]],\
-        lengths=[2, 3, 1], variances=[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],\
-        random_state=2)
-    array([[ 0.58324215,  0.94373317],
-           [-1.1361961 ,  2.64027081],
-           [ 0.20656441,  1.15825263],
-           [ 2.50288142,  0.75471191],
-           [ 0.94204778,  1.09099239],
-           [ 3.55145404,  5.29220801]])
-
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
-    >>> piecewise_normal_multivariate(means=[[1, 1], [2, 2], [3, 3]],\
-        lengths=[2, 3, 1], covariances=[[[1.0, 0], [0, 1.0]], [[1.0, 0],\
-        [0, 1.0]], [[1.0, 0], [0, 1.0]]], random_state=2)
-    array([[ 0.58324215,  0.94373317],
-           [-1.1361961 ,  2.64027081],
-           [ 0.20656441,  1.15825263],
-           [ 2.50288142,  0.75471191],
-           [ 0.94204778,  1.09099239],
-           [ 3.55145404,  5.29220801]])
-
-    >>> from aeon.testing.data_generation import piecewise_normal_multivariate
-    >>> piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 3],\
-        covariances=[[[0.5, 0.3], [0.3, 1.0]], [[1.0, 0.3], [0.3, 0.7]]],\
-        random_state=2)
-    array([[ 0.78066776,  2.61125356],
-           [ 0.92296736,  0.51689669],
-           [-0.2694238 ,  1.47959507],
-           [ 4.00389069,  3.95225998],
-           [ 5.32264874,  5.05088075],
-           [ 2.62479901,  6.08308546]])
-
-    """
-
-    def get_covariances(var):
-        """Fill 1D variance array of length N to 2D covariance array of size (N,N)."""
-        cov = np.zeros((N, N), float)
-        np.fill_diagonal(cov, var)
-        return cov
-
-    L, N = np.array(means).shape
-
-    rng = check_random_state(random_state)
-    assert len(lengths) == L
-
-    # if no covariance is specified, build it from variance
-    # assuming independent random variables
-    if covariances is None:
-        assert variances is not None
-
-        # variances can be specified as a float, make 1D array, repeat L times
-        if isinstance(variances, (float, int)):
-            variances = np.repeat(variances, N)
-            variances = np.tile(variances, (L, 1))
-
-        assert np.array(variances).shape == (L, N)
-
-        # get covariance matrices from variance arrays
-        covariances = [get_covariances(var) for var in variances]
-
-    else:
-        assert all(np.allclose(np.array(cov), np.array(cov).T) for cov in covariances)
-        assert all(np.all(np.linalg.eigvals(cov) >= 0) for cov in covariances)
-
-        assert np.array(covariances).shape[0] == L
-        assert np.array(covariances).shape[1] == N
-
-    return np.concatenate(
-        [
-            rng.multivariate_normal(mean=mean, cov=cov, size=length)
-            for mean, cov, length in zip(means, covariances, lengths)
-        ]
-    )
-
-
-def piecewise_normal(
-    means: npt.ArrayLike,
-    lengths: npt.ArrayLike,
-    std_dev: Union[npt.ArrayLike, float] = 1.0,
-    random_state: Optional[Union[int, np.random.RandomState]] = None,
-) -> npt.ArrayLike:
-    """
-    Generate series from segments.
-
-    Each segment has length specified in ``lengths`` and data sampled from a normal
-    distribution with a mean from ``means`` and standard deviation from ``std_dev``.
-
-    Parameters
-    ----------
-    means : array_like
-        Means of the segments to be generated
-    lengths : array_like
-        Lengths of the segments to be generated
-    std_dev : float or array_like
-        Standard deviations of the segments to be generated
-    random_state : int or np.random.RandomState
-        Either a random seed or RandomState instance
-
-    Returns
-    -------
-    data : np.array
-        univariate time series as np.array
-
-    Examples
-    --------
-    >>> from aeon.testing.data_generation import piecewise_normal
-    >>> piecewise_normal([1, 2, 3], lengths=[2, 4, 8], random_state=42) # doctest: +SKIP
-    array([1.49671415, 0.8617357 , 2.64768854, 3.52302986, 1.76584663,
-           1.76586304, 4.57921282, 3.76743473, 2.53052561, 3.54256004,
-           2.53658231, 2.53427025, 3.24196227, 1.08671976])
-
-    >>> from aeon.testing.data_generation import piecewise_normal
-    >>> piecewise_normal([1, 2, 3], lengths=[2, 4, 8], std_dev=0) # doctest: +SKIP
-    array([1., 1., 2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3.])
-
-    >>> from aeon.testing.data_generation import piecewise_normal
-    >>> piecewise_normal([1, 2, 3], lengths=[2, 4, 8], std_dev=[0, 0.5, 1.0])\
-        # doctest: +SKIP
-    array([1.        , 1.        , 2.32384427, 2.76151493, 1.88292331,
-           1.88293152, 4.57921282, 3.76743473, 2.53052561, 3.54256004,
-           2.53658231, 2.53427025, 3.24196227, 1.08671976])
-
-    """
-    rng = check_random_state(random_state)
-    assert len(means) == len(lengths)
-
-    if isinstance(std_dev, (float, int)):
-        std_dev = np.repeat(std_dev, len(means))
-
-    assert len(std_dev) == len(means)
-
-    segments_data = [
-        rng.normal(loc=mean, scale=sd, size=[length])
-        for mean, length, sd in zip(means, lengths, std_dev)
-    ]
-    return np.concatenate(tuple(segments_data))
-
-
-def piecewise_multinomial(
-    n_trials: int,
-    lengths: npt.ArrayLike,
-    p_vals: npt.ArrayLike,
-    random_state: Optional[Union[int, np.random.RandomState]] = None,
-) -> npt.ArrayLike:
-    """
-    Generate series from segments.
-
-    Each segment has length specified in ``lengths`` and data sampled from a
-    multinomial distribution with a total number of experiments for each trial set
-    from ``n_trials`` and the probability for each outcome stored inside a list
-    contained in ``p_vals``.
-
-    Parameters
-    ----------
-    n_trials : int
-        Number of experiments to run during each trial
-    lengths : array_like
-        Lengths of the segments to be generated
-    p_vals : array_like
-        Set of probabilities for each outcome for each distribution
-        Each set of probabilities must be equal length
-    random_state : int or np.random.RandomState
-        Either a random seed or RandomState instance
-
-    Returns
-    -------
-    data : np.array
-        univariate or multivariate time series as np.array
-        that has dimensions sum(lengths) x n_outcomes
-        where n_outcomes = # of outcomes for each item in ``p_vals``
-
-    Examples
-    --------
-    >>> from aeon.testing.data_generation import piecewise_multinomial
-    >>> piecewise_multinomial(20, lengths=[3, 2], p_vals=[[1/4, 3/4], \
-        [3/4, 1/4]], random_state=42) # doctest: +SKIP
-    array([[ 4, 16],
-           [ 8, 12],
-           [ 6, 14],
-           [15,  5],
-           [17,  3]])
-
-    >>> from aeon.testing.data_generation import piecewise_multinomial
-    >>> piecewise_multinomial(10, lengths=[2, 4, 8], \
-        p_vals=[[1, 0], [0, 1], [1, 0]]) # doctest: +SKIP
-    array([[10,  0],
-           [10,  0],
-           [ 0, 10],
-           [ 0, 10],
-           [ 0, 10],
-           [ 0, 10],
-           [10,  0],
-           [10,  0],
-           [10,  0],
-           [10,  0],
-           [10,  0],
-           [10,  0],
-           [10,  0],
-           [10,  0]])
-    """
-    rng = check_random_state(random_state)
-
-    # error handling for inputs
-    if len(lengths) != len(p_vals):
-        raise ValueError("lengths and p_vals arguments must be same length")
-    elif any(sum(p_val) != 1 for p_val in p_vals):
-        raise ValueError("each set of probabilities in p_val must sum to 1")
-    elif not (np.array([len(p_val) for p_val in p_vals]) == len(p_vals[0])).all():
-        raise ValueError("each set of probabilities in p_val must be equal length")
-
-    segments_data = [
-        rng.multinomial(n=n_trials, pvals=p_val, size=[length])
-        for p_val, length, in zip(p_vals, lengths)
-    ]
-    return np.concatenate(tuple(segments_data))
-
-
-def piecewise_poisson(
-    lambdas: npt.ArrayLike,
-    lengths: npt.ArrayLike,
-    random_state: Optional[Union[int, np.random.RandomState]] = None,
-) -> npt.ArrayLike:
-    """
-    Generate series using Poisson distribution.
-
-    Each segment has length specified in ``lengths`` and data sampled from a Poisson
-    distribution with expected lambda from ``lambdas``.
-
-    Parameters
-    ----------
-    lambdas : array_like
-        Expected number and variance of events within a specified time interval
-    lengths : array_like
-        Lengths of the segments to be generated
-    random_state : int or np.random.RandomState
-        Either a random seed or RandomState instance
-
-    Returns
-    -------
-    data : np.array
-        univariate time series as np.array
-
-    Examples
-    --------
-    >>> from aeon.testing.data_generation import piecewise_poisson
-    >>> piecewise_poisson(lambdas=[1,2,3],lengths=[2,4,8],random_state=42)#doctest:+SKIP
-    array([1, 2, 1, 3, 3, 1, 3, 1, 3, 2, 2, 4, 2, 1])
-
-    >>> from aeon.testing.data_generation import piecewise_poisson
-    >>> piecewise_poisson(lambdas=[1,3,6],lengths=[2,4,8],random_state=42)#doctest:+SKIP
-    array([1, 2, 1, 3, 3, 2, 5, 5, 6, 4, 4, 9, 3, 5])
-
-    """
-    rng = check_random_state(random_state)
-
-    assert len(lambdas) == len(lengths)
-
-    try:
-        segments_data = [
-            rng.poisson(lam=lams, size=[length])
-            for lams, length in zip(lambdas, lengths)
-        ]
-    except ValueError as e:
-        raise Exception("Size mismatch") from e
-
-    return np.concatenate(tuple(segments_data))
-
-
-def labels_with_repeats(means: npt.ArrayLike, std_dev: npt.ArrayLike) -> npt.ArrayLike:
-    """Generate labels for unique combinations of means and std_dev."""
-    data = [means, std_dev]
-    unique, indices = np.unique(data, axis=1, return_inverse=True)
-    labels = np.arange(unique.shape[1])
-    return labels[indices]
-
-
-def label_piecewise_normal(
-    means: npt.ArrayLike,
-    lengths: npt.ArrayLike,
-    std_dev: Union[npt.ArrayLike, float] = 1.0,
-    repeated_labels: bool = True,
-) -> npt.ArrayLike:
-    """
-    Generate labels for a series composed of segments.
-
-    Parameters
-    ----------
-    means : array_like
-        Means of the segments to be generated
-    lengths : array_like
-        Lengths of the segments to be generated
-    std_dev : float or array_like
-        Standard deviations of the segments to be generated
-    repeated_labels : bool
-        Flag to indicate whether segment labels should be repeated for similar
-        segments. If ``True`` same label will be assigned for segments with same
-        mean and std_dev, independently of length. If ``False`` each consecutive
-        segment will have a unique label.
-
-    Returns
-    -------
-    labels : np.array
-        integer encoded array of labels, same length as data
-    """
-    if isinstance(std_dev, (float, int)):
-        std_dev = np.repeat(std_dev, len(means))
-    if repeated_labels:
-        unique_labels = labels_with_repeats(means, std_dev)
-    else:
-        unique_labels = range(len(lengths))
-    return np.repeat(unique_labels, lengths)
-
-
-class GenBasicGauss:
-    """Data generator base class in order to allow composition."""
-
-    def __init__(self, means, lengths, std_dev=1.0, random_state=None):
-        self.means = means
-        self.lengths = lengths
-        self.std_dev = std_dev
-        self.random_state = random_state
-
-    def sample(self):
-        """Generate univariate mean shift random data sample."""
-        return piecewise_normal(
-            means=self.means,
-            lengths=self.lengths,
-            std_dev=self.std_dev,
-            random_state=self.random_state,
-        )
diff --git a/aeon/testing/data_generation/tests/test_hierarchical.py b/aeon/testing/data_generation/tests/test_hierarchical.py
deleted file mode 100644
index 4a1fe1ef31..0000000000
--- a/aeon/testing/data_generation/tests/test_hierarchical.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Test hierarchical generators."""
-
-import pandas as pd
-
-from aeon.testing.data_generation.hierarchical import _make_hierarchical
-
-
-def test_make_hierarchical_basic():
-    """Test make hierarchy."""
-    df = _make_hierarchical()
-    assert isinstance(df, pd.DataFrame), "Output is not a pandas DataFrame"
-    assert df.shape[1] == 1, "DataFrame does not have the expected number of columns"
-    assert not df.isnull().values.any(), "DataFrame contains unexpected NaN values"
-
-
-def test_make_hierarchical_custom_levels():
-    """Test make hierarchy."""
-    # Test custom hierarchy levels
-    hierarchy_levels = (3, 2)
-    df = _make_hierarchical(hierarchy_levels=hierarchy_levels)
-    expected_levels = len(hierarchy_levels) + 1  # +1 for the time index
-    assert df.index.nlevels == expected_levels, "Incorrect number of index levels"
-
-
-def test_make_hierarchical_timepoints_range():
-    """Test make hierarchy."""
-    # Test varying timepoints
-    min_timepoints, max_timepoints = 5, 10
-    df = _make_hierarchical(
-        min_timepoints=min_timepoints, max_timepoints=max_timepoints, same_cutoff=False
-    )
-    # Verifying that series lengths vary within the specified range
-    lengths = df.groupby(level=list(range(len(df.index.levels) - 1))).size()
-    assert (
-        lengths.min() >= min_timepoints and lengths.max() <= max_timepoints
-    ), "Time points do not fall within the specified range"
-
-
-def test_make_hierarchical_nan_values():
-    """Test make hierarchy."""
-    # Test NaN values inclusion
-    df = _make_hierarchical(add_nan=True)
-    assert df.isnull().values.any(), "DataFrame does not contain NaN values as expected"
-
-
-def test_make_hierarchical_positive_values():
-    """Test make hierarchy."""
-    # Test all positive values
-    df = _make_hierarchical(all_positive=True)
-    assert (df >= 0).all().all(), "DataFrame contains non-positive values"
-
-
-def test_make_hierarchical_index_type():
-    """Test make hierarchy."""
-    # Test for specific index types
-    index_type = "datetime"
-    df = _make_hierarchical(index_type=index_type)
-    assert isinstance(
-        df.index.get_level_values(-1)[0], pd.Timestamp
-    ), "Index type does not match 'datetime'"
diff --git a/aeon/testing/data_generation/tests/test_segmentation.py b/aeon/testing/data_generation/tests/test_segmentation.py
deleted file mode 100644
index 2d2a62c1d4..0000000000
--- a/aeon/testing/data_generation/tests/test_segmentation.py
+++ /dev/null
@@ -1,184 +0,0 @@
-"""Test segmentation data generation."""
-
-import numpy as np
-import pytest
-from numpy import array_equal
-
-from aeon.testing.data_generation.segmentation import (
-    GenBasicGauss,
-    label_piecewise_normal,
-    labels_with_repeats,
-    piecewise_multinomial,
-    piecewise_normal,
-    piecewise_normal_multivariate,
-    piecewise_poisson,
-)
-
-
-def test_segmentation_generation():
-    """Test the piecewise generation functions."""
-    X = piecewise_normal_multivariate(
-        means=[[1, 1], [2, 2], [3, 3]], lengths=[2, 3, 1], random_state=2
-    )
-    assert isinstance(X, np.ndarray)
-    exp = np.array(
-        [
-            [0.58324215, 0.94373317],
-            [-1.1361961, 2.64027081],
-            [0.20656441, 1.15825263],
-            [2.50288142, 0.75471191],
-            [0.94204778, 1.09099239],
-            [3.55145404, 5.29220801],
-        ]
-    )
-    assert np.allclose(X, exp)
-    X = piecewise_normal([1, 2, 3], lengths=[2, 4, 8], random_state=42)
-    exp = np.array(
-        [1.49671415, 0.8617357, 2.64768854, 3.52302986, 1.76584663, 1.76586304,
-         4.57921282, 3.76743473, 2.53052561, 3.54256004, 2.53658231, 2.53427025,
-         3.24196227, 1.08671976]
-    )
-    assert np.allclose(X, exp)
-
-    X = piecewise_normal(
-        [1, 2, 3], lengths=[2, 4, 8], std_dev=[0, 0.5, 1.0], random_state=42
-    )
-    exp = np.array(
-        [1.0, 1.0, 2.32384427, 2.76151493, 1.88292331, 1.88293152, 4.57921282,
-         3.76743473, 2.53052561, 3.54256004, 2.53658231, 2.53427025, 3.24196227,
-         1.08671976]
-    )
-    assert np.allclose(X, exp)
-    X = piecewise_multinomial(
-        20, lengths=[3, 2], p_vals=[[1 / 4, 3 / 4], [3 / 4, 1 / 4]], random_state=42
-    )
-    exp = np.array([[4, 16], [8, 12], [6, 14], [15, 5], [17, 3]])
-    assert np.allclose(X, exp)
-    X = piecewise_multinomial(10, lengths=[2, 4, 8], p_vals=[[1, 0], [0, 1], [1, 0]])
-    exp = np.array(
-        [[10, 0], [10, 0], [0, 10], [0, 10], [0, 10], [0, 10], [10, 0], [10, 0],
-         [10, 0], [10, 0], [10, 0], [10, 0], [10, 0], [10, 0]]
-    )
-    assert np.allclose(X, exp)
-    X = piecewise_poisson(lambdas=[1, 2, 3], lengths=[2, 4, 8], random_state=42)
-    exp = np.array([1, 2, 1, 3, 3, 1, 3, 1, 3, 2, 2, 4, 2, 1])
-    assert np.allclose(X, exp)
-    X = piecewise_poisson(lambdas=[1, 3, 6], lengths=[2, 4, 8], random_state=42)
-    exp = np.array([1, 2, 1, 3, 3, 2, 5, 5, 6, 4, 4, 9, 3, 5])
-    assert np.allclose(X, exp)
-
-
-def test_label_generation():
-    """Test label generation."""
-    y = labels_with_repeats(means=[1.0, 2.0, 3.0], std_dev=[0.5, 1.0, 2.0])
-    exp = np.array([0, 1, 2])
-    assert np.allclose(y, exp)
-    y = label_piecewise_normal([1, 2, 3], lengths=[10, 10, 10], std_dev=[0.5, 1.0, 2.0])
-    exp = np.array(
-        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
-    )
-    assert np.allclose(y, exp)
-    gen = GenBasicGauss([1, 2, 3], lengths=[2, 4, 8], random_state=42)
-    X = gen.sample()
-    exp = np.array(
-        [1.49671415, 0.8617357, 2.64768854, 3.52302986, 1.76584663, 1.76586304,
-         4.57921282, 3.76743473, 2.53052561, 3.54256004, 2.53658231, 2.53427025,
-         3.24196227, 1.08671976]
-    )
-    assert np.allclose(X, exp)
-
-
-@pytest.mark.parametrize(
-    "lambdas, lengths, random_state, output",
-    [
-        ([1, 2, 3], [2, 4, 8], 42, [1, 2, 1, 3, 3, 1, 3, 1, 3, 2, 2, 4, 2, 1]),
-        ([1, 3, 6], [2, 4, 8], 42, [1, 2, 1, 3, 3, 2, 5, 5, 6, 4, 4, 9, 3, 5]),
-    ],
-)
-def test_piecewise_poisson(lambdas, lengths, random_state, output):
-    """Test piecewise_poisson function returns the expected Poisson distributed array."""
-    assert array_equal(piecewise_poisson(lambdas, lengths, random_state), output)
diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py
index 2d49236c8c..271284b94d 100644
--- a/aeon/testing/estimator_checking/_yield_classification_checks.py
+++ b/aeon/testing/estimator_checking/_yield_classification_checks.py
@@ -8,6 +8,7 @@
 from sys import platform

 import numpy as np
+from numpy.testing import assert_array_almost_equal
 from sklearn.utils._testing import set_random_state

 from aeon.base._base import _clone_estimator
@@ -18,7 +19,7 @@
     unit_test_proba,
 )
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal, _get_tag
+from aeon.testing.utils.estimator_checks import _get_tag
 from aeon.utils.validation import get_n_cases


@@ -124,7 +125,7 @@ def check_classifier_against_expected_results(estimator_class):
         y_proba = estimator_instance.predict_proba(X_test[indices])

         # assert probabilities are the same
-        _assert_array_almost_equal(
+        assert_array_almost_equal(
             y_proba,
             expected_probas,
             decimal=2,
diff --git a/aeon/testing/estimator_checking/_yield_early_classification_checks.py b/aeon/testing/estimator_checking/_yield_early_classification_checks.py
index 9459b39442..e19b8bd0f0 100644
--- a/aeon/testing/estimator_checking/_yield_early_classification_checks.py
+++ b/aeon/testing/estimator_checking/_yield_early_classification_checks.py
@@ -4,6 +4,7 @@
 from sys import platform

 import numpy as np
+from numpy.testing import assert_array_almost_equal
 from sklearn.utils._testing import set_random_state

 from aeon.base._base import _clone_estimator
@@ -13,7 +14,6 @@
     unit_test_proba,
 )
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
 from aeon.utils.validation import get_n_cases


@@ -74,7 +74,7 @@ def check_early_classifier_against_expected_results(estimator_class):
         y_proba, _ = estimator_instance.predict_proba(X_test[indices])

         # assert probabilities are the same
-        _assert_array_almost_equal(
+        assert_array_almost_equal(
             y_proba,
             expected_probas,
             decimal=2,
diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py
index 20664bea73..112fc96aee 100644
--- a/aeon/testing/estimator_checking/_yield_estimator_checks.py
+++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py
@@ -1,5 +1,6 @@
 """Tests for all estimators."""

+import inspect
 import numbers
 import pickle
 import types
@@ -10,6 +11,7 @@
 import joblib
 import numpy as np
 import pytest
+from numpy.testing import assert_array_almost_equal
 from sklearn.exceptions import NotFittedError
 from sklearn.utils.estimator_checks import check_get_params_invariance

@@ -64,13 +66,7 @@
 )
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT, _get_datatypes_for_estimator
 from aeon.testing.utils.deep_equals import deep_equals
-from aeon.testing.utils.estimator_checks import (
-    _assert_array_almost_equal,
-    _get_args,
-    _get_tag,
-    _list_required_methods,
-    _run_estimator_method,
-)
+from aeon.testing.utils.estimator_checks import _get_tag, _run_estimator_method
 from aeon.transformations.base import BaseTransformer
 from aeon.transformations.collection import BaseCollectionTransformer
 from aeon.transformations.series import BaseSeriesTransformer
@@ -286,21 +282,22 @@ def check_inheritance(estimator_class):

 def check_has_common_interface(estimator_class):
     """Check estimator implements the common interface."""
-    # Check class for type of attribute
-    if isinstance(estimator_class, BaseAeonEstimator):
-        assert isinstance(estimator_class.is_fitted, property)
-
-    required_methods = _list_required_methods(estimator_class)
-
-    for attr in required_methods:
-        assert hasattr(
-            estimator_class, attr
-        ), f"Estimator: {estimator_class.__name__} does not implement attribute: {attr}"
-
-    if hasattr(estimator_class, "inverse_transform"):
-        assert hasattr(estimator_class, "transform")
-    if hasattr(estimator_class, "predict_proba"):
-        assert hasattr(estimator_class, "predict")
+    assert issubclass(estimator_class, BaseAeonEstimator)
+    assert hasattr(estimator_class, "fit") and callable(estimator_class.fit)
+    assert hasattr(estimator_class, "reset") and callable(estimator_class.reset)
+    assert hasattr(estimator_class, "clone") and callable(estimator_class.clone)
+    assert hasattr(estimator_class, "get_class_tags") and callable(
+        estimator_class.get_class_tags
+    )
+    assert hasattr(estimator_class, "get_class_tag") and callable(
+        estimator_class.get_class_tag
+    )
+    assert hasattr(estimator_class, "get_tags") and callable(estimator_class.get_tags)
+    assert hasattr(estimator_class, "get_tag") and callable(estimator_class.get_tag)
+    assert hasattr(estimator_class, "set_tags") and callable(estimator_class.set_tags)
+    assert hasattr(estimator_class, "get_fitted_params") and callable(
+        estimator_class.get_fitted_params
+    )


 def check_set_params_sklearn(estimator_class):
@@ -322,9 +319,10 @@ def check_set_params_sklearn(estimator_class):
     params_full = estimator.get_params(deep=False)
     params_full.update(params)

-    msg = f"set_params of {estimator_class.__name__} does not return self"
     est_after_set = estimator.set_params(**params_full)
-    assert est_after_set is estimator, msg
+    assert (
+        est_after_set is estimator
+    ), f"set_params of {estimator_class.__name__} does not return self"

     is_equal, equals_msg = deep_equals(
         estimator.get_params(deep=False), params_full, return_msg=True
@@ -361,7 +359,7 @@ def check_constructor(estimator_class):
     assert isinstance(estimator, estimator_class)

     # Ensure that each parameter is set in init
-    init_params = _get_args(type(estimator).__init__)
+    init_params = inspect.signature(estimator_class.__init__).parameters
     invalid_attr = set(init_params) - set(vars(estimator)) - {"self"}
     assert not invalid_attr, (
         "Estimator %s should store all parameters"
@@ -461,8 +459,9 @@ def check_set_params(estimator):
     estimator = _clone_estimator(estimator)
     params = estimator.get_params()

-    msg = f"set_params of {type(estimator).__name__} does not return self"
-    assert estimator.set_params(**params) is estimator, msg
+    assert (
+        estimator.set_params(**params) is estimator
+    ), f"set_params of {type(estimator).__name__} does not return self"

     is_equal, equals_msg = deep_equals(estimator.get_params(), params, return_msg=True)
     msg = (
@@ -559,8 +558,7 @@ def check_non_state_changing_method(estimator, datatype):
         ), f"Estimator: {type(estimator)} has side effects on arguments of {method}"

     # dict_after = dictionary of estimator after predict and fit
-    dict_after = estimator.__dict__
-    is_equal, msg = deep_equals(dict_after, dict_before, return_msg=True)
+    is_equal, msg = deep_equals(estimator.__dict__, dict_before, return_msg=True)
     assert is_equal, (
         f"Estimator: {type(estimator).__name__} changes __dict__ "
         f"during {method}, "
@@ -667,7 +665,7 @@ def check_persistence_via_pickle(estimator, datatype):
         if hasattr(estimator, method) and callable(getattr(estimator, method)):
             output = _run_estimator_method(estimator, method, datatype, "test")

-            _assert_array_almost_equal(
+            assert_array_almost_equal(
                 output,
                 results[i],
                 err_msg=f"Running {method} after fit twice with test "
@@ -699,7 +697,7 @@ def check_fit_deterministic(estimator, datatype):
         if hasattr(estimator, method) and callable(getattr(estimator, method)):
             output = _run_estimator_method(estimator, method, datatype, "test")

-            _assert_array_almost_equal(
+            assert_array_almost_equal(
                 output,
                 results[i],
                 err_msg=f"Running {method} after fit twice with test "
diff --git a/aeon/testing/estimator_checking/_yield_regression_checks.py b/aeon/testing/estimator_checking/_yield_regression_checks.py
index af498a520d..ce0ae00462 100644
--- a/aeon/testing/estimator_checking/_yield_regression_checks.py
+++ b/aeon/testing/estimator_checking/_yield_regression_checks.py
@@ -7,6 +7,7 @@
 from sys import platform

 import numpy as np
+from numpy.testing import assert_array_almost_equal
 from sklearn.utils._testing import set_random_state

 from aeon.base._base import _clone_estimator
@@ -17,7 +18,6 @@
     covid_3month_preds,
 )
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal


 def _yield_regression_checks(estimator_class, estimator_instances, datatypes):
@@ -92,7 +92,7 @@ def check_regressor_against_expected_results(estimator_class):
         y_pred = estimator_instance.predict(X_test[indices_test])

         # assert predictions are the same
-        _assert_array_almost_equal(
+        assert_array_almost_equal(
             y_pred,
             expected_preds,
             decimal=2,
diff --git a/aeon/testing/estimator_checking/_yield_transformation_checks.py b/aeon/testing/estimator_checking/_yield_transformation_checks.py
index 6383c8797b..88936cd719 100644
--- a/aeon/testing/estimator_checking/_yield_transformation_checks.py
+++ b/aeon/testing/estimator_checking/_yield_transformation_checks.py
@@ -5,6 +5,7 @@

 import numpy as np
 import pandas as pd
+from numpy.testing import assert_array_almost_equal
 from sklearn.utils._testing import set_random_state

 from aeon.base._base import _clone_estimator
@@ -14,10 +15,7 @@
     unit_test_result,
 )
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT
-from aeon.testing.utils.estimator_checks import (
-    _assert_array_almost_equal,
-    _run_estimator_method,
-)
+from aeon.testing.utils.estimator_checks import _run_estimator_method


 def _yield_transformation_checks(estimator_class, estimator_instances, datatypes):
@@ -81,7 +79,7 @@ def check_transformer_against_expected_results(estimator_class):
         )

         # assert results are the same
-        _assert_array_almost_equal(
+        assert_array_almost_equal(
             results,
             expected_results,
             decimal=2,
@@ -105,6 +103,6 @@ def check_transform_inverse_transform_equivalent(estimator, datatype):
     Xit = estimator.inverse_transform(Xt)

     if isinstance(X, pd.DataFrame):
-        _assert_array_almost_equal(X.loc[Xit.index], Xit)
+        assert_array_almost_equal(X.loc[Xit.index], Xit)
     else:
-        _assert_array_almost_equal(X, Xit)
+        assert_array_almost_equal(X, Xit)
diff --git a/aeon/benchmarking/example_results/classification/accuracy/FreshPRINCE_accuracy.csv b/aeon/testing/example_results_files/classification/accuracy/FreshPRINCE_accuracy.csv
similarity index 100%
rename from aeon/benchmarking/example_results/classification/accuracy/FreshPRINCE_accuracy.csv
rename to aeon/testing/example_results_files/classification/accuracy/FreshPRINCE_accuracy.csv
diff --git a/aeon/benchmarking/example_results/classification/accuracy/HC2_accuracy.csv b/aeon/testing/example_results_files/classification/accuracy/HC2_accuracy.csv
similarity index 100%
rename from aeon/benchmarking/example_results/classification/accuracy/HC2_accuracy.csv
rename to aeon/testing/example_results_files/classification/accuracy/HC2_accuracy.csv
diff --git a/aeon/benchmarking/example_results/classification/accuracy/InceptionTime_accuracy.csv b/aeon/testing/example_results_files/classification/accuracy/InceptionTime_accuracy.csv
similarity index 100%
rename from aeon/benchmarking/example_results/classification/accuracy/InceptionTime_accuracy.csv
rename to aeon/testing/example_results_files/classification/accuracy/InceptionTime_accuracy.csv
diff --git a/aeon/benchmarking/example_results/classification/accuracy/WEASEL-2.0_accuracy.csv b/aeon/testing/example_results_files/classification/accuracy/WEASEL-2.0_accuracy.csv
similarity index 100%
rename from aeon/benchmarking/example_results/classification/accuracy/WEASEL-2.0_accuracy.csv
rename to aeon/testing/example_results_files/classification/accuracy/WEASEL-2.0_accuracy.csv
diff --git a/aeon/testing/testing_config.py b/aeon/testing/testing_config.py
index fde6662879..260bf48b74 100644
--- a/aeon/testing/testing_config.py
+++ b/aeon/testing/testing_config.py
@@ -3,7 +3,7 @@
 __maintainer__ = ["MatthewMiddlehurst"]
 __all__ = ["PR_TESTING", "EXCLUDE_ESTIMATORS", "EXCLUDED_TESTS"]

-import aeon.testing.utils._cicd_numba_caching  # noqa: F401
+import aeon.testing._cicd_numba_caching  # noqa: F401

 # whether to use smaller parameter matrices for test generation and subsample estimators
 # per os/version default is False, can be set to True by pytest --prtesting True flag
diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py
index f2e747045d..2716021bba 100644
--- a/aeon/testing/tests/test_all_estimators.py
+++ b/aeon/testing/tests/test_all_estimators.py
@@ -3,10 +3,12 @@
 import platform
 import sys

+import numpy as np
+from sklearn.utils import check_random_state
+
 from aeon.testing.estimator_checking import parametrize_with_checks
 from aeon.testing.testing_config import PR_TESTING
 from aeon.utils.discovery import all_estimators
-from aeon.utils.sampling import random_partition

 ALL_TEST_ESTIMATORS = all_estimators(return_names=False, include_sklearn=False)

@@ -15,28 +17,29 @@
 # but all are tested on every OS at least once, and on every python version once
 if PR_TESTING:
     # only use 3 Python versions in PR
-    ix = sys.version_info.minor
-    if ix == 9:
-        ix = 0
-    elif ix == 11:
-        ix = 1
-    elif ix == 12:
-        ix = 2
+    i = sys.version_info.minor
+    if i == 9:
+        i = 0
+    elif i == 11:
+        i = 1
+    elif i == 12:
+        i = 2

     os_str = platform.system()
     if os_str == "Windows":
-        ix = ix
+        i = i
     elif os_str == "Linux":
-        ix = ix + 1
+        i = i + 1
     elif os_str == "Darwin":
-        ix = ix + 2
+        i = i + 2
+
+    i = i % 3

-    ix = ix % 3
+    rng = check_random_state(42)
+    idx = np.arange(len(ALL_TEST_ESTIMATORS))
+    rng.shuffle(idx)

-    ALL_TEST_ESTIMATORS = [
-        ALL_TEST_ESTIMATORS[i]
-        for i in random_partition(len(ALL_TEST_ESTIMATORS), 3)[ix]
-    ]
+    ALL_TEST_ESTIMATORS = [ALL_TEST_ESTIMATORS[n] for n in idx[i::3]]


 @parametrize_with_checks(ALL_TEST_ESTIMATORS)
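Editor's note: the PR-testing subsample above replaces `random_partition` with a seeded shuffle plus strided slicing. Because `idx[0::3]`, `idx[1::3]` and `idx[2::3]` are disjoint slices of one permutation, every estimator lands in exactly one bucket, and the assignment is identical across CI jobs since the seed is fixed. A small sketch:

    import numpy as np
    from sklearn.utils import check_random_state

    items = list("abcdefghij")

    rng = check_random_state(42)
    idx = np.arange(len(items))
    rng.shuffle(idx)

    buckets = [[items[n] for n in idx[i::3]] for i in range(3)]
    # Strided slices of one permutation are disjoint and cover every index.
    assert sorted(sum(buckets, [])) == sorted(items)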
+ Indication of what is the reason for not being equal """ + eq, msg = _deep_equals(x, y, 0) + return eq if not return_msg else (eq, msg) - def ret(is_equal, msg): - if return_msg: - if is_equal: - msg = "" - return is_equal, msg - else: - return is_equal +def _deep_equals(x, y, depth): + if x is y: + return True, "" if type(x) is not type(y): - return ret(False, f".type, x.type = {type(x)} != y.type = {type(y)}") + return False, f"x.type ({type(x)}) != y.type ({type(y)}), depth={depth}" - # compute delayed objects (dask) - if hasattr(x, "compute"): - x = x.compute() - if hasattr(y, "compute"): - y = y.compute() - - # we now know all types are the same - # so now we compare values if isinstance(x, pd.Series): - if x.dtype != y.dtype: - return ret(False, f".dtype, x.dtype= {x.dtype} != y.dtype = {y.dtype}") - # if columns are object, recurse over entries and index - if x.dtype == "object": - index_equal = x.index.equals(y.index) - values_equal, values_msg = deep_equals( - list(x.values), list(y.values), return_msg=True - ) - if not values_equal: - msg = ".values" + values_msg - elif not index_equal: - msg = f".index, x.index: {x.index}, y.index: {y.index}" - else: - msg = "" - return ret(index_equal and values_equal, msg) - else: - return ret(x.equals(y), f".series_equals, x = {x} != y = {y}") + return _series_equals(x, y, depth) elif isinstance(x, pd.DataFrame): - if not x.columns.equals(y.columns): - return ret( - False, f".columns, x.columns = {x.columns} != y.columns = {y.columns}" - ) - # if columns are equal and at least one is object, recurse over Series - if sum(x.dtypes == "object") > 0: - for c in x.columns: - is_equal, msg = deep_equals(x[c], y[c], return_msg=True) - if not is_equal: - return ret(False, f'["{c}"]' + msg) - return ret(True, "") - else: - return ret(x.equals(y), f".df_equals, x = {x} != y = {y}") - elif isinstance(x, pd.Index): - return ret(x.equals(y), f".index_equals, x = {x} != y = {y}") + return _dataframe_equals(x, y, depth) elif isinstance(x, np.ndarray): - if x.dtype != y.dtype: - return ret(False, f".dtype, x.dtype = {x.dtype} != y.dtype = {y.dtype}") - return ret(np.array_equal(x, y, equal_nan=True), ".values") - # recursion through lists, tuples and dicts + return _numpy_equals(x, y, depth) elif isinstance(x, (list, tuple)): - return ret(*_tuple_equals(x, y, return_msg=True)) + return _list_equals(x, y, depth) elif isinstance(x, dict): - return ret(*_dict_equals(x, y, return_msg=True)) - elif _is_np_nan(x): - return ret(_is_np_nan(y), f"type(x)={type(x)} != type(y)={type(y)}") + return _dict_equals(x, y, depth) + elif isinstance(x, csr_matrix): + return _csrmatrix_equals(x, y, depth) + # non-iterable types elif isclass(x): - return ret(x == y, f".class, x={x.__name__} != y={y.__name__}") - elif type(x).__name__ == "ForecastingHorizon": - return ret(*_fh_equals(x, y, return_msg=True)) - elif isinstance(x != y, bool) and x != y: - return ret(False, f" !=, {x} != {y}") - # csr-matrix must not be compared using np.any(x!=y) - elif type(x).__name__ == "csr_matrix": # isinstance(x, csr_matrix): - if not np.allclose(x.toarray(), y.toarray()): - return ret(False, f" !=, {x} != {y}") - elif np.any(x != y): - return ret(False, f" !=, {x} != {y}") - return ret(True, "") + eq = x == y + msg = "" if eq else f"x ({x.__name__}) != y ({y.__name__}), depth={depth}" + return eq, msg + elif np.isnan(x): + eq = np.isnan(y) + msg = "" if eq else f"x ({x}) != y ({y}), depth={depth}" + return eq, msg + elif isinstance(x == y, bool): + eq = x == y + msg = "" if eq else f"x ({x}) 
!= y ({y}), depth={depth}" + return eq, msg + # unknown type + else: + raise ValueError(f"Unknown type: {type(x)}, depth={depth}") + + +def _series_equals(x, y, depth): + if x.dtype != y.dtype: + return False, f"x.dtype ({x.dtype}) != y.dtype ({y.dtype}), depth={depth}" + + # if columns are object, recurse over entries and index + if x.dtype == "object": + index_equal = x.index.equals(y.index) + values_equal, values_msg = _deep_equals(list(x.values), list(y.values), depth) + + if not values_equal: + msg = values_msg + elif not index_equal: + msg = f".index, x.index: {x.index}, y.index: {y.index}, depth={depth}" + else: + msg = "" + return index_equal and values_equal, msg + else: + eq = x.equals(y) + msg = "" if eq else f"x ({x}) != y ({y}), depth={depth}" + return eq, msg -def _is_np_nan(x): - return isinstance(x, float) and np.isnan(x) +def _dataframe_equals(x, y, depth): + if not x.columns.equals(y.columns): + return False, f"x.columns ({x.columns}) != y.columns ({y.columns})" -def _tuple_equals(x, y, return_msg=False): - """Test two tuples or lists for equality. + # if columns are equal and at least one is object, recurse over Series + if sum(x.dtypes == "object") > 0: + for i, c in enumerate(x.columns): + eq, msg = _deep_equals(x[c], y[c], depth + 1) - Correct if tuples/lists contain the following valid types: - types compatible with != comparison - pd.Series, pd.DataFrame, np.ndarray - lists, tuples, or dicts of a valid type (recursive) + if not eq: + return False, msg + f", idx={i}" + return True, "" + else: + eq = x.equals(y) + msg = "" if eq else f"x ({x}) != y ({y}), depth={depth}" + return eq, msg - Parameters - ---------- - x: tuple or list - y: tuple or list - return_msg : bool, optional, default=False - whether to return informative message about what is not equal - Returns - ------- - is_equal: bool - True if x and y are equal in value - x and y do not need to be equal in reference - msg : str, only returned if return_msg = True - indication of what is the reason for not being equal - concatenation of the following elements: - .len - length is not equal - [i] - i-th element not equal - """ - - def ret(is_equal, msg): - if return_msg: - if is_equal: - msg = "" - return is_equal, msg - else: - return is_equal - - n = len(x) - - if n != len(y): - return ret(False, f".len, x.len = {n} != y.len = {len(y)}") +def _numpy_equals(x, y, depth): + if x.dtype != y.dtype: + return False, f"x.dtype ({x.dtype}) != y.dtype ({y.dtype})" - # we now know dicts are same length - for i in range(n): - xi = x[i] - yi = y[i] + eq = np.array_equal(x, y, equal_nan=True) + msg = "" if eq else f"x ({x}) != y ({y}), depth={depth}" + return eq, msg - # recurse through xi/yi - is_equal, msg = deep_equals(xi, yi, return_msg=True) - if not is_equal: - return ret(False, f"[{i}]" + msg) - return ret(True, "") +def _csrmatrix_equals(x, y, depth): + if not np.allclose(x.toarray(), y.toarray()): + return False, f"x ({x}) != y ({y}), depth={depth}" + return True, "" -def _dict_equals(x, y, return_msg=False): - """Test two dicts for equality. 
+def _list_equals(x, y, depth):
+    if len(x) != len(y):
+        return False, f"x.len ({len(x)}) != y.len ({len(y)}), depth={depth}"

-    Correct if dicts contain the following valid types:
-        types compatible with != comparison
-        pd.Series, pd.DataFrame, np.ndarray
-        lists, tuples, or dicts of a valid type (recursive)
+    for i in range(len(x)):
+        eq, msg = _deep_equals(x[i], y[i], depth + 1)

-    Parameters
-    ----------
-    x: dict
-    y: dict
-    return_msg : bool, optional, default=False
-        whether to return informative message about what is not equal
-
-    Returns
-    -------
-    is_equal: bool
-        True if x and y are equal in value
-        x and y do not need to be equal in reference
-    msg : str, only returned if return_msg = True
-        indication of what is the reason for not being equal
-        concatenation of the following strings:
-        .keys - keys are not equal
-        [key] - values at key is not equal
-    """
+        if not eq:
+            return False, msg + f", idx={i}"
+    return True, ""

-    def ret(is_equal, msg):
-        if return_msg:
-            if is_equal:
-                msg = ""
-            return is_equal, msg
-        else:
-            return is_equal

+def _dict_equals(x, y, depth):
     xkeys = set(x.keys())
     ykeys = set(y.keys())
-
     if xkeys != ykeys:
         xmy = xkeys.difference(ykeys)
         ymx = ykeys.difference(xkeys)
-        diffmsg = ".keys,"
+
+        msg = "x.keys != y.keys"
         if len(xmy) > 0:
-            diffmsg += f" x.keys-y.keys = {xmy}."
+            msg += f", x.keys-y.keys = {xmy}"
         if len(ymx) > 0:
-            diffmsg += f" y.keys-x.keys = {ymx}."
-        return ret(False, diffmsg)
-
-    # we now know that xkeys == ykeys
-    for key in xkeys:
-        xi = x[key]
-        yi = y[key]
+            msg += f", y.keys-x.keys = {ymx}"

-        # recurse through xi/yi
-        is_equal, msg = deep_equals(xi, yi, return_msg=True)
-        if not is_equal:
-            return ret(False, f"[{key}]" + msg)
+        return False, msg + f", depth={depth}"

-    return ret(True, "")
-
-
-def _fh_equals(x, y, return_msg=False):
-    """Test two forecasting horizons for equality.
-
-    Correct if both x and y are ForecastingHorizon
-
-    Parameters
-    ----------
-    x: ForcastingHorizon
-    y: ForcastingHorizon
-    return_msg : bool, optional, default=False
-        whether to return informative message about what is not equal
-
-    Returns
-    -------
-    is_equal: bool
-        True if x and y are equal in value
-        x and y do not need to be equal in reference
-    msg : str, only returned if return_msg = True
-        indication of what is the reason for not being equal
-        concatenation of the following strings:
-        .is_relative - x is absolute and y is relative, or vice versa
-        .values - values of x and y are not equal
-    """
-
-    def ret(is_equal, msg):
-        if return_msg:
-            if is_equal:
-                msg = ""
-            return is_equal, msg
-        else:
-            return is_equal
-
-    if x.is_relative != y.is_relative:
-        return ret(False, ".is_relative")
-
-    # recurse through values of x, y
-    is_equal, msg = deep_equals(x._values, y._values, return_msg=True)
-    if not is_equal:
-        return ret(False, ".values" + msg)
+    # we now know that xkeys == ykeys
+    for i, key in enumerate(xkeys):
+        eq, msg = _deep_equals(x[key], y[key], depth + 1)
-    return ret(True, "")
+        if not eq:
+            return False, msg + f", idx={i}"
+    return True, ""
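# Sketch of the message a key mismatch now produces (illustrative; the dicts
# are made-up inputs, not part of the patch):
#
#     eq, msg = deep_equals({"a": 1}, {"b": 1}, return_msg=True)
#     # msg: "x.keys != y.keys, x.keys-y.keys = {'a'}, y.keys-x.keys = {'b'}, depth=0"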
diff --git a/aeon/testing/utils/estimator_checks.py b/aeon/testing/utils/estimator_checks.py
index 18227002ba..f5db28964b 100644
--- a/aeon/testing/utils/estimator_checks.py
+++ b/aeon/testing/utils/estimator_checks.py
@@ -1,18 +1,12 @@
 """Utility function for estimator testing."""

-__maintainer__ = []
+__maintainer__ = ["MatthewMiddlehurst"]

 import inspect
-from inspect import isclass, signature
+from inspect import isclass

-import numpy as np
-
-from aeon.base import BaseAeonEstimator
-from aeon.clustering.base import BaseClusterer
-from aeon.regression.base import BaseRegressor
 from aeon.similarity_search.base import BaseSimilaritySearch
 from aeon.testing.testing_data import FULL_TEST_DATA_DICT
-from aeon.transformations.base import BaseTransformer


 def _run_estimator_method(estimator, method_name, datatype, split):
@@ -64,68 +58,3 @@ def _get_tag(estimator, tag_name, default=None, raise_error=False):
     return estimator.get_tag(
         tag_name=tag_name, raise_error=raise_error, tag_value_default=default
     )
-
-
-def _list_required_methods(estimator):
-    """Return list of required method names (beyond BaseAeonEstimator ones)."""
-    # all BaseAeonEstimator children must implement these
-    MUST_HAVE_FOR_OBJECTS = ["set_params", "get_params"]
-
-    # all BaseAeonEstimator children must implement these
-    MUST_HAVE_FOR_ESTIMATORS = [
-        "fit",
-        "check_is_fitted",
-        "is_fitted",  # read-only property
-    ]
-    # prediction/forecasting base classes that must have predict
-    BASE_CLASSES_THAT_MUST_HAVE_PREDICT = (
-        BaseClusterer,
-        BaseRegressor,
-    )
-    # transformation base classes that must have transform
-    BASE_CLASSES_THAT_MUST_HAVE_TRANSFORM = (BaseTransformer,)
-
-    required_methods = []
-
-    if isinstance(estimator, BaseAeonEstimator):
-        required_methods += MUST_HAVE_FOR_OBJECTS
-
-    if isinstance(estimator, BaseAeonEstimator):
-        required_methods += MUST_HAVE_FOR_ESTIMATORS
-
-    if isinstance(estimator, BASE_CLASSES_THAT_MUST_HAVE_PREDICT):
-        required_methods += ["predict"]
-
-    if isinstance(estimator, BASE_CLASSES_THAT_MUST_HAVE_TRANSFORM):
-        required_methods += ["transform"]
-
-    return required_methods
-
-
-def _assert_array_almost_equal(x, y, decimal=6, err_msg=""):
-    np.testing.assert_array_almost_equal(x, y, decimal=decimal, err_msg=err_msg)
-
-
-def _get_args(function, varargs=False):
-    """Get function arguments."""
-    try:
-        params = signature(function).parameters
-    except ValueError:
-        # Error on builtin C function
-        return []
-    args = [
-        key
-        for key, param in params.items()
-        if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)
-    ]
-    if varargs:
-        varargs = [
-            param.name
-            for param in params.values()
-            if param.kind == param.VAR_POSITIONAL
-        ]
-        if len(varargs) == 0:
-            varargs = None
-        return args, varargs
-    else:
-        return args
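# The deleted _assert_array_almost_equal was a thin pass-through, so call
# sites elsewhere in this patch switch to numpy's public helper directly
# (sketch; y_pred and clf stand in for the fitted objects in those tests):
#
#     from numpy.testing import assert_array_almost_equal
#
#     assert_array_almost_equal(y_pred, clf.predict(X_test))  # decimal=6 default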
diff --git a/aeon/testing/utils/tests/test_deep_equals.py b/aeon/testing/utils/tests/test_deep_equals.py
index 63cdb6c0c0..f2d143d236 100644
--- a/aeon/testing/utils/tests/test_deep_equals.py
+++ b/aeon/testing/utils/tests/test_deep_equals.py
@@ -5,11 +5,12 @@
 import numpy as np
 import pandas as pd
 import pytest
+from scipy.sparse import csr_matrix

 from aeon.testing.utils.deep_equals import deep_equals

 # examples used for comparison below
-EXAMPLES = [
+DEEPEQUALS_ITEMS = [
     42,
     [],
     (()),
@@ -19,41 +20,52 @@
     3.5,
     4.2,
     np.nan,
+    pd.Series([1, 2], ["a", "b"]),
     pd.DataFrame({"a": [4, 2]}),
     pd.DataFrame({"a": [4, 3]}),
     (np.array([1, 2, 4]), [pd.DataFrame({"a": [4, 2]})]),
     {"foo": [42], "bar": pd.Series([1, 2])},
-    {"bar": [42], "foo": pd.Series([1, 2])},
+    {"bar": [12], "foo": pd.Series([1, 2])},
+    csr_matrix([1, 2, 3]),
+]
+DEEPEQUALS_PAIRS = [
+    (DEEPEQUALS_ITEMS[i], DEEPEQUALS_ITEMS[j])
+    for i in range(len(DEEPEQUALS_ITEMS))
+    for j in range(len(DEEPEQUALS_ITEMS))
+    if i != j
 ]


-@pytest.mark.parametrize("fixture", EXAMPLES)
-def test_deep_equals_positive(fixture):
+@pytest.mark.parametrize("item", DEEPEQUALS_ITEMS)
+def test_deep_equals_positive(item):
     """Tests that deep_equals correctly identifies equal objects as equal."""
-    x = deepcopy(fixture)
-    y = deepcopy(fixture)
+    x = deepcopy(item)
+    y = deepcopy(item)
+    eq, msg = deep_equals(x, y, return_msg=True)
     msg = (
-        f"deep_copy incorrectly returned False for two identical copies of "
-        f"the following object: {x}"
+        f"deep_equals incorrectly returned False for two identical copies of "
+        f"the following object: {x}. msg = {msg}"
     )
-    assert deep_equals(x, y), msg
-
-
-n = len(EXAMPLES)
-DIFFERENT_PAIRS = [
-    (EXAMPLES[i], EXAMPLES[j]) for i in range(n) for j in range(n) if i != j
-]
+    assert eq, msg


-@pytest.mark.parametrize("fixture1,fixture2", DIFFERENT_PAIRS)
-def test_deep_equals_negative(fixture1, fixture2):
+@pytest.mark.parametrize("item1, item2", DEEPEQUALS_PAIRS)
+def test_deep_equals_negative(item1, item2):
     """Tests that deep_equals correctly identifies unequal objects as unequal."""
-    x = deepcopy(fixture1)
-    y = deepcopy(fixture2)
+    x = deepcopy(item1)
+    y = deepcopy(item2)
+    eq = deep_equals(x, y)
     msg = (
-        f"deep_copy incorrectly returned True when comparing "
-        f"the following, different objects: x={x}, y={y}"
+        f"deep_equals incorrectly returned True when comparing "
+        f"the following, different objects: x={x}, y={y}."
     )
-    assert not deep_equals(x, y), msg
+    assert not eq, msg
+
+
+def test_deep_equals_same():
+    """Tests that deep_equals correctly identifies the same object as equal."""
+    x = [1, 2, 3]
+    eq = deep_equals(x, x)
+    assert eq
diff --git a/aeon/transformations/collection/compose/tests/test_pipeline.py b/aeon/transformations/collection/compose/tests/test_pipeline.py
index 14fb6eb700..a3a8ca2d1e 100644
--- a/aeon/transformations/collection/compose/tests/test_pipeline.py
+++ b/aeon/transformations/collection/compose/tests/test_pipeline.py
@@ -3,6 +3,7 @@
 __maintainer__ = ["MatthewMiddlehurst"]

 import pytest
+from numpy.testing import assert_array_almost_equal
 from sklearn.preprocessing import StandardScaler

 from aeon.testing.data_generation import (
@@ -10,7 +11,6 @@
     make_example_3d_numpy_list,
 )
 from aeon.testing.mock_estimators import MockCollectionTransformer
-from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
 from aeon.transformations.collection import (
     AutocorrelationFunctionTransformer,
     HOG1DTransformer,
@@ -50,7 +50,7 @@ def test_collection_transform_pipeline(transformers):
     for t in transformers:
         X = t.fit_transform(X, y)

-    _assert_array_almost_equal(Xt, X)
+    assert_array_almost_equal(Xt, X)


 def test_unequal_tag_inference():
diff --git a/aeon/utils/sampling.py b/aeon/utils/sampling.py
deleted file mode 100644
index 2860c82b8d..0000000000
--- a/aeon/utils/sampling.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""Functions to sample aeon datasets.
-
-Used in experiments to get deterministic resamples.
-"""
-
-import random
-
-
-def random_partition(n, k=2, seed=42):
-    """Construct a uniformly random partition, iloc reference.
-
-    Parameters
-    ----------
-    n : int
-        size of set to partition
-    k : int, optional, default=2
-        number of sets to partition into
-    seed : int
-        random seed, used in random.shuffle
-
-    Returns
-    -------
-    parts : list of list of int
-        elements of `parts` are lists of iloc int indices between 0 and n-1
-        elements of `parts` are of length floor(n / k) or ceil(n / k)
-        elements of `parts`, as sets, are disjoint partition of [0, ..., n-1]
-        elements of elements of `parts` are in no particular order
-        `parts` is sampled uniformly at random, subject to the above properties
-    """
-    rng = random.Random(seed)
-    idx = list(range(n))
-    rng.shuffle(idx)
-
-    parts = []
-    for i in range(k):
-        d = round(len(idx) / (k - i))
-        parts += [idx[:d]]
-        idx = idx[d:]
-
-    return parts
diff --git a/aeon/utils/tests/test_sampling.py b/aeon/utils/tests/test_sampling.py
deleted file mode 100644
index 68a4f84f98..0000000000
--- a/aeon/utils/tests/test_sampling.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""Testing sampling utilities."""
-
-import pytest
-
-from aeon.testing.utils.deep_equals import deep_equals
-from aeon.utils.sampling import random_partition
-
-NK_FIXTURES = [(10, 3), (15, 5), (19, 6), (3, 1), (1, 2)]
-SEED_FIXTURES = [42, 0, 100, -5]
-
-
-@pytest.mark.parametrize("n, k", NK_FIXTURES)
-def test_partition(n, k):
-    """Test that random_partition returns a disjoint partition."""
-    part = random_partition(n, k)
-
-    assert isinstance(part, list)
-    assert all(isinstance(x, list) for x in part)
-    assert all(isinstance(x, int) for y in part for x in y)
-
-    low_size = n // k
-    hi_size = low_size + 1
-    assert all(len(x) == low_size or len(x) == hi_size for x in part)
-
-    part_union = set()
-    for x in part:
-        part_union = part_union.union(x)
-    assert set(range(n)) == part_union
-
-    for i, x in enumerate(part):
-        for j, y in enumerate(part):
-            if i != j:
-                assert len(set(x).intersection(y)) == 0
-
-
-@pytest.mark.parametrize("seed", SEED_FIXTURES)
-@pytest.mark.parametrize("n, k", NK_FIXTURES)
-def test_seed(n, k, seed):
-    """Test that seed is deterministic."""
-    part = random_partition(n, k, seed)
-    part2 = random_partition(n, k, seed)
-
-    assert deep_equals(part, part2)
diff --git a/aeon/utils/validation/tests/test_series.py b/aeon/utils/validation/tests/test_series.py
index be07ddf3c7..f7f6ed4621 100644
--- a/aeon/utils/validation/tests/test_series.py
+++ b/aeon/utils/validation/tests/test_series.py
@@ -2,12 +2,13 @@

 __maintainer__ = ["TonyBagnall"]

+from typing import Optional, Union
+
 import numpy as np
 import pandas as pd
 import pytest

 from aeon.testing.data_generation import (
-    _make_hierarchical,
     make_example_1d_numpy,
     make_example_2d_numpy_series,
     make_example_3d_numpy,
@@ -64,3 +65,113 @@ def test_check_series():
     with pytest.raises(ValueError, match="Input type of y should be one "):
         check_series(None)  # check
+
+
+def _make_hierarchical(
+    hierarchy_levels: tuple = (2, 4),
+    max_timepoints: int = 12,
+    min_timepoints: int = 12,
+    same_cutoff: bool = True,
+    n_columns: int = 1,
+    all_positive: bool = True,
+    index_type: Optional[str] = None,
+    random_state: Optional[Union[int, np.random.RandomState]] = None,
+    add_nan: bool = False,
+) -> pd.DataFrame:
+    """Generate hierarchical multiindex data for testing.
+
+    Parameters
+    ----------
+    hierarchy_levels : tuple, optional
+        the number of groups at each hierarchy level, by default (2, 4)
+    max_timepoints : int, optional
+        maximum time points a series can have, by default 12
+    min_timepoints : int, optional
+        minimum time points a series can have, by default 12
+    same_cutoff : bool, optional
+        If True, all series end at the same date, by default True
+    n_columns : int, optional
+        number of columns in the output dataframe, by default 1
+    all_positive : bool, optional
+        If True, all values in the series are positive, by default True
+    index_type : str, optional
+        type of index, by default None
+        Supported types are "period", "datetime", "range" or "int".
+        If it's not provided, "datetime" is selected.
+    random_state : int, np.random.RandomState or None
+        Controls the randomness of the generator, by default None
+    add_nan : bool, optional
+        If True, the series will contain NaNs, by default False
+
+    Returns
+    -------
+    pd.DataFrame
+        hierarchical dataframe
+    """
+    from itertools import product
+
+    from sklearn.utils import check_random_state
+
+    def _make_index(n_timepoints, index_type=None):
+        """Make indices for unit testing."""
+        if index_type == "period":
+            start = "2000-01"
+            freq = "M"
+            return pd.period_range(start=start, periods=n_timepoints, freq=freq)
+
+        elif index_type == "datetime" or index_type is None:
+            start = "2000-01-01"
+            freq = "D"
+            return pd.date_range(start=start, periods=n_timepoints, freq=freq)
+
+        elif index_type == "range":
+            start = 3  # check non-zero based indices
+            return pd.RangeIndex(start=start, stop=start + n_timepoints)
+
+        elif index_type == "int":
+            start = 3
+            return pd.Index(np.arange(start, start + n_timepoints), dtype=int)
+
+        else:
+            raise ValueError(f"index_class: {index_type} is not supported")
+
+    levels = [
+        [f"h{i}_{j}" for j in range(hierarchy_levels[i])]
+        for i in range(len(hierarchy_levels))
+    ]
+    level_names = [f"h{i}" for i in range(len(hierarchy_levels))]
+    rng = check_random_state(random_state)
+    if min_timepoints == max_timepoints:
+        time_index = _make_index(max_timepoints, index_type)
+        index = pd.MultiIndex.from_product(
+            levels + [time_index], names=level_names + ["time"]
+        )
+    else:
+        df_list = []
+        for levels_tuple in product(*levels):
+            # randint's high bound is exclusive: lengths fall in [min, max)
+            n_timepoints = rng.randint(low=min_timepoints, high=max_timepoints)
+            if same_cutoff:
+                time_index = _make_index(max_timepoints, index_type)[-n_timepoints:]
+            else:
+                time_index = _make_index(n_timepoints, index_type)
+            d = dict(zip(level_names, levels_tuple))
+            d["time"] = time_index
+            df_list.append(pd.DataFrame(d))
+        index = pd.MultiIndex.from_frame(
+            pd.concat(df_list), names=level_names + ["time"]
+        )
+
+    total_time_points = len(index)
+    data = rng.normal(size=(total_time_points, n_columns))
+    if add_nan:
+        # add some nan values
+        data[int(len(data) / 2)] = np.nan
+        data[0] = np.nan
+        data[-1] = np.nan
+    if all_positive:
+        data -= np.min(data, axis=0) - 1
+    df = pd.DataFrame(
+        data=data, index=index, columns=[f"c{i}" for i in range(n_columns)]
+    )
+
+    return df
diff --git a/aeon/visualisation/results/tests/test_boxplot.py b/aeon/visualisation/results/tests/test_boxplot.py
index cd35f423d2..81b27cbfa0 100644
--- a/aeon/visualisation/results/tests/test_boxplot.py
+++ b/aeon/visualisation/results/tests/test_boxplot.py
@@ -12,7 +12,7 @@

 data_path = os.path.join(
     os.path.dirname(aeon.__file__),
-    "benchmarking/example_results/",
+    "testing/example_results_files/",
 )
diff --git a/aeon/visualisation/results/tests/test_critical_difference.py b/aeon/visualisation/results/tests/test_critical_difference.py
index bcd6645417..2ed87bc96a 100644
--- a/aeon/visualisation/results/tests/test_critical_difference.py
+++ b/aeon/visualisation/results/tests/test_critical_difference.py
@@ -16,7 +16,7 @@

 data_path = os.path.join(
     os.path.dirname(aeon.__file__),
-    "benchmarking/example_results/",
+    "testing/example_results_files/",
 )

 test_clique1 = np.array(
diff --git a/aeon/visualisation/results/tests/test_scatter.py b/aeon/visualisation/results/tests/test_scatter.py
index 7d7a61616e..0c3f4d5bf8 100644
--- a/aeon/visualisation/results/tests/test_scatter.py
+++ b/aeon/visualisation/results/tests/test_scatter.py
@@ -18,7 +18,7 @@

 data_path = os.path.join(
     os.path.dirname(aeon.__file__),
"benchmarking/example_results/", + "testing/example_results_files/", ) diff --git a/aeon/visualisation/results/tests/test_significance.py b/aeon/visualisation/results/tests/test_significance.py index 71b4a456c3..4568d13c92 100644 --- a/aeon/visualisation/results/tests/test_significance.py +++ b/aeon/visualisation/results/tests/test_significance.py @@ -13,7 +13,7 @@ data_path = os.path.join( os.path.dirname(aeon.__file__), - "benchmarking/example_results/", + "testing/example_results_files/", )