Compatibility with string data types #74

Open · wants to merge 3 commits into master
29 changes: 23 additions & 6 deletions matrixprofile/algorithms/mpx.py
@@ -14,6 +14,9 @@

from matrixprofile import core
from matrixprofile.algorithms.cympx import mpx_parallel as cympx_parallel
# --- My import
from matrixprofile.algorithms.mpx_char import mpx_single_char
# ---
from matrixprofile.algorithms.cympx import mpx_ab_parallel as cympx_ab_parallel


@@ -61,23 +64,37 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
>>> }

"""
ts = core.to_np_array(ts).astype('d')
# --- Drew's addition ---
dtype = core.get_dtype(ts)
ts = core.to_np_array(ts).astype(dtype)
#ts = core.to_np_array(ts).astype('d')
n_jobs = core.valid_n_jobs(n_jobs)
is_join = False

if core.is_array_like(query):
query = core.to_np_array(query).astype('d')
query = core.to_np_array(query).astype(dtype)
#query = core.to_np_array(query).astype('d')
is_join = True
mp, mpi, mpb, mpib = cympx_ab_parallel(ts, query, w,
int(cross_correlation), n_jobs)
else:
mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
# --- More changes... ---
if np.issubdtype(dtype, 'U'):
#ts = np.array([ord(x) for x in ts], dtype = 'd')
mp, mpi = mpx_single_char(ts, w)
else:
mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
# --- That's it for now... ---
#mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)

mp = np.asarray(mp)
mpi = np.asarray(mpi)
distance_metric = 'euclidean'
if cross_correlation:
distance_metric = 'cross_correlation'
if np.issubdtype(dtype, 'U'):
distance_metric = 'hamming'
else:
distance_metric = 'euclidean'
if cross_correlation:
distance_metric = 'cross_correlation'

return {
'mp': mp,
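For reference, a minimal usage sketch of the string path added above. It assumes this branch is installed and that the core.get_dtype helper referenced in the diff is available; the toy series is purely illustrative.

import numpy as np
from matrixprofile.algorithms.mpx import mpx

# A character series stored as an array of single characters (dtype '<U1').
seq = np.array(list("abcabcabcxyzabc"))

# With a string dtype, mpx should route to mpx_single_char instead of the
# Cython kernels and expose the distance profile under the usual 'mp' key.
profile = mpx(seq, w=3)
print(profile['mp'])   # windows that repeat exactly (e.g. 'abc') should come out as 0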
119 changes: 119 additions & 0 deletions matrixprofile/algorithms/mpx_char.py
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 20 12:48:43 2021

@author: awilkins
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
#import math
from numba import jit, prange


#@jit
def mpx_single_char(ts, w):
"""
    The MPX algorithm computes the matrix profile for string data using edit (Levenshtein) distance.

Parameters
----------
ts : array_like
The time series to compute the matrix profile for.
w : int
The window size.

Returns
-------
(array_like, array_like) :
The matrix profile (distance profile, profile index).

"""
n = ts.shape[0]

profile_len = n - w + 1

mp = np.full(profile_len, -1.0, dtype = 'd')
mpi = np.full(profile_len, -1, dtype = 'int')
# Iterate over every starting location
for i in range(w, n + 1):
# Select the next 'w' indices starting at ts[i - w]
source = ts[i - w: i]
dist = np.inf
# Slide the starting location
for j in range(w, n + 1):
# Make sure not to compare with itself
if j == i:
continue
# Select the next 'w' indices starting at ts[j - w]
target = ts[j - w: j]
            # Measure the edit (Levenshtein) distance between the two windows
tmp_dist = editDistance(target, source)
# If it beats the best so far, update mp and mpi
if tmp_dist < dist:
dist = tmp_dist
mp[i - w] = dist
mpi[i - w] = j - w
            # Stop early once an exact match is found
if dist == 0:
break

return (mp, mpi)



def editDistance(target, source):
"""
Returns the Levenshtein distance between two strings or vectors of strings.

Parameters
----------
target : str, np.ndarray
String or vector to be compared against source.
source : str, np.ndarray
String or vector to be compared to target. len(source) should be
greater than or equal to len(target).

Returns
-------
distance : int
Levenshtein distance between target and source.

"""

    # Ensure source is the longer of the two sequences by swapping the
    # arguments; re-calling with the original order would recurse forever.
    if len(source) < len(target):
        return editDistance(source, target)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)

# We call tuple() to force strings to be used as sequences
source = np.array(tuple(source))
target = np.array(tuple(target))

# We use a dynamic programming algorithm, but with the added optimization
# that we only need the last two rows of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1

# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))

# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)

previous_row = current_row

return previous_row[-1]
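As a quick sanity check (illustrative only), editDistance should reproduce the standard Levenshtein values, and mpx_single_char should return an all-zero profile for a series in which every window repeats exactly:

import numpy as np
from matrixprofile.algorithms.mpx_char import editDistance, mpx_single_char

print(editDistance("kitten", "sitting"))   # 3: two substitutions and one insertion
print(editDistance("abc", "abc"))          # 0: identical sequences
print(editDistance("", "abc"))             # 3: three insertions

# Every length-3 window of "abcabcabc" has an exact duplicate elsewhere,
# so each profile entry should be 0.
seq = np.array(list("abcabcabc"))
mp, mpi = mpx_single_char(seq, 3)
print(mp)   # expected: [0. 0. 0. 0. 0. 0. 0.]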
22 changes: 15 additions & 7 deletions matrixprofile/algorithms/skimp.py
@@ -224,10 +224,16 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False,
if int_pct % 5 == 0 and int_pct not in pct_shown:
print('{}% complete'.format(int_pct))
pct_shown[int_pct] = 1

metric = 'euclidean'
if cross_correlation:
metric = 'pearson'

if np.issubdtype(ts.dtype, 'U'):
metric = 'hamming'
else:
metric = 'euclidean'
if cross_correlation:
metric = 'pearson'
# metric = 'euclidean'
# if cross_correlation:
# metric = 'pearson'

return {
'pmp': pmp,
@@ -244,13 +250,13 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False,


def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1,
include_pmp=False, lower_window=8):
include_pmp=False, lower_window=4):
"""
Finds the maximum subsequence length based on the threshold provided. Note
    that this threshold is domain specific, requiring some knowledge about the
    underlying time series in question.

The subsequence length starts at 8 and iteratively doubles until the
The subsequence length starts at 4 and iteratively doubles until the
maximum correlation coefficient is no longer met. When no solution is
possible given the threshold, a matrixprofile.exceptions.NoSolutionPossible
exception is raised.
@@ -303,7 +309,9 @@ def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1,
def resize(mp, pi, n):
"""Helper function to resize mp and pi to be aligned with the
PMP. Also convert pearson to euclidean."""
mp = core.pearson_to_euclidean(profile['mp'], window_size)
# Only convert pearson to euclidean if not string data type
if not np.issubdtype(ts.dtype, 'U'):
mp = core.pearson_to_euclidean(profile['mp'], window_size)
infs = np.full(n - mp.shape[0], np.inf)
nans = np.full(n - mp.shape[0], np.nan)
mp = np.append(mp, infs)
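The metric selection added to skimp above reduces to a dtype check. Below is a standalone sketch of the same logic; pick_metric is just an illustrative stand-in, not part of the library, and np.str_ is the documented abstract type behind the 'U' character code used in the diff.

import numpy as np

def pick_metric(ts, cross_correlation=False):
    # String (unicode) series get the new 'hamming' label; numeric series keep
    # the existing euclidean / pearson behaviour.
    if np.issubdtype(ts.dtype, np.str_):
        return 'hamming'
    return 'pearson' if cross_correlation else 'euclidean'

print(pick_metric(np.array(list("abca"))))        # 'hamming'
print(pick_metric(np.array([1.0, 2.0]), True))    # 'pearson'
print(pick_metric(np.array([1.0, 2.0])))          # 'euclidean'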
3 changes: 2 additions & 1 deletion matrixprofile/analyze.py
@@ -65,7 +65,8 @@ def analyze_pmp(ts, query, sample_pct, threshold, windows=None, n_jobs=1):

# determine windows to be computed
# from 8 in steps of 2 until upper w
start = 8
start = 4
#start = 8
windows = range(start, profile['upper_window'] + 1)

# compute the pmp
3 changes: 2 additions & 1 deletion matrixprofile/compute.py
@@ -118,7 +118,8 @@ def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,

# determine windows to be computed
# from 8 in steps of 2 until upper w
start = 8
start = 4
#start = 8
windows = range(start, profile['upper_window'] + 1)

# compute the pmp