From 1aa8a564b044aa032e9b4ebeb5ba4106013363dd Mon Sep 17 00:00:00 2001 From: Wilkins Date: Tue, 26 Jan 2021 10:49:24 -0800 Subject: [PATCH 1/2] Added string compatibility with compute(). Changed min window size to 4 --- matrixprofile/algorithms/mpx.py | 29 ++++- matrixprofile/algorithms/mpx_char.py | 181 +++++++++++++++++++++++++++ matrixprofile/algorithms/skimp.py | 22 ++-- matrixprofile/analyze.py | 3 +- matrixprofile/compute.py | 3 +- 5 files changed, 223 insertions(+), 15 deletions(-) create mode 100644 matrixprofile/algorithms/mpx_char.py diff --git a/matrixprofile/algorithms/mpx.py b/matrixprofile/algorithms/mpx.py index b5a8709..90300ce 100644 --- a/matrixprofile/algorithms/mpx.py +++ b/matrixprofile/algorithms/mpx.py @@ -14,6 +14,9 @@ from matrixprofile import core from matrixprofile.algorithms.cympx import mpx_parallel as cympx_parallel +# --- My import +from matrixprofile.algorithms.mpx_char import mpx_single_char +# --- from matrixprofile.algorithms.cympx import mpx_ab_parallel as cympx_ab_parallel @@ -61,23 +64,37 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1): >>> } """ - ts = core.to_np_array(ts).astype('d') + # --- Drew's addition --- + dtype = core.get_dtype(ts) + ts = core.to_np_array(ts).astype(dtype) + #ts = core.to_np_array(ts).astype('d') n_jobs = core.valid_n_jobs(n_jobs) is_join = False if core.is_array_like(query): - query = core.to_np_array(query).astype('d') + query = core.to_np_array(query).astype(dtype) + #query = core.to_np_array(query).astype('d') is_join = True mp, mpi, mpb, mpib = cympx_ab_parallel(ts, query, w, int(cross_correlation), n_jobs) else: - mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) + # --- More changes... --- + if np.issubdtype(dtype, 'U'): + #ts = np.array([ord(x) for x in ts], dtype = 'd') + mp, mpi = mpx_single_char(ts, w, int(cross_correlation), n_jobs) + else: + mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) + # --- That's it for now... --- + #mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) mp = np.asarray(mp) mpi = np.asarray(mpi) - distance_metric = 'euclidean' - if cross_correlation: - distance_metric = 'cross_correlation' + if np.issubdtype(dtype, 'U'): + distance_metric = 'hamming' + else: + distance_metric = 'euclidean' + if cross_correlation: + distance_metric = 'cross_correlation' return { 'mp': mp, diff --git a/matrixprofile/algorithms/mpx_char.py b/matrixprofile/algorithms/mpx_char.py new file mode 100644 index 0000000..e70fb6e --- /dev/null +++ b/matrixprofile/algorithms/mpx_char.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 20 12:48:43 2021 + +@author: awilkins +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +#import math +from numba import jit, prange + + +#@jit +def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1): + """ + The MPX algorithm computes the matrix profile without using the FFT. Right + now it only supports single dimension self joins. + + Parameters + ---------- + ts : array_like + The time series to compute the matrix profile for. + w : int + The window size. + cross_correlation : bint + Flag (0, 1) to determine if cross_correlation distance should be + returned. It defaults to Euclidean Distance (0). + n_jobs : int, Default = 1 + Number of cpu cores to use. + + Returns + ------- + (array_like, array_like) : + The matrix profile (distance profile, profile index). 
+
+    """
+    #cdef int i, j, diag, offset, threadnum, col
+    n = ts.shape[0]
+
+    # the original implementation allows the minlag to be manually set
+    # here it is always w / 4 similar to SCRIMP++
+    #minlag = int(np.ceil(w / 4.0))
+    profile_len = n - w + 1
+
+    #df = np.empty(profile_len, dtype = 'd')
+    #dg = np.empty(profile_len, dtype = 'd')
+    mp = np.full(profile_len, -1.0, dtype = 'd')
+    mpi = np.full(profile_len, -1, dtype = 'int')
+
+    #tmp_mp = np.full((n_jobs, profile_len), -1.0, dtype = 'd')
+    #tmp_mpi = np.full((n_jobs, profile_len), -1, dtype = 'int')
+
+    # this is where we compute the diagonals and later the matrix profile
+    #df[0] = 0
+    #dg[0] = 0
+
+    # Iterate over every starting location
+    for i in range(w, n + 1):
+        # Select the next 'w' indices starting at ts[i - w]
+        source = ts[i - w: i]
+        dist = np.inf
+        # Slide the starting location
+        for j in range(w, n + 1):
+            # Make sure not to compare with itself
+            if j == i:
+                continue
+            # Select the next 'w' indices starting at ts[j - w]
+            target = ts[j - w: j]
+            # Measure the edit distance between the two windows
+            tmp_dist = editDistance(target, source)
+            # If it beats the best so far, update mp and mpi
+            if tmp_dist < dist:
+                dist = tmp_dist
+                mp[i - w] = dist
+                mpi[i - w] = j - w
+            # Add an early stopping criterion
+            if dist == 0:
+                break
+
+
+    # for i in prange(w, n, num_threads=n_jobs, nogil=True):
+    #     df[i - w + 1] = (0.5 * (ts[i] - ts[i - w]))
+    #     dg[i - w + 1] = (ts[i] - mu[i - w + 1]) + (ts[i - w] - mu[i - w])
+
+    # for diag in prange(minlag + 1, profile_len, num_threads=n_jobs, nogil=True):
+    #     c = 0
+    #     threadnum = openmp.omp_get_thread_num()
+    #     for i in range(diag, diag + w):
+    #         c = c + ((ts[i] - mu[diag]) * (ts[i-diag] - mu[0]))
+
+    #     for offset in range(n - w - diag + 1):
+    #         col = offset + diag
+    #         c = c + df[offset] * dg[col] + df[col] * dg[offset]
+    #         c_cmp = c * sig[offset] * sig[col]
+
+    #         # update the distance profile and profile index
+    #         if c_cmp > tmp_mp[threadnum, offset]:
+    #             tmp_mp[threadnum, offset] = c_cmp
+    #             tmp_mpi[threadnum, offset] = col
+
+    #         if c_cmp > tmp_mp[threadnum, col]:
+    #             if c_cmp > 1.0:
+    #                 c_cmp = 1.0
+    #             tmp_mp[threadnum, col] = c_cmp
+    #             tmp_mpi[threadnum, col] = offset
+
+    # # combine parallel results...
+    # for i in range(tmp_mp.shape[0]):
+    #     for j in range(tmp_mp.shape[1]):
+    #         if tmp_mp[i,j] > mp[j]:
+    #             if tmp_mp[i, j] > 1.0:
+    #                 mp[j] = 1.0
+    #             else:
+    #                 mp[j] = tmp_mp[i, j]
+    #             mpi[j] = tmp_mpi[i, j]
+
+    # # convert normalized cross correlation to euclidean distance
+    # if cross_correlation == 0:
+    #     for i in range(profile_len):
+    #         mp[i] = sqrt(2.0 * w * (1.0 - mp[i]))
+
+    return (mp, mpi)
+
+
+
+def editDistance(target, source):
+    """
+    Returns the Levenshtein distance between two strings or vectors of strings.
+
+    Parameters
+    ----------
+    target : str, np.ndarray
+        String or vector to be compared against source.
+    source : str, np.ndarray
+        String or vector to be compared to target. len(source) should be
+        greater than or equal to len(target).
+
+    Returns
+    -------
+    distance : int
+        Levenshtein distance between target and source.
+
+    """
+
+    # Ensure source is the longer of the two strings (swap the arguments,
+    # otherwise the recursion never terminates)
+    if len(source) < len(target):
+        return editDistance(source, target)
+    # So now we have len(source) >= len(target).
+ if len(target) == 0: + return len(source) + + # We call tuple() to force strings to be used as sequences + source = np.array(tuple(source)) + target = np.array(tuple(target)) + + # We use a dynamic programming algorithm, but with the added optimization + # that we only need the last two rows of the matrix. + previous_row = np.arange(target.size + 1) + for s in source: + # Insertion (target grows longer than source): + current_row = previous_row + 1 + + # Substitution or matching: + # Target and source items are aligned, and either + # are different (cost of 1), or are the same (cost of 0). + current_row[1:] = np.minimum( + current_row[1:], + np.add(previous_row[:-1], target != s)) + + # Deletion (target grows shorter than source): + current_row[1:] = np.minimum( + current_row[1:], + current_row[0:-1] + 1) + + previous_row = current_row + + return previous_row[-1] \ No newline at end of file diff --git a/matrixprofile/algorithms/skimp.py b/matrixprofile/algorithms/skimp.py index 6806849..b721b90 100644 --- a/matrixprofile/algorithms/skimp.py +++ b/matrixprofile/algorithms/skimp.py @@ -224,10 +224,16 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False, if int_pct % 5 == 0 and int_pct not in pct_shown: print('{}% complete'.format(int_pct)) pct_shown[int_pct] = 1 - - metric = 'euclidean' - if cross_correlation: - metric = 'pearson' + + if np.issubdtype(ts.dtype, 'U'): + metric = 'hamming' + else: + metric = 'euclidean' + if cross_correlation: + metric = 'pearson' + # metric = 'euclidean' + # if cross_correlation: + # metric = 'pearson' return { 'pmp': pmp, @@ -244,13 +250,13 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False, def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1, - include_pmp=False, lower_window=8): + include_pmp=False, lower_window=4): """ Finds the maximum subsequence length based on the threshold provided. Note that this threshold is domain specific requiring some knowledge about the underyling time series in question. - The subsequence length starts at 8 and iteratively doubles until the + The subsequence length starts at 4 and iteratively doubles until the maximum correlation coefficient is no longer met. When no solution is possible given the threshold, a matrixprofile.exceptions.NoSolutionPossible exception is raised. @@ -303,7 +309,9 @@ def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1, def resize(mp, pi, n): """Helper function to resize mp and pi to be aligned with the PMP. 
         Also convert pearson to euclidean."""
-        mp = core.pearson_to_euclidean(profile['mp'], window_size)
+        # Only convert pearson to euclidean if not string data type
+        if not np.issubdtype(ts.dtype, 'U'):
+            mp = core.pearson_to_euclidean(profile['mp'], window_size)
         infs = np.full(n - mp.shape[0], np.inf)
         nans = np.full(n - mp.shape[0], np.nan)
         mp = np.append(mp, infs)
diff --git a/matrixprofile/analyze.py b/matrixprofile/analyze.py
index 2b10a74..7e0675e 100644
--- a/matrixprofile/analyze.py
+++ b/matrixprofile/analyze.py
@@ -65,7 +65,8 @@ def analyze_pmp(ts, query, sample_pct, threshold, windows=None, n_jobs=1):
 
     # determine windows to be computed
-    # from 8 in steps of 2 until upper w
-    start = 8
+    # from 4 in steps of 2 until upper w
+    start = 4
+    #start = 8
     windows = range(start, profile['upper_window'] + 1)
 
     # compute the pmp
diff --git a/matrixprofile/compute.py b/matrixprofile/compute.py
index 7b776f3..934f35d 100644
--- a/matrixprofile/compute.py
+++ b/matrixprofile/compute.py
@@ -118,7 +118,8 @@ def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,
 
     # determine windows to be computed
-    # from 8 in steps of 2 until upper w
-    start = 8
+    # from 4 in steps of 2 until upper w
+    start = 4
+    #start = 8
     windows = range(start, profile['upper_window'] + 1)
 
     # compute the pmp

From c26431eba78e6aaf322617a90cd6337032c9b984 Mon Sep 17 00:00:00 2001
From: Wilkins
Date: Tue, 26 Jan 2021 11:58:26 -0800
Subject: [PATCH 2/2] Fixed documentation on mpx_single_char and call signature

---
 MANIFEST.in                          |  2 +
 matrixprofile/algorithms/mpx.py      |  2 +-
 matrixprofile/algorithms/mpx_char.py | 66 +---------------------------
 3 files changed, 5 insertions(+), 65 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..4931333
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include version.py
+recursive-include . *.pyx *.proto *.txt *.csv *.json
diff --git a/matrixprofile/algorithms/mpx.py b/matrixprofile/algorithms/mpx.py
index 90300ce..34f5c87 100644
--- a/matrixprofile/algorithms/mpx.py
+++ b/matrixprofile/algorithms/mpx.py
@@ -81,7 +81,7 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
         # --- More changes... ---
         if np.issubdtype(dtype, 'U'):
             #ts = np.array([ord(x) for x in ts], dtype = 'd')
-            mp, mpi = mpx_single_char(ts, w, int(cross_correlation), n_jobs)
+            mp, mpi = mpx_single_char(ts, w)
         else:
             mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
         # --- That's it for now... ---
         #mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
diff --git a/matrixprofile/algorithms/mpx_char.py b/matrixprofile/algorithms/mpx_char.py
index e70fb6e..cec5ddd 100644
--- a/matrixprofile/algorithms/mpx_char.py
+++ b/matrixprofile/algorithms/mpx_char.py
@@ -15,10 +15,9 @@
 
 
 #@jit
-def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
+def mpx_single_char(ts, w):
     """
-    The MPX algorithm computes the matrix profile without using the FFT. Right
-    now it only supports single dimension self joins.
+    The MPX algorithm computes the matrix profile using edit (Levenshtein) distance.
 
     Parameters
     ----------
@@ -26,11 +25,6 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
         The time series to compute the matrix profile for.
     w : int
         The window size.
-    cross_correlation : bint
-        Flag (0, 1) to determine if cross_correlation distance should be
-        returned. It defaults to Euclidean Distance (0).
-    n_jobs : int, Default = 1
-        Number of cpu cores to use.
 
     Returns
     -------
@@ -38,26 +32,12 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
         The matrix profile (distance profile, profile index).
""" - #cdef int i, j, diag, offset, threadnum, col n = ts.shape[0] - # the original implementation allows the minlag to be manually set - # here it is always w / 4 similar to SCRIMP++ - #minlag = int(np.ceil(w / 4.0)) profile_len = n - w + 1 - #df = np.empty(profile_len, dtype = 'd') - #dg = np.empty(profile_len, dtype = 'd') mp = np.full(profile_len, -1.0, dtype = 'd') mpi = np.full(profile_len, -1, dtype = 'int') - - #tmp_mp = np.full((n_jobs, profile_len), -1.0, dtype = 'd') - #tmp_mpi = np.full((n_jobs, profile_len), -1, dtype = 'int') - - # this is where we compute the diagonals and later the matrix profile - #df[0] = 0 - #dg[0] = 0 - # Iterate over every starting location for i in range(w, n + 1): # Select the next 'w' indices starting at ts[i - w] @@ -81,48 +61,6 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1): if dist == 0: break - - # for i in prange(w, n, num_threads=n_jobs, nogil=True): - # df[i - w + 1] = (0.5 * (ts[i] - ts[i - w])) - # dg[i - w + 1] = (ts[i] - mu[i - w + 1]) + (ts[i - w] - mu[i - w]) - - # for diag in prange(minlag + 1, profile_len, num_threads=n_jobs, nogil=True): - # c = 0 - # threadnum = openmp.omp_get_thread_num() - # for i in range(diag, diag + w): - # c = c + ((ts[i] - mu[diag]) * (ts[i-diag] - mu[0])) - - # for offset in range(n - w - diag + 1): - # col = offset + diag - # c = c + df[offset] * dg[col] + df[col] * dg[offset] - # c_cmp = c * sig[offset] * sig[col] - - # # update the distance profile and profile index - # if c_cmp > tmp_mp[threadnum, offset]: - # tmp_mp[threadnum, offset] = c_cmp - # tmp_mpi[threadnum, offset] = col - - # if c_cmp > tmp_mp[threadnum, col]: - # if c_cmp > 1.0: - # c_cmp = 1.0 - # tmp_mp[threadnum, col] = c_cmp - # tmp_mpi[threadnum, col] = offset - - # # combine parallel results... - # for i in range(tmp_mp.shape[0]): - # for j in range(tmp_mp.shape[1]): - # if tmp_mp[i,j] > mp[j]: - # if tmp_mp[i, j] > 1.0: - # mp[j] = 1.0 - # else: - # mp[j] = tmp_mp[i, j] - # mpi[j] = tmp_mpi[i, j] - - # # convert normalized cross correlation to euclidean distance - # if cross_correlation == 0: - # for i in range(profile_len): - # mp[i] = sqrt(2.0 * w * (1.0 - mp[i])) - return (mp, mpi)