From 1aa8a564b044aa032e9b4ebeb5ba4106013363dd Mon Sep 17 00:00:00 2001 From: Wilkins Date: Tue, 26 Jan 2021 10:49:24 -0800 Subject: [PATCH 1/2] Added string compatibility with compute(). Changed min window size to 4 --- matrixprofile/algorithms/mpx.py | 29 ++++- matrixprofile/algorithms/mpx_char.py | 181 +++++++++++++++++++++++++++ matrixprofile/algorithms/skimp.py | 22 ++-- matrixprofile/analyze.py | 3 +- matrixprofile/compute.py | 3 +- 5 files changed, 223 insertions(+), 15 deletions(-) create mode 100644 matrixprofile/algorithms/mpx_char.py diff --git a/matrixprofile/algorithms/mpx.py b/matrixprofile/algorithms/mpx.py index b5a8709..90300ce 100644 --- a/matrixprofile/algorithms/mpx.py +++ b/matrixprofile/algorithms/mpx.py @@ -14,6 +14,9 @@ from matrixprofile import core from matrixprofile.algorithms.cympx import mpx_parallel as cympx_parallel +# --- My import +from matrixprofile.algorithms.mpx_char import mpx_single_char +# --- from matrixprofile.algorithms.cympx import mpx_ab_parallel as cympx_ab_parallel @@ -61,23 +64,37 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1): >>> } """ - ts = core.to_np_array(ts).astype('d') + # --- Drew's addition --- + dtype = core.get_dtype(ts) + ts = core.to_np_array(ts).astype(dtype) + #ts = core.to_np_array(ts).astype('d') n_jobs = core.valid_n_jobs(n_jobs) is_join = False if core.is_array_like(query): - query = core.to_np_array(query).astype('d') + query = core.to_np_array(query).astype(dtype) + #query = core.to_np_array(query).astype('d') is_join = True mp, mpi, mpb, mpib = cympx_ab_parallel(ts, query, w, int(cross_correlation), n_jobs) else: - mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) + # --- More changes... --- + if np.issubdtype(dtype, 'U'): + #ts = np.array([ord(x) for x in ts], dtype = 'd') + mp, mpi = mpx_single_char(ts, w, int(cross_correlation), n_jobs) + else: + mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) + # --- That's it for now... --- + #mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs) mp = np.asarray(mp) mpi = np.asarray(mpi) - distance_metric = 'euclidean' - if cross_correlation: - distance_metric = 'cross_correlation' + if np.issubdtype(dtype, 'U'): + distance_metric = 'hamming' + else: + distance_metric = 'euclidean' + if cross_correlation: + distance_metric = 'cross_correlation' return { 'mp': mp, diff --git a/matrixprofile/algorithms/mpx_char.py b/matrixprofile/algorithms/mpx_char.py new file mode 100644 index 0000000..e70fb6e --- /dev/null +++ b/matrixprofile/algorithms/mpx_char.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 20 12:48:43 2021 + +@author: awilkins +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +#import math +from numba import jit, prange + + +#@jit +def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1): + """ + The MPX algorithm computes the matrix profile without using the FFT. Right + now it only supports single dimension self joins. + + Parameters + ---------- + ts : array_like + The time series to compute the matrix profile for. + w : int + The window size. + cross_correlation : bint + Flag (0, 1) to determine if cross_correlation distance should be + returned. It defaults to Euclidean Distance (0). + n_jobs : int, Default = 1 + Number of cpu cores to use. + + Returns + ------- + (array_like, array_like) : + The matrix profile (distance profile, profile index). 
+
+    """
+    #cdef int i, j, diag, offset, threadnum, col
+    n = ts.shape[0]
+
+    # the original implementation allows the minlag to be manually set
+    # here it is always w / 4 similar to SCRIMP++
+    #minlag = int(np.ceil(w / 4.0))
+    profile_len = n - w + 1
+
+    #df = np.empty(profile_len, dtype = 'd')
+    #dg = np.empty(profile_len, dtype = 'd')
+    mp = np.full(profile_len, -1.0, dtype = 'd')
+    mpi = np.full(profile_len, -1, dtype = 'int')
+
+    #tmp_mp = np.full((n_jobs, profile_len), -1.0, dtype = 'd')
+    #tmp_mpi = np.full((n_jobs, profile_len), -1, dtype = 'int')
+
+    # this is where we compute the diagonals and later the matrix profile
+    #df[0] = 0
+    #dg[0] = 0
+
+    # Iterate over every starting location
+    for i in range(w, n + 1):
+        # Select the next 'w' indices starting at ts[i - w]
+        source = ts[i - w: i]
+        dist = np.inf
+        # Slide the starting location
+        for j in range(w, n + 1):
+            # Make sure not to compare with itself
+            if j == i:
+                continue
+            # Select the next 'w' indices starting at ts[j - w]
+            target = ts[j - w: j]
+            # Measure the edit distance between the two windows
+            tmp_dist = editDistance(target, source)
+            # If it beats the best so far, update mp and mpi
+            if tmp_dist < dist:
+                dist = tmp_dist
+                mp[i - w] = dist
+                mpi[i - w] = j - w
+            # Add an early stopping criterion
+            if dist == 0:
+                break
+
+
+    # for i in prange(w, n, num_threads=n_jobs, nogil=True):
+    #     df[i - w + 1] = (0.5 * (ts[i] - ts[i - w]))
+    #     dg[i - w + 1] = (ts[i] - mu[i - w + 1]) + (ts[i - w] - mu[i - w])
+
+    # for diag in prange(minlag + 1, profile_len, num_threads=n_jobs, nogil=True):
+    #     c = 0
+    #     threadnum = openmp.omp_get_thread_num()
+    #     for i in range(diag, diag + w):
+    #         c = c + ((ts[i] - mu[diag]) * (ts[i-diag] - mu[0]))
+
+    #     for offset in range(n - w - diag + 1):
+    #         col = offset + diag
+    #         c = c + df[offset] * dg[col] + df[col] * dg[offset]
+    #         c_cmp = c * sig[offset] * sig[col]
+
+    #         # update the distance profile and profile index
+    #         if c_cmp > tmp_mp[threadnum, offset]:
+    #             tmp_mp[threadnum, offset] = c_cmp
+    #             tmp_mpi[threadnum, offset] = col
+
+    #         if c_cmp > tmp_mp[threadnum, col]:
+    #             if c_cmp > 1.0:
+    #                 c_cmp = 1.0
+    #             tmp_mp[threadnum, col] = c_cmp
+    #             tmp_mpi[threadnum, col] = offset
+
+    # # combine parallel results...
+    # for i in range(tmp_mp.shape[0]):
+    #     for j in range(tmp_mp.shape[1]):
+    #         if tmp_mp[i,j] > mp[j]:
+    #             if tmp_mp[i, j] > 1.0:
+    #                 mp[j] = 1.0
+    #             else:
+    #                 mp[j] = tmp_mp[i, j]
+    #             mpi[j] = tmp_mpi[i, j]
+
+    # # convert normalized cross correlation to euclidean distance
+    # if cross_correlation == 0:
+    #     for i in range(profile_len):
+    #         mp[i] = sqrt(2.0 * w * (1.0 - mp[i]))
+
+    return (mp, mpi)
+
+
+
+def editDistance(target, source):
+    """
+    Returns the Levenshtein distance between two strings or vectors of strings.
+
+    Parameters
+    ----------
+    target : str, np.ndarray
+        String or vector to be compared against source.
+    source : str, np.ndarray
+        String or vector to be compared to target. len(source) should be
+        greater than or equal to len(target).
+
+    Returns
+    -------
+    distance : int
+        Levenshtein distance between target and source.
+
+    """
+
+    # Ensure source is the longer of the two strings (swap the arguments,
+    # otherwise the recursion never terminates)
+    if len(source) < len(target):
+        return editDistance(source, target)
+    # So now we have len(source) >= len(target).
+ if len(target) == 0: + return len(source) + + # We call tuple() to force strings to be used as sequences + source = np.array(tuple(source)) + target = np.array(tuple(target)) + + # We use a dynamic programming algorithm, but with the added optimization + # that we only need the last two rows of the matrix. + previous_row = np.arange(target.size + 1) + for s in source: + # Insertion (target grows longer than source): + current_row = previous_row + 1 + + # Substitution or matching: + # Target and source items are aligned, and either + # are different (cost of 1), or are the same (cost of 0). + current_row[1:] = np.minimum( + current_row[1:], + np.add(previous_row[:-1], target != s)) + + # Deletion (target grows shorter than source): + current_row[1:] = np.minimum( + current_row[1:], + current_row[0:-1] + 1) + + previous_row = current_row + + return previous_row[-1] \ No newline at end of file diff --git a/matrixprofile/algorithms/skimp.py b/matrixprofile/algorithms/skimp.py index 6806849..b721b90 100644 --- a/matrixprofile/algorithms/skimp.py +++ b/matrixprofile/algorithms/skimp.py @@ -224,10 +224,16 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False, if int_pct % 5 == 0 and int_pct not in pct_shown: print('{}% complete'.format(int_pct)) pct_shown[int_pct] = 1 - - metric = 'euclidean' - if cross_correlation: - metric = 'pearson' + + if np.issubdtype(ts.dtype, 'U'): + metric = 'hamming' + else: + metric = 'euclidean' + if cross_correlation: + metric = 'pearson' + # metric = 'euclidean' + # if cross_correlation: + # metric = 'pearson' return { 'pmp': pmp, @@ -244,13 +250,13 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False, def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1, - include_pmp=False, lower_window=8): + include_pmp=False, lower_window=4): """ Finds the maximum subsequence length based on the threshold provided. Note that this threshold is domain specific requiring some knowledge about the underyling time series in question. - The subsequence length starts at 8 and iteratively doubles until the + The subsequence length starts at 4 and iteratively doubles until the maximum correlation coefficient is no longer met. When no solution is possible given the threshold, a matrixprofile.exceptions.NoSolutionPossible exception is raised. @@ -303,7 +309,9 @@ def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1, def resize(mp, pi, n): """Helper function to resize mp and pi to be aligned with the PMP. 
         Also convert pearson to euclidean."""
-        mp = core.pearson_to_euclidean(profile['mp'], window_size)
+        # Only convert pearson to euclidean if not string data type
+        if not np.issubdtype(ts.dtype, 'U'):
+            mp = core.pearson_to_euclidean(profile['mp'], window_size)
         infs = np.full(n - mp.shape[0], np.inf)
         nans = np.full(n - mp.shape[0], np.nan)
         mp = np.append(mp, infs)
diff --git a/matrixprofile/analyze.py b/matrixprofile/analyze.py
index 2b10a74..7e0675e 100644
--- a/matrixprofile/analyze.py
+++ b/matrixprofile/analyze.py
@@ -65,7 +65,8 @@ def analyze_pmp(ts, query, sample_pct, threshold, windows=None, n_jobs=1):
 
     # determine windows to be computed
-    # from 8 in steps of 2 until upper w
-    start = 8
+    # from 4 in steps of 2 until upper w
+    start = 4
+    #start = 8
     windows = range(start, profile['upper_window'] + 1)
 
     # compute the pmp
diff --git a/matrixprofile/compute.py b/matrixprofile/compute.py
index 7b776f3..934f35d 100644
--- a/matrixprofile/compute.py
+++ b/matrixprofile/compute.py
@@ -118,7 +118,8 @@ def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,
 
     # determine windows to be computed
-    # from 8 in steps of 2 until upper w
-    start = 8
+    # from 4 in steps of 2 until upper w
+    start = 4
+    #start = 8
     windows = range(start, profile['upper_window'] + 1)
 
     # compute the pmp

From c26431eba78e6aaf322617a90cd6337032c9b984 Mon Sep 17 00:00:00 2001
From: Wilkins
Date: Tue, 26 Jan 2021 11:58:26 -0800
Subject: [PATCH 2/2] Fixed documentation on mpx_single_char and call signature

---
 MANIFEST.in                          |  2 +
 matrixprofile/algorithms/mpx.py      |  2 +-
 matrixprofile/algorithms/mpx_char.py | 66 +---------------------------
 3 files changed, 5 insertions(+), 65 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..4931333
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include version.py
+recursive-include . *.pyx *.proto *.txt *.csv *.json
diff --git a/matrixprofile/algorithms/mpx.py b/matrixprofile/algorithms/mpx.py
index 90300ce..34f5c87 100644
--- a/matrixprofile/algorithms/mpx.py
+++ b/matrixprofile/algorithms/mpx.py
@@ -81,7 +81,7 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
         # --- More changes... ---
         if np.issubdtype(dtype, 'U'):
             #ts = np.array([ord(x) for x in ts], dtype = 'd')
-            mp, mpi = mpx_single_char(ts, w, int(cross_correlation), n_jobs)
+            mp, mpi = mpx_single_char(ts, w)
         else:
             mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
         # --- That's it for now... ---
         #mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
diff --git a/matrixprofile/algorithms/mpx_char.py b/matrixprofile/algorithms/mpx_char.py
index e70fb6e..cec5ddd 100644
--- a/matrixprofile/algorithms/mpx_char.py
+++ b/matrixprofile/algorithms/mpx_char.py
@@ -15,10 +15,9 @@
 
 
 #@jit
-def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
+def mpx_single_char(ts, w):
     """
-    The MPX algorithm computes the matrix profile without using the FFT. Right
-    now it only supports single dimension self joins.
+    The MPX algorithm computes the matrix profile using edit (Levenshtein) distance.
 
     Parameters
     ----------
@@ -26,11 +25,6 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
         The time series to compute the matrix profile for.
     w : int
         The window size.
-    cross_correlation : bint
-        Flag (0, 1) to determine if cross_correlation distance should be
-        returned. It defaults to Euclidean Distance (0).
-    n_jobs : int, Default = 1
-        Number of cpu cores to use.
 
     Returns
     -------
@@ -38,26 +32,12 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1):
         The matrix profile (distance profile, profile index).
""" - #cdef int i, j, diag, offset, threadnum, col n = ts.shape[0] - # the original implementation allows the minlag to be manually set - # here it is always w / 4 similar to SCRIMP++ - #minlag = int(np.ceil(w / 4.0)) profile_len = n - w + 1 - #df = np.empty(profile_len, dtype = 'd') - #dg = np.empty(profile_len, dtype = 'd') mp = np.full(profile_len, -1.0, dtype = 'd') mpi = np.full(profile_len, -1, dtype = 'int') - - #tmp_mp = np.full((n_jobs, profile_len), -1.0, dtype = 'd') - #tmp_mpi = np.full((n_jobs, profile_len), -1, dtype = 'int') - - # this is where we compute the diagonals and later the matrix profile - #df[0] = 0 - #dg[0] = 0 - # Iterate over every starting location for i in range(w, n + 1): # Select the next 'w' indices starting at ts[i - w] @@ -81,48 +61,6 @@ def mpx_single_char(ts, w, cross_correlation = 0, n_jobs = 1): if dist == 0: break - - # for i in prange(w, n, num_threads=n_jobs, nogil=True): - # df[i - w + 1] = (0.5 * (ts[i] - ts[i - w])) - # dg[i - w + 1] = (ts[i] - mu[i - w + 1]) + (ts[i - w] - mu[i - w]) - - # for diag in prange(minlag + 1, profile_len, num_threads=n_jobs, nogil=True): - # c = 0 - # threadnum = openmp.omp_get_thread_num() - # for i in range(diag, diag + w): - # c = c + ((ts[i] - mu[diag]) * (ts[i-diag] - mu[0])) - - # for offset in range(n - w - diag + 1): - # col = offset + diag - # c = c + df[offset] * dg[col] + df[col] * dg[offset] - # c_cmp = c * sig[offset] * sig[col] - - # # update the distance profile and profile index - # if c_cmp > tmp_mp[threadnum, offset]: - # tmp_mp[threadnum, offset] = c_cmp - # tmp_mpi[threadnum, offset] = col - - # if c_cmp > tmp_mp[threadnum, col]: - # if c_cmp > 1.0: - # c_cmp = 1.0 - # tmp_mp[threadnum, col] = c_cmp - # tmp_mpi[threadnum, col] = offset - - # # combine parallel results... - # for i in range(tmp_mp.shape[0]): - # for j in range(tmp_mp.shape[1]): - # if tmp_mp[i,j] > mp[j]: - # if tmp_mp[i, j] > 1.0: - # mp[j] = 1.0 - # else: - # mp[j] = tmp_mp[i, j] - # mpi[j] = tmp_mpi[i, j] - - # # convert normalized cross correlation to euclidean distance - # if cross_correlation == 0: - # for i in range(profile_len): - # mp[i] = sqrt(2.0 * w * (1.0 - mp[i])) - return (mp, mpi)