Compatibility with string data types #74

Open · wants to merge 3 commits into master
29 changes: 23 additions & 6 deletions matrixprofile/algorithms/mpx.py
@@ -14,6 +14,9 @@

from matrixprofile import core
from matrixprofile.algorithms.cympx import mpx_parallel as cympx_parallel
# --- My import
from matrixprofile.algorithms.mpx_char import mpx_single_char
# ---
from matrixprofile.algorithms.cympx import mpx_ab_parallel as cympx_ab_parallel


@@ -61,23 +64,37 @@ def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
>>> }

"""
ts = core.to_np_array(ts).astype('d')
# --- Drew's addition ---
dtype = core.get_dtype(ts)
ts = core.to_np_array(ts).astype(dtype)
#ts = core.to_np_array(ts).astype('d')
n_jobs = core.valid_n_jobs(n_jobs)
is_join = False

if core.is_array_like(query):
query = core.to_np_array(query).astype('d')
query = core.to_np_array(query).astype(dtype)
#query = core.to_np_array(query).astype('d')
is_join = True
mp, mpi, mpb, mpib = cympx_ab_parallel(ts, query, w,
int(cross_correlation), n_jobs)
else:
mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
# --- More changes... ---
if np.issubdtype(dtype, 'U'):
#ts = np.array([ord(x) for x in ts], dtype = 'd')
mp, mpi = mpx_single_char(ts, w)
else:
mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)
# --- That's it for now... ---
#mp, mpi = cympx_parallel(ts, w, int(cross_correlation), n_jobs)

mp = np.asarray(mp)
mpi = np.asarray(mpi)
distance_metric = 'euclidean'
if cross_correlation:
distance_metric = 'cross_correlation'
if np.issubdtype(dtype, 'U'):
distance_metric = 'hamming'
else:
distance_metric = 'euclidean'
if cross_correlation:
distance_metric = 'cross_correlation'

return {
'mp': mp,
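For reference, a minimal usage sketch of the string path added above. It assumes this branch is installed and that the core.get_dtype helper referenced in the diff is available; the toy series is purely illustrative.

import numpy as np
from matrixprofile.algorithms.mpx import mpx

# A character series stored as an array of single characters (dtype '<U1').
seq = np.array(list("abcabcabcxyzabc"))

# With a string dtype, mpx should route to mpx_single_char instead of the
# Cython kernels and expose the distance profile under the usual 'mp' key.
profile = mpx(seq, w=3)
print(profile['mp'])   # windows that repeat exactly (e.g. 'abc') should come out as 0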
119 changes: 119 additions & 0 deletions matrixprofile/algorithms/mpx_char.py
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 20 12:48:43 2021

@author: awilkins
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
#import math
from numba import jit, prange


#@jit
def mpx_single_char(ts, w):
"""
    The MPX algorithm computes the matrix profile for string data using edit (Levenshtein) distance.

Parameters
----------
ts : array_like
The time series to compute the matrix profile for.
w : int
The window size.

Returns
-------
(array_like, array_like) :
The matrix profile (distance profile, profile index).

"""
n = ts.shape[0]

profile_len = n - w + 1

mp = np.full(profile_len, -1.0, dtype = 'd')
mpi = np.full(profile_len, -1, dtype = 'int')
# Iterate over every starting location
for i in range(w, n + 1):
# Select the next 'w' indices starting at ts[i - w]
source = ts[i - w: i]
dist = np.inf
# Slide the starting location
for j in range(w, n + 1):
# Make sure not to compare with itself
if j == i:
continue
# Select the next 'w' indices starting at ts[j - w]
target = ts[j - w: j]
            # Measure the edit (Levenshtein) distance between the two windows
tmp_dist = editDistance(target, source)
# If it beats the best so far, update mp and mpi
if tmp_dist < dist:
dist = tmp_dist
mp[i - w] = dist
mpi[i - w] = j - w
            # Stop early once an exact match is found
if dist == 0:
break

return (mp, mpi)



def editDistance(target, source):
"""
Returns the Levenshtein distance between two strings or vectors of strings.

Parameters
----------
target : str, np.ndarray
String or vector to be compared against source.
source : str, np.ndarray
String or vector to be compared to target. len(source) should be
greater than or equal to len(target).

Returns
-------
distance : int
Levenshtein distance between target and source.

"""

    # Ensure source is the longer of the two sequences by swapping the
    # arguments; re-calling with the original order would recurse forever.
    if len(source) < len(target):
        return editDistance(source, target)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)

# We call tuple() to force strings to be used as sequences
source = np.array(tuple(source))
target = np.array(tuple(target))

# We use a dynamic programming algorithm, but with the added optimization
# that we only need the last two rows of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1

# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))

# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)

previous_row = current_row

return previous_row[-1]
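As a quick sanity check (illustrative only), editDistance should reproduce the standard Levenshtein values, and mpx_single_char should return an all-zero profile for a series in which every window repeats exactly:

import numpy as np
from matrixprofile.algorithms.mpx_char import editDistance, mpx_single_char

print(editDistance("kitten", "sitting"))   # 3: two substitutions and one insertion
print(editDistance("abc", "abc"))          # 0: identical sequences
print(editDistance("", "abc"))             # 3: three insertions

# Every length-3 window of "abcabcabc" has an exact duplicate elsewhere,
# so each profile entry should be 0.
seq = np.array(list("abcabcabc"))
mp, mpi = mpx_single_char(seq, 3)
print(mp)   # expected: [0. 0. 0. 0. 0. 0. 0.]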
22 changes: 15 additions & 7 deletions matrixprofile/algorithms/skimp.py
@@ -224,10 +224,16 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False,
if int_pct % 5 == 0 and int_pct not in pct_shown:
print('{}% complete'.format(int_pct))
pct_shown[int_pct] = 1

metric = 'euclidean'
if cross_correlation:
metric = 'pearson'

if np.issubdtype(ts.dtype, 'U'):
metric = 'hamming'
else:
metric = 'euclidean'
if cross_correlation:
metric = 'pearson'
# metric = 'euclidean'
# if cross_correlation:
# metric = 'pearson'

return {
'pmp': pmp,
@@ -244,13 +250,13 @@ def skimp(ts, windows=None, show_progress=False, cross_correlation=False,


def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1,
include_pmp=False, lower_window=8):
include_pmp=False, lower_window=4):
"""
Finds the maximum subsequence length based on the threshold provided. Note
    that this threshold is domain specific, requiring some knowledge about the
    underlying time series in question.

The subsequence length starts at 8 and iteratively doubles until the
The subsequence length starts at 4 and iteratively doubles until the
maximum correlation coefficient is no longer met. When no solution is
possible given the threshold, a matrixprofile.exceptions.NoSolutionPossible
exception is raised.
@@ -303,7 +309,9 @@ def maximum_subsequence(ts, threshold=0.95, refine_stepsize=0.05, n_jobs=1,
def resize(mp, pi, n):
"""Helper function to resize mp and pi to be aligned with the
PMP. Also convert pearson to euclidean."""
mp = core.pearson_to_euclidean(profile['mp'], window_size)
# Only convert pearson to euclidean if not string data type
if not np.issubdtype(ts.dtype, 'U'):
mp = core.pearson_to_euclidean(profile['mp'], window_size)
infs = np.full(n - mp.shape[0], np.inf)
nans = np.full(n - mp.shape[0], np.nan)
mp = np.append(mp, infs)
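The metric selection added to skimp above reduces to a dtype check. Below is a standalone sketch of the same logic; pick_metric is just an illustrative stand-in, not part of the library, and np.str_ is the documented abstract type behind the 'U' character code used in the diff.

import numpy as np

def pick_metric(ts, cross_correlation=False):
    # String (unicode) series get the new 'hamming' label; numeric series keep
    # the existing euclidean / pearson behaviour.
    if np.issubdtype(ts.dtype, np.str_):
        return 'hamming'
    return 'pearson' if cross_correlation else 'euclidean'

print(pick_metric(np.array(list("abca"))))        # 'hamming'
print(pick_metric(np.array([1.0, 2.0]), True))    # 'pearson'
print(pick_metric(np.array([1.0, 2.0])))          # 'euclidean'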
3 changes: 2 additions & 1 deletion matrixprofile/analyze.py
@@ -65,7 +65,8 @@ def analyze_pmp(ts, query, sample_pct, threshold, windows=None, n_jobs=1):

# determine windows to be computed
# from 8 in steps of 2 until upper w
start = 8
start = 4
#start = 8
windows = range(start, profile['upper_window'] + 1)

# compute the pmp
3 changes: 2 additions & 1 deletion matrixprofile/compute.py
@@ -118,7 +118,8 @@ def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,

# determine windows to be computed
# from 8 in steps of 2 until upper w
start = 8
start = 4
#start = 8
windows = range(start, profile['upper_window'] + 1)

# compute the pmp