Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Outliers: Offload work onto separate thread #4412

Merged
merged 6 commits into from
Feb 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 31 additions & 9 deletions Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import itertools
from collections import Iterable
import re
import warnings

import numpy as np
import scipy
Expand All @@ -13,7 +14,8 @@
from Orange.misc.wrapper_meta import WrapperMeta
from Orange.preprocess import Continuize, RemoveNaNColumns, SklImpute, Normalize
from Orange.statistics.util import all_nan
from Orange.util import Reprable
from Orange.util import Reprable, OrangeDeprecationWarning, wrap_callback, \
dummy_callback

__all__ = ["Learner", "Model", "SklLearner", "SklModel",
"ReprableWithPreprocessors"]
Expand Down Expand Up @@ -101,7 +103,7 @@ def fit_storage(self, data):
X, Y, W = data.X, data.Y, data.W if data.has_weights() else None
return self.fit(X, Y, W)

def __call__(self, data):
def __call__(self, data, progress_callback=None):
if not self.check_learner_adequacy(data.domain):
raise ValueError(self.learner_adequacy_err_msg)

Expand All @@ -110,19 +112,34 @@ def __call__(self, data):
if isinstance(data, Instance):
data = Table(data.domain, [data])
origdata = data
data = self.preprocess(data)

if progress_callback is None:
progress_callback = dummy_callback
progress_callback(0, "Preprocessing...")
try:
cb = wrap_callback(progress_callback, end=0.1)
data = self.preprocess(data, progress_callback=cb)
except TypeError:
data = self.preprocess(data)
warnings.warn("A keyword argument 'progress_callback' has been "
"added to the preprocess() signature. Implementing "
"the method without the argument is deprecated and "
"will result in an error in the future.",
OrangeDeprecationWarning)

if len(data.domain.class_vars) > 1 and not self.supports_multiclass:
raise TypeError("%s doesn't support multiple class variables" %
self.__class__.__name__)

progress_callback(0.1, "Fitting...")
model = self._fit_model(data)
model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
model.domain = data.domain
model.supports_multiclass = self.supports_multiclass
model.name = self.name
model.original_domain = origdomain
model.original_data = origdata
progress_callback(1)
return model

def _fit_model(self, data):
Expand All @@ -132,10 +149,15 @@ def _fit_model(self, data):
X, Y, W = data.X, data.Y, data.W if data.has_weights() else None
return self.fit(X, Y, W)

def preprocess(self, data):
def preprocess(self, data, progress_callback=None):
"""Apply the `preprocessors` to the data"""
for pp in self.active_preprocessors:
if progress_callback is None:
progress_callback = dummy_callback
n_pps = len(list(self.active_preprocessors))
for i, pp in enumerate(self.active_preprocessors):
progress_callback(i / n_pps)
data = pp(data)
progress_callback(1)
return data

@property
Expand Down Expand Up @@ -468,8 +490,8 @@ def _get_sklparams(self, values):
raise TypeError("Wrapper does not define '__wraps__'")
return params

def preprocess(self, data):
data = super().preprocess(data)
def preprocess(self, data, progress_callback=None):
data = super().preprocess(data, progress_callback)

if any(v.is_discrete and len(v.values) > 2
for v in data.domain.attributes):
Expand All @@ -478,8 +500,8 @@ def preprocess(self, data):

return data

def __call__(self, data):
m = super().__call__(data)
def __call__(self, data, progress_callback=None):
m = super().__call__(data, progress_callback)
m.params = self.params
return m

Expand Down
40 changes: 30 additions & 10 deletions Orange/classification/outlier_detection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# pylint: disable=unused-argument
from typing import Callable

import numpy as np

from Orange.data.table import DomainTransformationError
from Orange.data.util import get_unique_names
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
Expand All @@ -11,8 +11,11 @@
from Orange.base import SklLearner, SklModel
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
Variable
from Orange.data.table import DomainTransformationError
from Orange.data.util import get_unique_names
from Orange.preprocess import AdaptiveNormalize
from Orange.statistics.util import all_nan
from Orange.util import wrap_callback, dummy_callback

__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
"EllipticEnvelopeLearner", "OneClassSVMLearner"]
Expand All @@ -29,29 +32,46 @@ def predict(self, X: np.ndarray) -> np.ndarray:
pred[pred == -1] = 0
return pred[:, None]

def __call__(self, data: Table) -> Table:
def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
assert isinstance(data, Table)
assert self.outlier_var is not None

domain = Domain(data.domain.attributes, data.domain.class_vars,
data.domain.metas + (self.outlier_var,))
self._cached_data = self.data_to_model_domain(data)
if progress_callback is None:
progress_callback = dummy_callback
progress_callback(0, "Preprocessing...")
self._cached_data = self.data_to_model_domain(
data, wrap_callback(progress_callback, end=0.1))
progress_callback(0.1, "Predicting...")
metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
progress_callback(1)
return Table.from_numpy(domain, data.X, data.Y, metas)

def data_to_model_domain(self, data: Table) -> Table:
def data_to_model_domain(self, data: Table, progress_callback: Callable) \
-> Table:
if data.domain == self.domain:
return data

progress_callback(0)
if self.original_domain.attributes != data.domain.attributes \
and data.X.size \
and not all_nan(data.X):
progress_callback(0.5)
new_data = data.transform(self.original_domain)
if all_nan(new_data.X):
raise DomainTransformationError(
"domain transformation produced no defined values")
return new_data.transform(self.domain)
return data.transform(self.domain)
progress_callback(0.75)
data = new_data.transform(self.domain)
progress_callback(1)
return data

progress_callback(0.5)
data = data.transform(self.domain)
progress_callback(1)
return data


class _OutlierLearner(SklLearner):
Expand Down Expand Up @@ -148,8 +168,9 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
"""
return self.skl_model.mahalanobis(observations)[:, None]

def __call__(self, data: Table) -> Table:
pred = super().__call__(data)
def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
pred = super().__call__(data, progress_callback)
domain = Domain(pred.domain.attributes, pred.domain.class_vars,
pred.domain.metas + (self.mahal_var,))
metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X)))
Expand Down Expand Up @@ -181,4 +202,3 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
transformer.variable = variable
model.mahal_var = variable
return model

30 changes: 30 additions & 0 deletions Orange/classification/tests/test_outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import pickle
import tempfile
import unittest
from unittest.mock import Mock

import numpy as np

from Orange.classification import EllipticEnvelopeLearner, \
IsolationForestLearner, LocalOutlierFactorLearner, OneClassSVMLearner
from Orange.data import Table, Domain, ContinuousVariable
from Orange.data.table import DomainTransformationError


class _TestDetector(unittest.TestCase):
Expand Down Expand Up @@ -207,6 +209,17 @@ def test_unique_name(self):
pred = detect(table)
self.assertEqual(pred.domain.metas[0].name, "Outlier (1)")

# Fit the detector on the full iris table, then run the fitted model on a
# column-sliced view (`[:, :3]`) of the same table, i.e. data whose domain
# differs from the training domain.
# The check itself is delegated to assert_table_appended_outlier, which is
# defined outside this view — presumably it verifies that the prediction is
# the input table with an outlier column appended; confirm against the helper.
def test_predict(self):
detect = self.detector(self.iris)
subset = self.iris[:, :3]
pred = detect(subset)
self.assert_table_appended_outlier(subset, pred)

# Fit on the `[:, :2]` column slice of iris and predict on the `[:, 2:]`
# slice; calling the fitted model on that data is expected to raise
# DomainTransformationError.
def test_predict_all_nan(self):
detect = self.detector(self.iris[:, :2])
subset = self.iris[:, 2:]
self.assertRaises(DomainTransformationError, detect, subset)

def test_transform(self):
detect = self.detector(self.iris)
pred = detect(self.iris)
Expand Down Expand Up @@ -235,6 +248,23 @@ def test_pickle_prediction(self):
pickle.dump(pred, f)
f.close()

# Fitting with a progress callback: pass a Mock as the second positional
# argument when calling the detector on iris, then inspect what it received.
def test_fit_callback(self):
callback = Mock()
self.detector(self.iris, callback)
# Each call_args_list entry is (args, kwargs); x[0][0] is the first
# positional argument of that call, i.e. the reported progress value.
args = [x[0][0] for x in callback.call_args_list]
# Reported progress must start at 0, reach 1, and never decrease.
self.assertEqual(min(args), 0)
self.assertEqual(max(args), 1)
self.assertListEqual(args, sorted(args))

# Prediction with a progress callback: the model is fitted without a
# callback, and the Mock is passed only to the prediction call.
def test_predict_callback(self):
callback = Mock()
detect = self.detector(self.iris)
detect(self.iris, callback)
# Each call_args_list entry is (args, kwargs); x[0][0] is the first
# positional argument of that call, i.e. the reported progress value.
args = [x[0][0] for x in callback.call_args_list]
# Reported progress must start at 0, reach 1, and never decrease.
self.assertEqual(min(args), 0)
self.assertEqual(max(args), 1)
self.assertListEqual(args, sorted(args))


if __name__ == "__main__":
unittest.main()
4 changes: 2 additions & 2 deletions Orange/data/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import unittest

from Orange.data import Domain, ContinuousVariable
from Orange.data.util import \
get_unique_names, get_unique_names_duplicates, get_unique_names_domain
from Orange.data.util import get_unique_names, get_unique_names_duplicates, \
get_unique_names_domain


class TestGetUniqueNames(unittest.TestCase):
Expand Down
4 changes: 2 additions & 2 deletions Orange/modelling/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def _fit_model(self, data):
X, Y, W = data.X, data.Y, data.W if data.has_weights() else None
return learner.fit(X, Y, W)

def preprocess(self, data):
return self.get_learner(data).preprocess(data)
def preprocess(self, data, progress_callback=None):
return self.get_learner(data).preprocess(data, progress_callback)

def get_learner(self, problem_type):
"""Get the learner for a given problem type.
Expand Down
34 changes: 31 additions & 3 deletions Orange/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
import unittest

from Orange.base import SklLearner, Learner, Model
from Orange.data import Domain
from Orange.preprocess import Discretize, Randomize
from Orange.data import Domain, Table
from Orange.preprocess import Discretize, Randomize, Continuize
from Orange.regression import LinearRegressionLearner


class DummyLearner(Learner):
pass
def fit(self, *_, **__):
return unittest.mock.Mock()


# Test double: an SklLearner subclass whose fit() ignores all arguments and
# returns a Mock in place of a fitted model.
# NOTE(review): only `import unittest` is visible in this hunk; accessing
# `unittest.mock` as an attribute requires the submodule to have been
# imported somewhere — confirm.
class DummySklLearner(SklLearner):
def fit(self, *_, **__):
return unittest.mock.Mock()


class DummyLearnerPP(Learner):
Expand Down Expand Up @@ -71,6 +77,15 @@ def test_preprocessors_can_be_passed_in_as_generator(self):
'Preprocessors should be able to be passed in as single object '
'as well as an iterable object')

# Call a DummyLearner configured with two preprocessors (Discretize,
# Randomize) on the iris table, passing a Mock as the progress callback.
def test_callback(self):
callback = unittest.mock.Mock()
learner = DummyLearner(preprocessors=[Discretize(), Randomize()])
learner(Table("iris"), callback)
# Each call_args_list entry is (args, kwargs); x[0][0] is the first
# positional argument of that call, i.e. the reported progress value.
args = [x[0][0] for x in callback.call_args_list]
# Reported progress must start at 0, reach 1, and never decrease.
self.assertEqual(min(args), 0)
self.assertEqual(max(args), 1)
self.assertListEqual(args, sorted(args))


class TestSklLearner(unittest.TestCase):
def test_sklearn_supports_weights(self):
Expand Down Expand Up @@ -101,6 +116,15 @@ def test_linreg(self):
"Either LinearRegression no longer supports weighted tables or "
"SklLearner.supports_weights is out-of-date.")

# Call a DummySklLearner configured with two preprocessors (Continuize,
# Randomize) on the iris table, passing a Mock as the progress callback.
def test_callback(self):
callback = unittest.mock.Mock()
learner = DummySklLearner(preprocessors=[Continuize(), Randomize()])
learner(Table("iris"), callback)
# Each call_args_list entry is (args, kwargs); x[0][0] is the first
# positional argument of that call, i.e. the reported progress value.
args = [x[0][0] for x in callback.call_args_list]
# Reported progress must start at 0, reach 1, and never decrease.
self.assertEqual(min(args), 0)
self.assertEqual(max(args), 1)
self.assertListEqual(args, sorted(args))


class TestModel(unittest.TestCase):
def test_pickle(self):
Expand All @@ -111,3 +135,7 @@ def test_pickle(self):
self.assertEqual(model.domain, model2.domain)
self.assertEqual(model.original_data, [1, 2, 3])
self.assertEqual(model2.original_data, None)


if __name__ == "__main__":
unittest.main()
19 changes: 19 additions & 0 deletions Orange/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from Orange.data.util import vstack, hstack, array_equal
from Orange.statistics.util import stats
from Orange.tests.test_statistics import dense_sparse
from Orange.util import wrap_callback

SOMETHING = 0xf00babe

Expand Down Expand Up @@ -158,3 +159,21 @@ def test_csc_unordered_array_equal(self):
a1 = sp.csc_matrix(([1, 4, 5], [0, 0, 1], [0, 1, 1, 3]), shape=(2, 3))
a2 = sp.csc_matrix(([1, 5, 4], [0, 1, 0], [0, 1, 1, 3]), shape=(2, 3))
self.assertTrue(array_equal(a1, a2))

def test_wrap_callback(self):
    """wrap_callback rescales progress linearly into [start, end]."""
    def func(i):
        # Identity callback: returns the adjusted progress it was handed,
        # so the wrapper's return value exposes the rescaled number.
        return i

    f = wrap_callback(func, start=0, end=0.8)
    self.assertEqual(f(0), 0)
    # The rescaled values come out of binary floating-point arithmetic,
    # so compare with a tolerance rather than for exact equality.
    self.assertAlmostEqual(f(0.1), 0.08)
    self.assertAlmostEqual(f(1), 0.8)

    f = wrap_callback(func, start=0.1, end=0.8)
    self.assertAlmostEqual(f(0), 0.1)
    self.assertAlmostEqual(f(0.1), 0.17)
    self.assertAlmostEqual(f(1), 0.8)


if __name__ == "__main__":
unittest.main()
23 changes: 23 additions & 0 deletions Orange/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,29 @@ def __repr__(self):
name, ", ".join("{}={!r}".format(f, v) for f, _, v in self._reprable_items())
)


def wrap_callback(progress_callback, start=0, end=1):
    """
    Wraps a progress callback function to allocate it end-start proportion
    of an execution time.

    The returned function takes a progress value as its first argument,
    maps it linearly to ``start + progress * (end - start)`` and forwards
    the result, together with any further positional and keyword arguments,
    to ``progress_callback``. Whatever ``progress_callback`` returns is
    returned unchanged.

    :param progress_callback: callable
    :param start: float
    :param end: float
    :return: callable
    """
    @wraps(progress_callback)
    def func(progress, *args, **kwargs):
        adjusted_progress = start + progress * (end - start)
        return progress_callback(adjusted_progress, *args, **kwargs)

    return func


def dummy_callback(*_, **__):
    """ A dummy callable: accepts and ignores any arguments, returns 1. """
    return 1


# For best result, keep this at the bottom
__all__ = export_globals(globals(), __name__)

Expand Down
Loading