diff --git a/Orange/base.py b/Orange/base.py index d504b65097f..748cbd3dac9 100644 --- a/Orange/base.py +++ b/Orange/base.py @@ -2,6 +2,7 @@ import itertools from collections import Iterable import re +import warnings import numpy as np import scipy @@ -13,7 +14,8 @@ from Orange.misc.wrapper_meta import WrapperMeta from Orange.preprocess import Continuize, RemoveNaNColumns, SklImpute, Normalize from Orange.statistics.util import all_nan -from Orange.util import Reprable +from Orange.util import Reprable, OrangeDeprecationWarning, wrap_callback, \ + dummy_callback __all__ = ["Learner", "Model", "SklLearner", "SklModel", "ReprableWithPreprocessors"] @@ -101,7 +103,7 @@ def fit_storage(self, data): X, Y, W = data.X, data.Y, data.W if data.has_weights() else None return self.fit(X, Y, W) - def __call__(self, data): + def __call__(self, data, progress_callback=None): if not self.check_learner_adequacy(data.domain): raise ValueError(self.learner_adequacy_err_msg) @@ -110,12 +112,26 @@ def __call__(self, data): if isinstance(data, Instance): data = Table(data.domain, [data]) origdata = data - data = self.preprocess(data) + + if progress_callback is None: + progress_callback = dummy_callback + progress_callback(0, "Preprocessing...") + try: + cb = wrap_callback(progress_callback, end=0.1) + data = self.preprocess(data, progress_callback=cb) + except TypeError: + data = self.preprocess(data) + warnings.warn("A keyword argument 'progress_callback' has been " + "added to the preprocess() signature. Implementing " + "the method without the argument is deprecated and " + "will result in an error in the future.", + OrangeDeprecationWarning) if len(data.domain.class_vars) > 1 and not self.supports_multiclass: raise TypeError("%s doesn't support multiple class variables" % self.__class__.__name__) + progress_callback(0.1, "Fitting...") model = self._fit_model(data) model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T] model.domain = data.domain @@ -123,6 +139,7 @@ def __call__(self, data): model.name = self.name model.original_domain = origdomain model.original_data = origdata + progress_callback(1) return model def _fit_model(self, data): @@ -132,10 +149,15 @@ def _fit_model(self, data): X, Y, W = data.X, data.Y, data.W if data.has_weights() else None return self.fit(X, Y, W) - def preprocess(self, data): + def preprocess(self, data, progress_callback=None): """Apply the `preprocessors` to the data""" - for pp in self.active_preprocessors: + if progress_callback is None: + progress_callback = dummy_callback + n_pps = len(list(self.active_preprocessors)) + for i, pp in enumerate(self.active_preprocessors): + progress_callback(i / n_pps) data = pp(data) + progress_callback(1) return data @property @@ -468,8 +490,8 @@ def _get_sklparams(self, values): raise TypeError("Wrapper does not define '__wraps__'") return params - def preprocess(self, data): - data = super().preprocess(data) + def preprocess(self, data, progress_callback=None): + data = super().preprocess(data, progress_callback) if any(v.is_discrete and len(v.values) > 2 for v in data.domain.attributes): @@ -478,8 +500,8 @@ def preprocess(self, data): return data - def __call__(self, data): - m = super().__call__(data) + def __call__(self, data, progress_callback=None): + m = super().__call__(data, progress_callback) m.params = self.params return m diff --git a/Orange/classification/outlier_detection.py b/Orange/classification/outlier_detection.py index 2ddb9366b9f..e7c704c6ca1 100644 --- a/Orange/classification/outlier_detection.py +++ b/Orange/classification/outlier_detection.py @@ -1,8 +1,8 @@ # pylint: disable=unused-argument +from typing import Callable + import numpy as np -from Orange.data.table import DomainTransformationError -from Orange.data.util import get_unique_names from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.neighbors import LocalOutlierFactor @@ -11,8 +11,11 @@ from Orange.base import SklLearner, SklModel from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \ Variable +from Orange.data.table import DomainTransformationError +from Orange.data.util import get_unique_names from Orange.preprocess import AdaptiveNormalize from Orange.statistics.util import all_nan +from Orange.util import wrap_callback, dummy_callback __all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner", "EllipticEnvelopeLearner", "OneClassSVMLearner"] @@ -29,29 +32,46 @@ def predict(self, X: np.ndarray) -> np.ndarray: pred[pred == -1] = 0 return pred[:, None] - def __call__(self, data: Table) -> Table: + def __call__(self, data: Table, progress_callback: Callable = None) \ + -> Table: assert isinstance(data, Table) assert self.outlier_var is not None domain = Domain(data.domain.attributes, data.domain.class_vars, data.domain.metas + (self.outlier_var,)) - self._cached_data = self.data_to_model_domain(data) + if progress_callback is None: + progress_callback = dummy_callback + progress_callback(0, "Preprocessing...") + self._cached_data = self.data_to_model_domain( + data, wrap_callback(progress_callback, end=0.1)) + progress_callback(0.1, "Predicting...") metas = np.hstack((data.metas, self.predict(self._cached_data.X))) + progress_callback(1) return Table.from_numpy(domain, data.X, data.Y, metas) - def data_to_model_domain(self, data: Table) -> Table: + def data_to_model_domain(self, data: Table, progress_callback: Callable) \ + -> Table: if data.domain == self.domain: return data + progress_callback(0) if self.original_domain.attributes != data.domain.attributes \ and data.X.size \ and not all_nan(data.X): + progress_callback(0.5) new_data = data.transform(self.original_domain) if all_nan(new_data.X): raise DomainTransformationError( "domain transformation produced no defined values") - return new_data.transform(self.domain) - return data.transform(self.domain) + progress_callback(0.75) + data = new_data.transform(self.domain) + progress_callback(1) + return data + + progress_callback(0.5) + data = data.transform(self.domain) + progress_callback(1) + return data class _OutlierLearner(SklLearner): @@ -148,8 +168,9 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray: """ return self.skl_model.mahalanobis(observations)[:, None] - def __call__(self, data: Table) -> Table: - pred = super().__call__(data) + def __call__(self, data: Table, progress_callback: Callable = None) \ + -> Table: + pred = super().__call__(data, progress_callback) domain = Domain(pred.domain.attributes, pred.domain.class_vars, pred.domain.metas + (self.mahal_var,)) metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X))) @@ -181,4 +202,3 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier: transformer.variable = variable model.mahal_var = variable return model - diff --git a/Orange/classification/tests/test_outlier_detection.py b/Orange/classification/tests/test_outlier_detection.py index 272ed93438f..dc49ad2fcce 100644 --- a/Orange/classification/tests/test_outlier_detection.py +++ b/Orange/classification/tests/test_outlier_detection.py @@ -3,12 +3,14 @@ import pickle import tempfile import unittest +from unittest.mock import Mock import numpy as np from Orange.classification import EllipticEnvelopeLearner, \ IsolationForestLearner, LocalOutlierFactorLearner, OneClassSVMLearner from Orange.data import Table, Domain, ContinuousVariable +from Orange.data.table import DomainTransformationError class _TestDetector(unittest.TestCase): @@ -207,6 +209,17 @@ def test_unique_name(self): pred = detect(table) self.assertEqual(pred.domain.metas[0].name, "Outlier (1)") + def test_predict(self): + detect = self.detector(self.iris) + subset = self.iris[:, :3] + pred = detect(subset) + self.assert_table_appended_outlier(subset, pred) + + def test_predict_all_nan(self): + detect = self.detector(self.iris[:, :2]) + subset = self.iris[:, 2:] + self.assertRaises(DomainTransformationError, detect, subset) + def test_transform(self): detect = self.detector(self.iris) pred = detect(self.iris) @@ -235,6 +248,23 @@ def test_pickle_prediction(self): pickle.dump(pred, f) f.close() + def test_fit_callback(self): + callback = Mock() + self.detector(self.iris, callback) + args = [x[0][0] for x in callback.call_args_list] + self.assertEqual(min(args), 0) + self.assertEqual(max(args), 1) + self.assertListEqual(args, sorted(args)) + + def test_predict_callback(self): + callback = Mock() + detect = self.detector(self.iris) + detect(self.iris, callback) + args = [x[0][0] for x in callback.call_args_list] + self.assertEqual(min(args), 0) + self.assertEqual(max(args), 1) + self.assertListEqual(args, sorted(args)) + if __name__ == "__main__": unittest.main() diff --git a/Orange/data/tests/test_util.py b/Orange/data/tests/test_util.py index 0006ea4773b..2df6c287be9 100644 --- a/Orange/data/tests/test_util.py +++ b/Orange/data/tests/test_util.py @@ -1,8 +1,8 @@ import unittest from Orange.data import Domain, ContinuousVariable -from Orange.data.util import \ - get_unique_names, get_unique_names_duplicates, get_unique_names_domain +from Orange.data.util import get_unique_names, get_unique_names_duplicates, \ + get_unique_names_domain class TestGetUniqueNames(unittest.TestCase): diff --git a/Orange/modelling/base.py b/Orange/modelling/base.py index 70fe22b073c..b7c2a24e10f 100644 --- a/Orange/modelling/base.py +++ b/Orange/modelling/base.py @@ -41,8 +41,8 @@ def _fit_model(self, data): X, Y, W = data.X, data.Y, data.W if data.has_weights() else None return learner.fit(X, Y, W) - def preprocess(self, data): - return self.get_learner(data).preprocess(data) + def preprocess(self, data, progress_callback=None): + return self.get_learner(data).preprocess(data, progress_callback) def get_learner(self, problem_type): """Get the learner for a given problem type. diff --git a/Orange/tests/test_base.py b/Orange/tests/test_base.py index bb4429f5d79..9b1a0462a8c 100644 --- a/Orange/tests/test_base.py +++ b/Orange/tests/test_base.py @@ -4,13 +4,19 @@ import unittest from Orange.base import SklLearner, Learner, Model -from Orange.data import Domain -from Orange.preprocess import Discretize, Randomize +from Orange.data import Domain, Table +from Orange.preprocess import Discretize, Randomize, Continuize from Orange.regression import LinearRegressionLearner class DummyLearner(Learner): - pass + def fit(self, *_, **__): + return unittest.mock.Mock() + + +class DummySklLearner(SklLearner): + def fit(self, *_, **__): + return unittest.mock.Mock() class DummyLearnerPP(Learner): @@ -71,6 +77,15 @@ def test_preprocessors_can_be_passed_in_as_generator(self): 'Preprocessors should be able to be passed in as single object ' 'as well as an iterable object') + def test_callback(self): + callback = unittest.mock.Mock() + learner = DummyLearner(preprocessors=[Discretize(), Randomize()]) + learner(Table("iris"), callback) + args = [x[0][0] for x in callback.call_args_list] + self.assertEqual(min(args), 0) + self.assertEqual(max(args), 1) + self.assertListEqual(args, sorted(args)) + class TestSklLearner(unittest.TestCase): def test_sklearn_supports_weights(self): @@ -101,6 +116,15 @@ def test_linreg(self): "Either LinearRegression no longer supports weighted tables or " "SklLearner.supports_weights is out-of-date.") + def test_callback(self): + callback = unittest.mock.Mock() + learner = DummySklLearner(preprocessors=[Continuize(), Randomize()]) + learner(Table("iris"), callback) + args = [x[0][0] for x in callback.call_args_list] + self.assertEqual(min(args), 0) + self.assertEqual(max(args), 1) + self.assertListEqual(args, sorted(args)) + class TestModel(unittest.TestCase): def test_pickle(self): @@ -111,3 +135,7 @@ def test_pickle(self): self.assertEqual(model.domain, model2.domain) self.assertEqual(model.original_data, [1, 2, 3]) self.assertEqual(model2.original_data, None) + + +if __name__ == "__main__": + unittest.main() diff --git a/Orange/tests/test_util.py b/Orange/tests/test_util.py index d9862343f60..eaba6448f27 100644 --- a/Orange/tests/test_util.py +++ b/Orange/tests/test_util.py @@ -11,6 +11,7 @@ from Orange.data.util import vstack, hstack, array_equal from Orange.statistics.util import stats from Orange.tests.test_statistics import dense_sparse +from Orange.util import wrap_callback SOMETHING = 0xf00babe @@ -158,3 +159,21 @@ def test_csc_unordered_array_equal(self): a1 = sp.csc_matrix(([1, 4, 5], [0, 0, 1], [0, 1, 1, 3]), shape=(2, 3)) a2 = sp.csc_matrix(([1, 5, 4], [0, 1, 0], [0, 1, 1, 3]), shape=(2, 3)) self.assertTrue(array_equal(a1, a2)) + + def test_wrap_callback(self): + def func(i): + return i + + f = wrap_callback(func, start=0, end=0.8) + self.assertEqual(f(0), 0) + self.assertEqual(round(f(0.1), 2), 0.08) + self.assertEqual(f(1), 0.8) + + f = wrap_callback(func, start=0.1, end=0.8) + self.assertEqual(f(0), 0.1) + self.assertEqual(f(0.1), 0.17) + self.assertEqual(f(1), 0.8) + + +if __name__ == "__main__": + unittest.main() diff --git a/Orange/util.py b/Orange/util.py index 9a62cbcf6f6..1c8983610fa 100644 --- a/Orange/util.py +++ b/Orange/util.py @@ -416,6 +416,29 @@ def __repr__(self): name, ", ".join("{}={!r}".format(f, v) for f, _, v in self._reprable_items()) ) + +def wrap_callback(progress_callback, start=0, end=1): + """ + Wraps a progress callback function to allocate it end-start proportion + of an execution time. + + :param progress_callback: callable + :param start: float + :param end: float + :return: callable + """ + @wraps(progress_callback) + def func(progress, *args, **kwargs): + adjusted_progress = start + progress * (end - start) + return progress_callback(adjusted_progress, *args, **kwargs) + return func + + +def dummy_callback(*_, **__): + """ A dummy callable. """ + return 1 + + # For best result, keep this at the bottom __all__ = export_globals(globals(), __name__) diff --git a/Orange/widgets/data/owoutliers.py b/Orange/widgets/data/owoutliers.py index 84dac78cd89..5a1c76142a9 100644 --- a/Orange/widgets/data/owoutliers.py +++ b/Orange/widgets/data/owoutliers.py @@ -1,4 +1,5 @@ from typing import Dict, Tuple +from types import SimpleNamespace import numpy as np @@ -7,16 +8,52 @@ from orangewidget.settings import SettingProvider +from Orange.base import Learner from Orange.classification import OneClassSVMLearner, EllipticEnvelopeLearner,\ LocalOutlierFactorLearner, IsolationForestLearner from Orange.data import Table +from Orange.util import wrap_callback from Orange.widgets import gui from Orange.widgets.settings import Setting +from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin from Orange.widgets.utils.sql import check_sql_input from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.widget import Msg, Input, Output, OWWidget +class Results(SimpleNamespace): + inliers = None # type: Optional[Table] + outliers = None # type: Optional[Table] + annotated_data = None # type: Optional[Table] + + +def run(data: Table, learner: Learner, state: TaskState) -> Results: + results = Results() + if not data: + return results + + def callback(i: float, status=""): + state.set_progress_value(i * 100) + if status: + state.set_status(status) + if state.is_interruption_requested(): + raise Exception + + callback(0, "Initializing...") + model = learner(data, wrap_callback(callback, end=0.6)) + pred = model(data, wrap_callback(callback, start=0.6, end=0.99)) + + col = pred.get_column_view(model.outlier_var)[0] + inliers_ind = np.where(col == 1)[0] + outliers_ind = np.where(col == 0)[0] + + results.inliers = data[inliers_ind] + results.outliers = data[outliers_ind] + results.annotated_data = pred + callback(1) + return results + + class ParametersEditor(QWidget, gui.OWComponent): param_changed = Signal() @@ -132,7 +169,7 @@ def get_parameters(self): "random_state": 42 if self.replicable else None} -class OWOutliers(OWWidget): +class OWOutliers(OWWidget, ConcurrentWidgetMixin): name = "Outliers" description = "Detect outliers." icon = "icons/Outliers.svg" @@ -173,7 +210,8 @@ class Error(OWWidget.Error): memory_error = Msg("Not enough memory") def __init__(self): - super().__init__() + OWWidget.__init__(self) + ConcurrentWidgetMixin.__init__(self) self.data = None # type: Table self.n_inliers = None # type: int self.n_outliers = None # type: int @@ -224,6 +262,7 @@ def set_current_editor(self): @Inputs.data @check_sql_input def set_data(self, data): + self.cancel() self.clear_messages() self.data = data self.info.set_input_summary(len(data) if data else self.info.NoOutput) @@ -239,38 +278,41 @@ def enable_controls(self): self.Warning.disabled_cov() def commit(self): - inliers, outliers, data = self.detect_outliers() + self.Error.singular_cov.clear() + self.Error.memory_error.clear() + self.n_inliers = self.n_outliers = None + + learner_class = self.METHODS[self.outlier_method] + kwargs = self.current_editor.get_parameters() + learner = learner_class(**kwargs) + + self.start(run, self.data, learner) + + def on_partial_result(self, _): + pass + + def on_done(self, result: Results): + inliers, outliers = result.inliers, result.outliers summary = len(inliers) if inliers else self.info.NoOutput self.info.set_output_summary(summary) + self.n_inliers = len(inliers) if inliers else None + self.n_outliers = len(outliers) if outliers else None + self.Outputs.inliers.send(inliers) self.Outputs.outliers.send(outliers) - self.Outputs.data.send(data) + self.Outputs.data.send(result.annotated_data) - def detect_outliers(self) -> Tuple[Table, Table, Table]: - self.n_inliers = self.n_outliers = None - self.Error.singular_cov.clear() - self.Error.memory_error.clear() - if not self.data: - return None, None, None - try: - learner_class = self.METHODS[self.outlier_method] - kwargs = self.current_editor.get_parameters() - learner = learner_class(**kwargs) - model = learner(self.data) - pred = model(self.data) - except ValueError: - self.Error.singular_cov() - return None, None, None - except MemoryError: + def on_exception(self, ex): + if isinstance(ex, ValueError): + self.Error.singular_cov(ex) + elif isinstance(ex, MemoryError): self.Error.memory_error() - return None, None, None else: - col = pred[:, model.outlier_var].metas - inliers_ind = np.where(col == 1)[0] - outliers_ind = np.where(col == 0)[0] - self.n_inliers = len(inliers_ind) - self.n_outliers = len(outliers_ind) - return self.data[inliers_ind], self.data[outliers_ind], pred + raise ex + + def onDeleteWidget(self): + self.shutdown() + super().onDeleteWidget() def send_report(self): if self.n_outliers is None or self.n_inliers is None: diff --git a/Orange/widgets/data/tests/test_owoutliers.py b/Orange/widgets/data/tests/test_owoutliers.py index 2b351c2e06f..f0c095ade09 100644 --- a/Orange/widgets/data/tests/test_owoutliers.py +++ b/Orange/widgets/data/tests/test_owoutliers.py @@ -5,10 +5,36 @@ from unittest.mock import patch, Mock from Orange.data import Table -from Orange.widgets.data.owoutliers import OWOutliers +from Orange.classification import LocalOutlierFactorLearner +from Orange.widgets.data.owoutliers import OWOutliers, run from Orange.widgets.tests.base import WidgetTest, simulate +class TestRun(unittest.TestCase): + def test_results(self): + iris = Table("iris") + state = Mock() + state.is_interruption_requested = Mock(return_value=False) + res = run(iris, LocalOutlierFactorLearner(), state) + self.assertIsInstance(res.inliers, Table) + self.assertIsInstance(res.outliers, Table) + self.assertIsInstance(res.annotated_data, Table) + + self.assertEqual(iris.domain, res.inliers.domain) + self.assertEqual(iris.domain, res.outliers.domain) + self.assertIn("Outlier", res.annotated_data.domain) + + self.assertEqual(len(res.inliers), 145) + self.assertEqual(len(res.outliers), 5) + self.assertEqual(len(res.annotated_data), 150) + + def test_no_data(self): + res = run(None, LocalOutlierFactorLearner(), Mock()) + self.assertIsNone(res.inliers) + self.assertIsNone(res.outliers) + self.assertIsNone(res.annotated_data) + + class TestOWOutliers(WidgetTest): def setUp(self): self.widget = self.create_widget(OWOutliers) @@ -82,6 +108,7 @@ def test_memory_error(self, mocked_predict: Mock): self.assertFalse(self.widget.Error.memory_error.is_shown()) mocked_predict.side_effect = MemoryError self.send_signal(self.widget.Inputs.data, self.iris) + self.wait_until_finished() self.assertTrue(self.widget.Error.memory_error.is_shown()) @patch("Orange.classification.outlier_detection._OutlierModel.predict") @@ -89,6 +116,7 @@ def test_singular_cov_error(self, mocked_predict: Mock): self.assertFalse(self.widget.Error.singular_cov.is_shown()) mocked_predict.side_effect = ValueError self.send_signal(self.widget.Inputs.data, self.iris) + self.wait_until_finished() self.assertTrue(self.widget.Error.singular_cov.is_shown()) def test_nans(self): @@ -106,10 +134,12 @@ def test_in_out_summary(self): self.assertEqual(info._StateInfo__output_summary.brief, "") self.send_signal(self.widget.Inputs.data, self.iris) + self.wait_until_finished() self.assertEqual(info._StateInfo__input_summary.brief, "150") self.assertEqual(info._StateInfo__output_summary.brief, "135") self.send_signal(self.widget.Inputs.data, None) + self.wait_until_finished() self.assertEqual(info._StateInfo__input_summary.brief, "") self.assertEqual(info._StateInfo__output_summary.brief, "") @@ -133,6 +163,18 @@ def test_covariance_enabled(self): self.assertFalse(self.widget.Warning.disabled_cov.is_shown()) self.assertTrue(cov_item.isEnabled()) + @patch("Orange.widgets.data.owoutliers.OWOutliers.report_items") + def test_report(self, mocked_report: Mock): + self.send_signal(self.widget.Inputs.data, self.iris) + self.wait_until_finished() + self.widget.send_report() + mocked_report.assert_called() + mocked_report.reset_mock() + + self.send_signal(self.widget.Inputs.data, None) + self.widget.send_report() + mocked_report.assert_not_called() + def test_migrate_settings(self): settings = {"cont": 20, "empirical_covariance": True, "gamma": 0.04, "nu": 30, "outlier_method": 0,