diff --git a/Orange/widgets/unsupervised/icons/DBSCAN.svg b/Orange/widgets/unsupervised/icons/DBSCAN.svg
new file mode 100644
index 00000000000..6efec89a4b7
--- /dev/null
+++ b/Orange/widgets/unsupervised/icons/DBSCAN.svg
@@ -0,0 +1,53 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Orange/widgets/unsupervised/owdbscan.py b/Orange/widgets/unsupervised/owdbscan.py
new file mode 100644
index 00000000000..2b1d8a55ca1
--- /dev/null
+++ b/Orange/widgets/unsupervised/owdbscan.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+from AnyQt.QtWidgets import QLayout
+from AnyQt.QtCore import Qt
+
+from Orange.widgets import widget, gui
+from Orange.widgets.settings import Setting
+from Orange.data import Table, Domain, DiscreteVariable
+from Orange.clustering import DBSCAN
+from Orange import distance
+from Orange.widgets.utils.annotated_data import ANNOTATED_DATA_SIGNAL_NAME
+from Orange.widgets.utils.signals import Input, Output
+from Orange.widgets.widget import Msg
+
+
+class OWDBSCAN(widget.OWWidget):
+    """Cluster the input table with DBSCAN and annotate its rows.
+
+    The output is the input table with two extra meta columns: "Cluster"
+    (missing for rows with a negative label) and "DBSCAN Core" (1 for
+    rows listed among the model's core samples, 0 otherwise).
+    """
+
+    name = "DBSCAN"
+    description = "Density-based spatial clustering."
+    icon = "icons/DBSCAN.svg"
+    priority = 2150
+
+    class Inputs:
+        data = Input("Data", Table)
+
+    class Outputs:
+        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
+
+    class Error(widget.OWWidget.Error):
+        not_enough_instances = Msg("Not enough unique data instances. "
+                                   "At least two are required.")
+
+    # Pairs of (label shown in the combo box, metric passed to DBSCAN).
+    METRICS = [
+        ("Euclidean", "euclidean"),
+        ("Manhattan", "manhattan"),
+        ("Cosine", distance.Cosine),
+        ("Jaccard", distance.Jaccard),
+        # ("Spearman", distance.SpearmanR),
+        # ("Spearman absolute", distance.SpearmanRAbsolute),
+        # ("Pearson", distance.PearsonR),
+        # ("Pearson absolute", distance.PearsonRAbsolute),
+    ]
+
+    min_samples = Setting(5)
+    eps = Setting(0.5)
+    metric_idx = Setting(0)
+    auto_commit = Setting(True)
+
+    want_main_area = False
+
+    def __init__(self):
+        super().__init__()
+
+        self.data = None
+        self.db = None
+        self.model = None
+
+        box = gui.widgetBox(self.controlArea, "Parameters")
+        gui.spin(box, self, "min_samples", 1, 100, 1, callback=self._invalidate,
+                 label="Core point neighbors")
+        gui.doubleSpin(box, self, "eps", 0.1, 10, 0.01,
+                       callback=self._invalidate,
+                       label="Neighborhood distance")
+
+        box = gui.widgetBox(self.controlArea, self.tr("Distance Metric"))
+        gui.comboBox(box, self, "metric_idx",
+                     items=tuple(label for label, _ in self.METRICS),
+                     callback=self._invalidate)
+
+        gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
+                        orientation=Qt.Horizontal)
+        gui.rubber(self.controlArea)
+
+        self.controlArea.setMinimumWidth(self.controlArea.sizeHint().width())
+        self.layout().setSizeConstraint(QLayout.SetFixedSize)
+
+    def adjustSize(self):
+        """Resize the widget to the size hint of its control area."""
+        self.ensurePolished()
+        self.resize(self.controlArea.sizeHint())
+
+    def check_data_size(self):
+        """Return True if there is a table with at least two rows.
+
+        Shows ``Error.not_enough_instances`` when a table is present but
+        too small; a missing table is not reported as an error.
+        """
+        # Parameter callbacks can fire before any data arrives; without
+        # this guard ``len(None)`` would raise TypeError.
+        if self.data is None:
+            return False
+        if len(self.data) < 2:
+            self.Error.not_enough_instances()
+            return False
+        return True
+
+    def commit(self):
+        """Recompute the clustering and send the result."""
+        self.cluster()
+
+    def cluster(self):
+        """Fit DBSCAN with the current settings and send annotated data.
+
+        Without usable data the model is dropped and the output is
+        cleared, so a result computed from earlier data is not left on
+        the output next to the error.
+        """
+        if not self.check_data_size():
+            self.model = None
+            self.Outputs.annotated_data.send(None)
+            return
+        self.model = DBSCAN(
+            eps=self.eps,
+            min_samples=self.min_samples,
+            metric=self.METRICS[self.metric_idx][1]
+        ).get_model(self.data)
+        self.send_data()
+
+    def send_data(self):
+        """Append the cluster and core-sample meta columns and send."""
+        model = self.model
+        n_rows = len(self.data)
+
+        labels = np.asarray(model.labels)
+        is_noise = labels < 0
+        # Negative labels become missing values in the "Cluster" column.
+        clusters = labels.astype(float)
+        clusters[is_noise] = np.nan
+        # Count distinct clusters on the integer labels; a set difference
+        # with NaN would depend on object identity, since nan != nan.
+        k = len(np.unique(labels[~is_noise]))
+
+        core_idx = np.asarray(model.projector.core_sample_indices_, dtype=int)
+        in_core = np.zeros(n_rows, dtype=int)
+        in_core[core_idx] = 1
+
+        clust_var = DiscreteVariable(
+            "Cluster", values=["C%d" % (x + 1) for x in range(k)])
+        in_core_var = DiscreteVariable("DBSCAN Core", values=["0", "1"])
+
+        domain = self.data.domain
+        new_domain = Domain(domain.attributes, domain.class_vars,
+                            domain.metas + (clust_var, in_core_var))
+        new_metas = np.hstack((self.data.metas,
+                               clusters.reshape(n_rows, 1),
+                               in_core.reshape(n_rows, 1)))
+        new_table = Table(new_domain, self.data.X, self.data.Y, new_metas,
+                          self.data.W)
+
+        self.Outputs.annotated_data.send(new_table)
+
+    @Inputs.data
+    def set_data(self, data):
+        """Store the input table and recompute, or clear the output."""
+        self.Error.clear()
+        self.data = data
+        if data is None:
+            self.model = None
+            self.Outputs.annotated_data.send(None)
+            return
+        self.unconditional_commit()
+
+    def _invalidate(self):
+        """Commit after one of the clustering parameters changes."""
+        self.commit()
+
+
+if __name__ == "__main__":
+    import sys
+    from AnyQt.QtWidgets import QApplication
+
+    a = QApplication(sys.argv)
+    ow = OWDBSCAN()
+    d = Table("iris.tab")
+    ow.set_data(d)
+    ow.show()
+    a.exec()
+    ow.saveSettings()
diff --git a/Orange/widgets/unsupervised/tests/test_owdbscan.py b/Orange/widgets/unsupervised/tests/test_owdbscan.py
new file mode 100644
index 00000000000..c53fe2ef81d
--- /dev/null
+++ b/Orange/widgets/unsupervised/tests/test_owdbscan.py
@@ -0,0 +1,128 @@
+import numpy as np
+from scipy.sparse import csr_matrix
+
+from Orange.data import Table
+from Orange.widgets.tests.base import WidgetTest
+from Orange.widgets.tests.utils import simulate
+from Orange.widgets.unsupervised.owdbscan import OWDBSCAN
+
+
+class TestOWDBSCAN(WidgetTest):
+    def setUp(self):
+        self.widget = self.create_widget(OWDBSCAN)
+        self.iris = Table("iris")
+
+    def tearDown(self):
+        self.widgets.remove(self.widget)
+        self.widget.onDeleteWidget()
+        self.widget = None
+
+    def _assert_annotated(self, output):
+        """Check that `output` is iris with the two extra meta columns."""
+        self.assertIsNotNone(output)
+        self.assertEqual(len(self.iris), len(output))
+        self.assertTupleEqual(self.iris.X.shape, output.X.shape)
+        self.assertTupleEqual(self.iris.Y.shape, output.Y.shape)
+        self.assertEqual(2, output.metas.shape[1])
+
+        self.assertEqual("Cluster", str(output.domain.metas[0]))
+        self.assertEqual("DBSCAN Core", str(output.domain.metas[1]))
+
+    def test_cluster(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris)
+
+        self._assert_annotated(self.get_output(w.Outputs.annotated_data))
+
+    def test_bad_input(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris[:1])
+        self.assertTrue(w.Error.not_enough_instances.is_shown())
+
+        self.send_signal(w.Inputs.data, self.iris[:2])
+        self.assertFalse(w.Error.not_enough_instances.is_shown())
+
+        self.send_signal(w.Inputs.data, self.iris)
+        self.assertFalse(w.Error.not_enough_instances.is_shown())
+
+        # too few instances must also withdraw the previous output
+        self.send_signal(w.Inputs.data, self.iris[:1])
+        self.assertTrue(w.Error.not_enough_instances.is_shown())
+        self.assertIsNone(self.get_output(w.Outputs.annotated_data))
+
+    def test_data_none(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris[:5])
+        self.send_signal(w.Inputs.data, None)
+
+        output = self.get_output(w.Outputs.annotated_data)
+        self.assertIsNone(output)
+
+    def test_change_parameter_without_data(self):
+        w = self.widget
+
+        # no input yet: the parameter callback must not raise
+        w.controls.eps.valueChanged.emit(1)
+
+        self.assertFalse(w.Error.not_enough_instances.is_shown())
+        self.assertIsNone(self.get_output(w.Outputs.annotated_data))
+
+    def test_change_eps(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris)
+
+        # change parameters
+        self.widget.controls.eps.valueChanged.emit(0.5)
+        output1 = self.get_output(w.Outputs.annotated_data)
+        self.widget.controls.eps.valueChanged.emit(1)
+        output2 = self.get_output(w.Outputs.annotated_data)
+
+        # on this data higher eps has greater sum of clusters - less nan
+        # values
+        self.assertGreater(np.nansum(output2.metas[:, 0]),
+                           np.nansum(output1.metas[:, 0]))
+
+    def test_change_min_samples(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris)
+
+        # change parameters
+        self.widget.controls.min_samples.valueChanged.emit(5)
+        output1 = self.get_output(w.Outputs.annotated_data)
+        self.widget.controls.min_samples.valueChanged.emit(1)
+        output2 = self.get_output(w.Outputs.annotated_data)
+
+        # on this data lower min_samples has greater sum of clusters - less nan
+        # values
+        self.assertGreater(np.nansum(output2.metas[:, 0]),
+                           np.nansum(output1.metas[:, 0]))
+
+    def test_change_metric_idx(self):
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris)
+
+        # change parameters
+        cbox = self.widget.controls.metric_idx
+        simulate.combobox_activate_index(cbox, 0)  # Euclidean
+        output1 = self.get_output(w.Outputs.annotated_data)
+        simulate.combobox_activate_index(cbox, 1)  # Manhattan
+        output2 = self.get_output(w.Outputs.annotated_data)
+
+        # Manhattan has more nan clusters
+        self.assertGreater(np.nansum(output1.metas[:, 0]),
+                           np.nansum(output2.metas[:, 0]))
+
+    def test_sparse_data(self):
+        self.iris.X = csr_matrix(self.iris.X)
+
+        w = self.widget
+
+        self.send_signal(w.Inputs.data, self.iris)
+
+        self._assert_annotated(self.get_output(w.Outputs.annotated_data))