Skip to content

Commit

Permalink
Merge pull request #1747 from ales-erjavec/fixes/silhouette-n-clusters
Browse files Browse the repository at this point in the history
[FIX] Silhouette Plot: Fixes
  • Loading branch information
lanzagar authored Nov 15, 2016
2 parents 1ee9e1e + c7efeed commit fbf23ba
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 19 deletions.
90 changes: 71 additions & 19 deletions Orange/widgets/visualize/owsilhouetteplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from xml.sax.saxutils import escape
from types import SimpleNamespace as namespace

if sys.version_info > (3, 5):
from typing import Optional

import numpy
import sklearn.metrics

Expand Down Expand Up @@ -57,7 +60,7 @@ class OWSilhouettePlot(widget.OWWidget):
cluster_var_idx = settings.ContextSetting(0)
#: Annotation variable index
annotation_var_idx = settings.ContextSetting(0)
#: Group the silhouettes by cluster
#: Group the (displayed) silhouettes by cluster
group_by_cluster = settings.Setting(True)
#: A fixed size for an instance bar
bar_size = settings.Setting(3)
Expand All @@ -73,16 +76,30 @@ class OWSilhouettePlot(widget.OWWidget):

class Error(widget.OWWidget.Error):
need_two_clusters = Msg("Need at least two non-empty clusters")
singleton_clusters_all = Msg("All clusters are singletons")

class Warning(widget.OWWidget.Warning):
missing_cluster_assignment = Msg(
"{} instance{s} omitted (missing cluster assignment)")

def __init__(self):
super().__init__()

self.data = None
self._effective_data = None
self._matrix = None
self._silhouette = None
self._labels = None
self._silplot = None
#: The input data
self.data = None # type: Optional[Orange.data.Table]
#: Data after any applied pre-processing step
self._effective_data = None # type: Optional[Orange.data.Table]
#: Distance matrix computed from _effective_data
self._matrix = None # type: Optional[Orange.misc.DistMatrix]
#: An bool mask (size == len(data)) indicating missing group/cluster
#: assignments
self._mask = None # type: Optional[numpy.ndarray]
#: An array of cluster/group labels for instances with valid group
#: assignment
self._labels = None # type: Optional[numpy.ndarray]
#: An array of silhouette scores for instances with valid group
#: assignment
self._silhouette = None # type: Optional[numpy.ndarray]
self._silplot = None # type: Optional[SilhouettePlot]

gui.comboBox(
self.controlArea, self, "distance_idx", box="Distance",
Expand Down Expand Up @@ -199,11 +216,14 @@ def clear(self):
self.data = None
self._effective_data = None
self._matrix = None
self._mask = None
self._silhouette = None
self._labels = None
self.cluster_var_model[:] = []
self.annotation_var_model[:] = ["None"]
self._clear_scene()
self.Error.clear()
self.Warning.clear()

def _clear_scene(self):
# Clear the graphics scene and associated objects
Expand All @@ -218,7 +238,7 @@ def _invalidate_distances(self):

def _invalidate_scores(self):
# Invalidate and recompute the current silhouette scores.
self._labels = self._silhouette = None
self._labels = self._silhouette = self._mask = None
self._update()
self._replot()
if self.data is not None:
Expand All @@ -227,6 +247,7 @@ def _invalidate_scores(self):
def _update(self):
# Update/recompute the distances/scores as required
if self.data is None:
self._mask = None
self._silhouette = None
self._labels = None
self._matrix = None
Expand All @@ -239,19 +260,35 @@ def _update(self):

labelvar = self.cluster_var_model[self.cluster_var_idx]
labels, _ = self.data.get_column_view(labelvar)
mask = numpy.isnan(labels)
labels = labels.astype(int)
_, counts = numpy.unique(labels, return_counts=True)
if numpy.count_nonzero(counts) >= 2:
self.Error.need_two_clusters.clear()
silhouette = sklearn.metrics.silhouette_samples(
self._matrix, labels, metric="precomputed")
else:
self.Error.need_two_clusters()
labels = silhouette = None
labels = labels[~mask]

labels_unq, counts = numpy.unique(labels, return_counts=True)

self.Error.singleton_clusters_all.clear()
self.Error.need_two_clusters.clear()
self.Warning.missing_cluster_assignment.clear()

if len(labels_unq) < 2:
self.Error.need_two_clusters()
labels = silhouette = mask = None
elif len(labels_unq) == len(labels):
self.Error.singleton_clusters_all()
labels = silhouette = mask = None
else:
silhouette = sklearn.metrics.silhouette_samples(
self._matrix[~mask, :][:, ~mask], labels, metric="precomputed")
self._mask = mask
self._labels = labels
self._silhouette = silhouette

if labels is not None:
count_missing = numpy.count_nonzero(mask)
if count_missing:
self.Warning.missing_cluster_assignment(
count_missing, s="s" if count_missing > 1 else "")

def _set_bar_height(self):
visible = self.bar_size >= 5
self._silplot.setBarHeight(self.bar_size)
Expand Down Expand Up @@ -304,6 +341,9 @@ def _update_annotations(self):
if self._silplot is not None:
if annot_var is not None:
column, _ = self.data.get_column_view(annot_var)
if self._mask is not None:
assert column.shape == self._mask.shape
column = column[~self._mask]
self._silplot.setRowNames(
[annot_var.str_val(value) for value in column])
else:
Expand All @@ -318,8 +358,18 @@ def commit(self):
selectedmask = numpy.full(len(self.data), False, dtype=bool)
if self._silplot is not None:
indices = self._silplot.selection()
assert (numpy.diff(indices) > 0).all(), "strictly increasing"
if self._mask is not None:
indices = numpy.flatnonzero(~self._mask)[indices]
selectedmask[indices] = True
scores = self._silhouette

if self._mask is not None:
scores = numpy.full(shape=selectedmask.shape,
fill_value=numpy.nan)
scores[~self._mask] = self._silhouette
else:
scores = self._silhouette

silhouette_var = None
if self.add_scores:
var = self.cluster_var_model[self.cluster_var_idx]
Expand Down Expand Up @@ -421,7 +471,9 @@ def setScores(self, scores, labels, values, rownames=None):
raise ValueError("rownames must have the same size as scores")

Ck = numpy.unique(labels)
assert Ck[0] >= 0 and Ck[-1] < len(values)
if not Ck[0] >= 0 and Ck[-1] < len(values):
raise ValueError(
"All indices in `labels` must be in `range(len(values))`")
cluster_indices = [numpy.flatnonzero(labels == i)
for i in range(len(values))]
cluster_indices = [indices[numpy.argsort(scores[indices])[::-1]]
Expand Down
33 changes: 33 additions & 0 deletions Orange/widgets/visualize/tests/test_owsilhouetteplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def setUpClass(cls):
def setUp(self):
self.widget = self.create_widget(OWSilhouettePlot,
stored_settings={"auto_commit": True})
self.widget = self.widget # type: OWSilhouettePlot

def test_outputs_add_scores(self):
# check output when appending scores
Expand All @@ -39,3 +40,35 @@ def _select_data(self):
points = random.sample(range(0, len(self.data)), 20)
self.widget._silplot.setSelection(points)
return sorted(points)

def test_insufficient_clusters(self):
iris = self.data
data_one_cluster = iris[:3] # three instances Iris-setosa only
self.send_signal("Data", data_one_cluster)
self.assertTrue(self.widget.Error.need_two_clusters.is_shown())

data_singletons = iris[[0, 50, 100]]
assert len(np.unique(data_singletons.Y)) == 3 # 3 instances 3 labels
self.send_signal("Data", data_singletons)
self.assertTrue(self.widget.Error.singleton_clusters_all.is_shown())

def test_unknowns_in_labels(self):
self.widget.controls.add_scores.setChecked(1)
scorename = "Silhouette (iris)"
data = self.data[[0, 1, 2, 50, 51, 52, 100, 101, 102]]
data.Y[::3] = np.nan
valid = ~np.isnan(data.Y.flatten())
self.send_signal("Data", data)
output = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
scores = output[:, scorename].metas.flatten()
self.assertTrue(np.all(np.isnan(scores[::3])))
self.assertTrue(np.all(np.isfinite(scores[valid])))

# Run again on subset with known labels
data_1 = data[np.flatnonzero(valid)]
self.send_signal("Data", data_1)
output_1 = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
scores_1 = output_1[:, scorename].metas.flatten()
self.assertTrue(np.all(np.isfinite(scores_1)))
# the scores must match
np.testing.assert_almost_equal(scores_1, scores[valid], decimal=12)

0 comments on commit fbf23ba

Please sign in to comment.