Skip to content

Commit

Permalink
Merge pull request #6348 from janezd/lazy-signals
Browse files Browse the repository at this point in the history
[ENH] Lazy signals for Hierarchical Clustering
  • Loading branch information
markotoplak authored Aug 21, 2023
2 parents 1725baa + 2fc2717 commit eb3633c
Show file tree
Hide file tree
Showing 8 changed files with 321 additions and 102 deletions.
78 changes: 42 additions & 36 deletions Orange/widgets/unsupervised/owhierarchicalclustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@

from Orange.widgets.utils.localization import pl
from orangewidget.utils.itemmodels import PyListModel
from orangewidget.utils.signals import LazyValue

import Orange.data
from Orange.data.domain import filter_visible
from Orange.data import Domain, DiscreteVariable, ContinuousVariable, \
StringVariable
StringVariable, Table
import Orange.misc
from Orange.clustering.hierarchical import \
postorder, preorder, Tree, tree_from_linkage, dist_matrix_linkage, \
Expand All @@ -32,8 +33,11 @@

from Orange.widgets import widget, gui, settings
from Orange.widgets.utils import itemmodels, combobox
from Orange.widgets.utils.annotated_data import (create_annotated_table,
ANNOTATED_DATA_SIGNAL_NAME)
from Orange.widgets.utils.annotated_data import (lazy_annotated_table,
ANNOTATED_DATA_SIGNAL_NAME,
domain_with_annotation_column,
add_columns,
create_annotated_table)
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.visualize.utils.plotutils import AxisItem
from Orange.widgets.widget import Input, Output, Msg
Expand Down Expand Up @@ -776,71 +780,73 @@ def commit(self):
for node in selection]

selected_indices = list(chain(*maps))
unselected_indices = sorted(set(range(self.root.value.last)) -
set(selected_indices))

if not selected_indices:
self.Outputs.selected_data.send(None)
annotated_data = create_annotated_table(items, []) \
annotated_data = lazy_annotated_table(items, []) \
if self.selection_method == 0 and self.matrix.axis else None
self.Outputs.annotated_data.send(annotated_data)
return

selected_data = None
selected_data = annotated_data = None

if isinstance(items, Orange.data.Table) and self.matrix.axis == 1:
# Select rows
c = np.zeros(self.matrix.shape[0])
data, domain = items, items.domain

c = np.full(self.matrix.shape[0], len(maps))
for i, indices in enumerate(maps):
c[indices] = i
c[unselected_indices] = len(maps)

mask = c != len(maps)

data, domain = items, items.domain
attrs = domain.attributes
classes = domain.class_vars
metas = domain.metas

var_name = get_unique_names(domain, "Cluster")
clust_name = get_unique_names(domain, "Cluster")
values = [f"C{i + 1}" for i in range(len(maps))]

clust_var = Orange.data.DiscreteVariable(
var_name, values=values + ["Other"])
domain = Orange.data.Domain(attrs, classes, metas + (clust_var,))
data = items.transform(domain)
with data.unlocked(data.metas):
data.set_column(clust_var, c)

if selected_indices:
selected_data = data[mask]
clust_var = Orange.data.DiscreteVariable(
var_name, values=values)
selected_data.domain = Domain(
attrs, classes, metas + (clust_var, ))

annotated_data = create_annotated_table(data, selected_indices)
sel_clust_var = Orange.data.DiscreteVariable(
name=clust_name, values=values)
sel_domain = add_columns(domain, metas=(sel_clust_var,))
selected_data = LazyValue[Table](
lambda: items.add_column(
sel_clust_var, c, to_metas=True)[c != len(maps)],
domain=sel_domain, length=len(selected_indices))

ann_clust_var = Orange.data.DiscreteVariable(
name=clust_name, values=values + ["Other"]
)
ann_domain = add_columns(
domain_with_annotation_column(data)[0], metas=(ann_clust_var, ))
annotated_data = LazyValue[Table](
lambda: create_annotated_table(
data=items.add_column(ann_clust_var, c, to_metas=True),
selected_indices=selected_indices),
domain=ann_domain, length=len(items)
)

elif isinstance(items, Orange.data.Table) and self.matrix.axis == 0:
# Select columns
attrs = []
unselected_indices = sorted(set(range(self.root.value.last)) -
set(selected_indices))
for clust, indices in chain(enumerate(maps, start=1),
[(0, unselected_indices)]):
for i in indices:
attr = items.domain[i].copy()
attr.attributes["cluster"] = clust
attrs.append(attr)
domain = Orange.data.Domain(
all_domain = Orange.data.Domain(
# len(unselected_indices) can be 0
attrs[:len(attrs) - len(unselected_indices)],
items.domain.class_vars, items.domain.metas)
selected_data = items.from_table(domain, items)

domain = Orange.data.Domain(
selected_data = LazyValue[Table](
lambda: items.from_table(all_domain, items),
domain=all_domain, length=len(items))

sel_domain = Orange.data.Domain(
attrs,
items.domain.class_vars, items.domain.metas)
annotated_data = items.from_table(domain, items)
annotated_data = LazyValue[Table](
lambda: items.from_table(sel_domain, items),
domain=sel_domain, length=len(items))

self.Outputs.selected_data.send(selected_data)
self.Outputs.annotated_data.send(annotated_data)
Expand Down
61 changes: 50 additions & 11 deletions Orange/widgets/utils/annotated_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from typing import Union

import numpy as np
from Orange.data import Domain, DiscreteVariable

from orangewidget.utils.signals import LazyValue

from Orange.data import Domain, DiscreteVariable, Table
from Orange.data.util import get_unique_names

ANNOTATED_DATA_SIGNAL_NAME = "Data"
Expand Down Expand Up @@ -30,16 +35,26 @@ def add_columns(domain, attributes=(), class_vars=(), metas=()):
return Domain(attributes, class_vars, metas)


def domain_with_annotation_column(
        data: Union[Table, Domain],
        values=("No", "Yes"),
        var_name=ANNOTATED_DATA_FEATURE_NAME):
    """
    Build a domain extended with one discrete annotation variable.

    Args:
        data: either a `Domain`, used as is, or an object whose `.domain`
            attribute is used.
        values: values of the new discrete variable.
        var_name: proposed name of the new variable; it is passed through
            `get_unique_names` together with the source domain.

    Returns:
        A tuple `(domain, var)`: the new domain and the variable added to it.
        The variable becomes the class variable when the source domain has
        no class variables; otherwise it is appended to the metas.
    """
    source = data if isinstance(data, Domain) else data.domain
    annotation = DiscreteVariable(get_unique_names(source, var_name), values)
    if source.class_vars:
        # A class already exists: keep it and put the annotation in metas.
        new_class_vars = source.class_vars
        new_metas = source.metas + (annotation, )
    else:
        new_class_vars = source.class_vars + (annotation, )
        new_metas = source.metas
    return Domain(source.attributes, new_class_vars, new_metas), annotation


def _table_with_annotation_column(data, values, column_data, var_name):
var = DiscreteVariable(get_unique_names(data.domain, var_name), values)
class_vars, metas = data.domain.class_vars, data.domain.metas
domain, var = domain_with_annotation_column(data, values, var_name)
if not data.domain.class_vars:
class_vars += (var, )
column_data = column_data.reshape((len(data), ))
else:
metas += (var, )
column_data = column_data.reshape((len(data), 1))
domain = Domain(data.domain.attributes, class_vars, metas)
table = data.transform(domain)
with table.unlocked(table.Y if not data.domain.class_vars else table.metas):
table[:, var] = column_data
Expand All @@ -65,17 +80,20 @@ def create_annotated_table(data, selected_indices):
data, ("No", "Yes"), annotated, ANNOTATED_DATA_FEATURE_NAME)


def lazy_annotated_table(data, selected_indices):
    """
    Return a `LazyValue[Table]` wrapping `create_annotated_table`.

    The table itself is not built here; the call to
    `create_annotated_table(data, selected_indices)` is only wrapped in a
    callable handed to `LazyValue`. The domain returned by
    `domain_with_annotation_column(data)` and `len(data)` are passed along
    as the `domain` and `length` keyword arguments.
    """
    def compute():
        return create_annotated_table(data, selected_indices)

    annotated_domain = domain_with_annotation_column(data)[0]
    return LazyValue[Table](
        compute, domain=annotated_domain, length=len(data))


def create_groups_table(data, selection,
include_unselected=True,
var_name=ANNOTATED_DATA_FEATURE_NAME,
values=None):
if data is None:
return None
max_sel = np.max(selection)
if values is None:
values = ["G{}".format(i + 1) for i in range(max_sel)]
if include_unselected:
values.append("Unselected")
values, max_sel = group_values(selection, include_unselected, values)
if include_unselected:
# Place Unselected instances in the "last group", so that the group
# colors and scatter diagram marker colors will match
Expand All @@ -88,3 +106,24 @@ def create_groups_table(data, selection,
data = data[mask]
selection = selection[mask] - 1
return _table_with_annotation_column(data, values, selection, var_name)


def lazy_groups_table(data, selection, include_unselected=True,
                      var_name=ANNOTATED_DATA_FEATURE_NAME, values=None):
    """
    Return a `LazyValue[Table]` wrapping `create_groups_table`.

    The group labels are resolved with `group_values` and the resulting
    domain with `domain_with_annotation_column`; the table itself is only
    built when the wrapped callable is invoked.

    The `length` passed to `LazyValue` is `len(data)` when unselected
    instances are included, and otherwise the number of non-zero entries
    of `selection`.
    """
    if include_unselected:
        n_rows = len(data)
    else:
        n_rows = np.sum(selection != 0)
    group_names = group_values(selection, include_unselected, values)[0]
    groups_domain = domain_with_annotation_column(
        data, group_names, var_name)[0]

    def compute():
        # Reuse the labels resolved above so that the computed table has
        # the same values as the advertised domain.
        return create_groups_table(
            data, selection, include_unselected, var_name, group_names)

    return LazyValue[Table](compute, length=n_rows, domain=groups_domain)


def group_values(selection, include_unselected, values):
    """
    Return the group labels and the largest group index in `selection`.

    Args:
        selection: array-like of group indices.
        include_unselected: if true and `values` is None, the label
            "Unselected" is appended after the generated group labels.
        values: labels supplied by the caller, or None. Supplied labels are
            returned as they are, without modification.

    Returns:
        A tuple `(values, max_sel)`, where `max_sel` is the maximum of
        `selection`. For an empty `selection`, `max_sel` is 0 and no group
        labels are generated (instead of `np.max` raising `ValueError`).
    """
    # np.max has no identity for zero-size input and would raise.
    max_sel = np.max(selection) if np.size(selection) else 0
    if values is None:
        values = [f"G{i}" for i in range(1, max_sel + 1)]
        if include_unselected:
            values.append("Unselected")
    return values, max_sel
Loading

0 comments on commit eb3633c

Please sign in to comment.