Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] OWContinuize: Provide the same options as in Preprocess/Normalize #4466

Merged
merged 1 commit into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 98 additions & 125 deletions Orange/widgets/data/owcontinuize.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from functools import reduce
from types import SimpleNamespace

from AnyQt.QtCore import Qt

import Orange.data
from Orange.util import Reprable
from Orange.statistics import distribution
from Orange.preprocess import Continuize, Normalize
from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer
from Orange.preprocess import Continuize
from Orange.preprocess.transformation import Identity, Indicator, Normalizer
from Orange.data.table import Table
from Orange.widgets import gui, widget
from Orange.widgets.settings import Setting
Expand All @@ -34,16 +34,13 @@ class Outputs:
buttons_area_orientation = Qt.Vertical
resizing_enabled = False

# continuous treats
Leave, NormalizeBySpan, NormalizeBySD = range(3)
Normalize = SimpleNamespace(Leave=0, Standardize=1, Center=2, Scale=3,
Normalize11=4, Normalize01=5)

settings_version = 2
multinomial_treatment = Setting(0)
zero_based = Setting(1)
continuous_treatment = Setting(Leave)
continuous_treatment = Setting(Normalize.Leave)
class_treatment = Setting(0)

transform_class = Setting(False)

autosend = Setting(True)

multinomial_treats = (
Expand All @@ -56,9 +53,13 @@ class Outputs:
("Divide by number of values", Continuize.AsNormalizedOrdinal))

continuous_treats = (
("Leave them as they are", Continuize.Leave),
("Normalize by span", Normalize.NormalizeBySpan),
("Normalize by standard deviation", Normalize.NormalizeBySD))
("Leave them as they are", True),
("Standardize to μ=0, σ²=1", False),
("Center to μ=0", False),
("Scale to σ²=1", True),
("Normalize to interval [-1, 1]", False),
("Normalize to interval [0, 1]", False)
)

class_treats = (
("Leave it as it is", Continuize.Leave),
Expand All @@ -67,8 +68,6 @@ class Outputs:
("One class per value", Continuize.Indicators),
)

value_ranges = ["From -1 to 1", "From 0 to 1"]

def __init__(self):
super().__init__()

Expand All @@ -84,19 +83,12 @@ def __init__(self):
btnLabels=[x[0] for x in self.continuous_treats],
callback=self.settings_changed)

box = gui.vBox(self.controlArea, "Categorical Outcomes")
box = gui.vBox(self.controlArea, "Categorical Outcome(s)")
gui.radioButtonsInBox(
box, self, "class_treatment",
btnLabels=[t[0] for t in self.class_treats],
callback=self.settings_changed)

zbbox = gui.vBox(self.controlArea, "Value Range")

gui.radioButtonsInBox(
zbbox, self, "zero_based",
btnLabels=self.value_ranges,
callback=self.settings_changed)

gui.auto_apply(self.buttonsArea, self, "autosend", box=False)

self.data = None
Expand All @@ -120,31 +112,27 @@ def setData(self, data):
self.unconditional_commit()

def enable_normalization(self):
enable = not (self.data and self.data.is_sparse())
if not enable and self.continuous_treatment in (self.NormalizeBySpan,
self.NormalizeBySD):
self.continuous_treatment = self.Leave
buttons = self.controls.continuous_treatment.buttons
buttons[self.NormalizeBySpan].setEnabled(enable)
buttons[self.NormalizeBySD].setEnabled(enable)
if self.data is not None and self.data.is_sparse():
if self.continuous_treatment == self.Normalize.Standardize:
self.continuous_treatment = self.Normalize.Scale
else:
self.continuous_treatment = self.Normalize.Leave
for button, (_, supports_sparse) \
in zip(buttons, self.continuous_treats):
button.setEnabled(supports_sparse)
else:
for button in buttons:
button.setEnabled(True)

def constructContinuizer(self):
conzer = DomainContinuizer(
zero_based=self.zero_based,
multinomial_treatment=self.multinomial_treats[self.multinomial_treatment][1],
continuous_treatment=self.continuous_treats[self.continuous_treatment][1],
continuous_treatment=self.continuous_treatment,
class_treatment=self.class_treats[self.class_treatment][1]
)
return conzer

# def sendPreprocessor(self):
# continuizer = self.constructContinuizer()
# self.send("Preprocessor", PreprocessedLearner(
# lambda data, weightId=0, tc=(self.targetValue if self.classTreatment else -1):
# Table(continuizer(data, weightId, tc)
# if data.domain.has_discrete_class
# else continuizer(data, weightId), data)))

def commit(self):
continuizer = self.constructContinuizer()
if self.data:
Expand All @@ -155,16 +143,28 @@ def commit(self):
else:
self.Outputs.data.send(self.data) # None or empty data


def send_report(self):
self.report_items(
"Settings",
[("Categorical features",
self.multinomial_treats[self.multinomial_treatment][0]),
("Numeric features",
self.continuous_treats[self.continuous_treatment][0]),
("Class", self.class_treats[self.class_treatment][0]),
("Value range", self.value_ranges[self.zero_based])])
("Class", self.class_treats[self.class_treatment][0])])

@classmethod
def migrate_settings(cls, settings, version):
if version < 2:
Normalize = cls.Normalize
cont_treat = settings.pop("continuous_treatment", 0)
zero_based = settings.pop("zero_based", True)
if cont_treat == 1:
if zero_based:
settings["continuous_treatment"] = Normalize.Normalize01
else:
settings["continuous_treatment"] = Normalize.Normalize11
elif cont_treat == 2:
settings["continuous_treatment"] = Normalize.Standardize


class WeightedIndicator(Indicator):
Expand All @@ -179,56 +179,33 @@ def transform(self, c):
return t


class WeightedIndicator1(Indicator1):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
self.weight = weight

def transform(self, c):
t = super().transform(c) * self.weight
if self.weight != 1.0:
t *= self.weight
return t


def make_indicator_var(source, value_ind, weight=None, zero_based=True):
if zero_based and weight is None:
def make_indicator_var(source, value_ind, weight=None):
if weight is None:
indicator = Indicator(source, value=value_ind)
elif zero_based:
indicator = WeightedIndicator(source, value=value_ind, weight=weight)
elif weight is None:
indicator = Indicator1(source, value=value_ind)
else:
indicator = WeightedIndicator1(source, value=value_ind, weight=weight)
indicator = WeightedIndicator(source, value=value_ind, weight=weight)
return Orange.data.ContinuousVariable(
"{}={}".format(source.name, source.values[value_ind]),
compute_value=indicator
)


def dummy_coding(var, base_value=0, zero_based=True):
def dummy_coding(var, base_value=0):
N = len(var.values)
return [make_indicator_var(var, i, zero_based=zero_based)
return [make_indicator_var(var, i)
for i in range(N) if i != base_value]


def one_hot_coding(var, zero_based=True):
def one_hot_coding(var):
N = len(var.values)
return [make_indicator_var(var, i, zero_based=zero_based)
for i in range(N)]
return [make_indicator_var(var, i) for i in range(N)]


def continuize_domain(data_or_domain,
def continuize_domain(data,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
class_treatment=Continuize.Leave,
zero_based=True):

if isinstance(data_or_domain, Orange.data.Domain):
data, domain = None, data_or_domain
else:
data, domain = data_or_domain, data_or_domain.domain

class_treatment=Continuize.Leave):
domain = data.domain
def needs_dist(var, mtreat, ctreat):
"Does the `var` need a distribution given specified flags"
if var.is_discrete:
Expand Down Expand Up @@ -258,14 +235,11 @@ def needs_dist(var, mtreat, ctreat):
dist_iter = iter(dist)

newattrs = [continuize_var(var, next(dist_iter) if needs_dist else None,
multinomial_treatment, continuous_treatment,
zero_based)
multinomial_treatment, continuous_treatment)
for var, needs_dist in zip(domain.attributes, attr_needs_dist)]

newclass = [continuize_var(var,
next(dist_iter) if needs_dist else None,
class_treatment, Continuize.Remove,
zero_based)
class_treatment, Continuize.Remove)
for var, needs_dist in zip(domain.class_vars, cls_needs_dist)]

newattrs = reduce(list.__iadd__, newattrs, [])
Expand All @@ -276,16 +250,16 @@ def needs_dist(var, mtreat, ctreat):
def continuize_var(var,
data_or_dist=None,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
zero_based=True):

continuous_treatment=Continuize.Leave):
def continuize_continuous():
if continuous_treatment == Normalize.NormalizeBySpan:
return [normalize_by_span(var, data_or_dist, zero_based)]
elif continuous_treatment == Normalize.NormalizeBySD:
return [normalize_by_sd(var, data_or_dist)]
else:
dist = _ensure_dist(var, data_or_dist)
treatments = [lambda var, _: var,
normalize_by_sd, center_to_mean, divide_by_sd,
normalize_to_11, normalize_to_01]
if dist.shape[1] == 0:
return [var]
new_var = treatments[continuous_treatment](var, dist)
return [new_var]

def continuize_discrete():
if len(var.values) > 2 and \
Expand All @@ -299,16 +273,16 @@ def continuize_discrete():
elif multinomial_treatment == Continuize.AsOrdinal:
return [ordinal_to_continuous(var)]
elif multinomial_treatment == Continuize.AsNormalizedOrdinal:
return [ordinal_to_norm_continuous(var, zero_based)]
return [ordinal_to_norm_continuous(var)]
elif multinomial_treatment == Continuize.Indicators:
return one_hot_coding(var, zero_based)
return one_hot_coding(var)
elif multinomial_treatment in (
Continuize.FirstAsBase, Continuize.RemoveMultinomial):
return dummy_coding(var, zero_based=zero_based)
return dummy_coding(var)
elif multinomial_treatment == Continuize.FrequentAsBase:
dist = _ensure_dist(var, data_or_dist)
modus = dist.modus()
return dummy_coding(var, base_value=modus, zero_based=zero_based)
return dummy_coding(var, base_value=modus)
elif multinomial_treatment == Continuize.Leave:
return [var]
raise ValueError("Invalid value of `multinomial_treatment`")
Expand Down Expand Up @@ -345,68 +319,67 @@ def ordinal_to_continuous(var):
compute_value=Identity(var))


def ordinal_to_norm_continuous(var, zero_based=True):
def ordinal_to_norm_continuous(var):
n_values = len(var.values)
if zero_based:
return normalized_var(var, 0, 1 / (n_values - 1))
else:
return normalized_var(var, (n_values - 1) / 2, 2 / (n_values - 1))
return normalized_var(var, 0, 1 / (n_values - 1))


def normalize_by_span(var, data_or_dist, zero_based=True):
dist = _ensure_dist(var, data_or_dist)
if dist.shape[1] > 0:
v_max, v_min = dist.max(), dist.min()
else:
v_max, v_min = 0, 0
def normalize_by_sd(var, dist):
mean, sd = dist.mean(), dist.standard_deviation()
sd = sd if sd > 1e-10 else 1
return normalized_var(var, mean, 1 / sd)


def center_to_mean(var, dist):
return normalized_var(var, dist.mean(), 1)


def divide_by_sd(var, dist):
sd = dist.standard_deviation()
sd = sd if sd > 1e-10 else 1
return normalized_var(var, 0, 1 / sd)


def normalize_to_11(var, dist):
return normalize_by_span(var, dist, False)


def normalize_to_01(var, dist):
return normalize_by_span(var, dist, True)


def normalize_by_span(var, dist, zero_based=True):
v_max, v_min = dist.max(), dist.min()
span = (v_max - v_min)
if span < 1e-15:
span = 1

if zero_based:
return normalized_var(var, v_min, 1 / span)
else:
return normalized_var(var, (v_min + v_max) / 2, 2 / span)


def normalize_by_sd(var, data_or_dist):
dist = _ensure_dist(var, data_or_dist)
if dist.shape[1] > 0:
mean, sd = dist.mean(), dist.standard_deviation()
else:
mean, sd = 0, 1
sd = sd if sd > 1e-10 else 1
return normalized_var(var, mean, 1 / sd)


class DomainContinuizer(Reprable):
def __init__(self, zero_based=True,
def __init__(self,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
class_treatment=Continuize.Leave):
self.zero_based = zero_based
self.multinomial_treatment = multinomial_treatment
self.continuous_treatment = continuous_treatment
self.class_treatment = class_treatment

def __call__(self, data):
treat = self.multinomial_treatment
if isinstance(data, Orange.data.Domain):
domain, data = data, None
else:
domain = data.domain

domain = data.domain
if (treat == Continuize.ReportError and
any(var.is_discrete and len(var.values) > 2 for var in domain)):
raise ValueError("Domain has multinomial attributes")

newdomain = continuize_domain(
data or domain,
data,
self.multinomial_treatment,
self.continuous_treatment,
self.class_treatment,
self.zero_based
)
self.class_treatment)
return newdomain


Expand Down
Loading