[ENH] Add remove sparse features preprocessor #4093

Merged · 6 commits · Nov 14, 2019
40 changes: 34 additions & 6 deletions Orange/preprocess/preprocess.py
@@ -18,7 +18,8 @@
 __all__ = ["Continuize", "Discretize", "Impute", "RemoveNaNRows",
            "SklImpute", "Normalize", "Randomize", "Preprocess",
            "RemoveConstant", "RemoveNaNClasses", "RemoveNaNColumns",
-           "ProjectPCA", "ProjectCUR", "Scale", "AdaptiveNormalize"]
+           "ProjectPCA", "ProjectCUR", "Scale", "RemoveSparse",
+           "AdaptiveNormalize"]


 class Preprocess(_RefuseDataInConstructor, Reprable):
@@ -102,11 +103,11 @@ def transform(var):
             else:
                 return var

-        def discretized(vars, do_discretize):
+        def discretized(vars_, do_discretize):
             if do_discretize:
-                vars = (transform(var) for var in vars)
-                vars = [var for var in vars if var is not None]
-            return vars
+                vars_ = (transform(var) for var in vars_)
+                vars_ = [var for var in vars_ if var is not None]
+            return vars_

         method = self.method or discretize.EqualFreq()
         domain = Orange.data.Domain(
@@ -422,7 +423,8 @@ def __call__(self, data):
         new_data.metas = self.randomize(new_data.metas, r3)
         return new_data

-    def randomize(self, table, rand_state=None):
+    @staticmethod
+    def randomize(table, rand_state=None):
         rstate = np.random.RandomState(rand_state)
         if sp.issparse(table):
             table = table.tocsc()  # type: sp.spmatrix
@@ -569,6 +571,32 @@ def __call__(self, data):
             data = pp(data)
         return data

+class RemoveSparse(Preprocess):
+    """
+    Remove sparse features. Sparseness is determined according to a
+    user-defined threshold.
+
+    Parameters
+    ----------
+    threshold : float
+        Minimal proportion of non-zero entries of a feature
+    """
+
+    def __init__(self, threshold=0.05):
+        self.threshold = threshold
+
+    def __call__(self, data):
+        if sp.issparse(data.X):
+            data_csc = sp.csc_matrix(data.X)
+            h, w = data_csc.shape
+            sparseness = [data_csc[:, i].count_nonzero() / h for i in range(w)]
+        else:
+            sparseness = np.count_nonzero(data.X, axis=0) / data.X.shape[0]
+        att = [a for a, s in zip(data.domain.attributes, sparseness)
+               if s >= self.threshold]
+        domain = Orange.data.Domain(att, data.domain.class_vars,
+                                    data.domain.metas)
+        return data.transform(domain)
+

 class AdaptiveNormalize(Preprocess):
     """
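For anyone who wants to try the new preprocessor from a script, here is a minimal usage sketch (it assumes this branch is installed; the bundled `iris` dataset and the 50% threshold are only illustrative):

```python
from Orange.data import Table
from Orange.preprocess import RemoveSparse

data = Table("iris")         # 150 rows, 4 continuous features, all non-zero
data.X[:100, 0] = 0          # make the first feature only ~33% non-zero

# Keep features with at least 50% non-zero entries;
# the zeroed-out first column falls below the threshold and is dropped.
filtered = RemoveSparse(threshold=0.5)(data)
print([attr.name for attr in filtered.domain.attributes])
```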
31 changes: 28 additions & 3 deletions Orange/tests/test_preprocess.py
@@ -9,12 +9,12 @@
 import numpy as np
 from scipy.sparse import csr_matrix

-from Orange.data import Table
+from Orange.data import Table, Domain, ContinuousVariable
 from Orange.preprocess import EntropyMDL, DoNotImpute, Default, Average, \
     SelectRandomFeatures, EqualFreq, RemoveNaNColumns, DropInstances, \
     EqualWidth, SelectBestFeatures, RemoveNaNRows, Preprocess, Scale, \
     Randomize, Continuize, Discretize, Impute, SklImpute, Normalize, \
-    ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize
+    ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize, RemoveSparse
 from Orange.util import OrangeDeprecationWarning


@@ -139,7 +139,7 @@ def test_reprs(self):
                     Randomize, ProjectPCA, ProjectCUR, Scale,
                     EqualFreq, EqualWidth, EntropyMDL, SelectBestFeatures,
                     SelectRandomFeatures, RemoveNaNColumns, DoNotImpute, DropInstances,
-                    Average, Default]
+                    Average, Default, RemoveSparse]

         for preproc in preprocs:
             repr_str = repr(preproc())
@@ -189,3 +189,28 @@ def test_sparse_pps(self):
         true_out = Scale(center=Scale.NoCentering, scale=Scale.Span)(self.data)
         np.testing.assert_array_equal(out, true_out)
         self.data = self.data.X.toarray()
+
+
+class TestRemoveSparse(unittest.TestCase):
+
+    def setUp(self):
+        domain = Domain([ContinuousVariable('a'), ContinuousVariable('b')])
+        self.data = Table.from_numpy(domain, np.zeros((3, 2)))
+        self.data[1:, 1] = 7
+
+    def test_dense(self):
+        true_out = self.data[:, 1]
+        true_out.X = true_out.X.reshape(-1, 1)
+        out = RemoveSparse(0.5)(self.data)
+        np.testing.assert_array_equal(out, true_out)
+
+    def test_sparse(self):
+        true_out = self.data[:, 1]
+        self.data.X = csr_matrix(self.data.X)
+        true_out.X = csr_matrix(true_out.X)
+        out = RemoveSparse(0.5)(self.data).X
+        np.testing.assert_array_equal(out, true_out)
+
+
+if __name__ == '__main__':
+    unittest.main()
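The sparse code path exercised by `TestRemoveSparse.test_sparse` can also be reproduced interactively. A rough sketch under the same assumptions (the tiny 3×2 table and the variable names are only illustrative):

```python
import numpy as np
from scipy.sparse import csr_matrix
from Orange.data import Table, Domain, ContinuousVariable
from Orange.preprocess import RemoveSparse

domain = Domain([ContinuousVariable("a"), ContinuousVariable("b")])
X = np.zeros((3, 2))
X[1:, 1] = 7                    # "b" is 2/3 non-zero, "a" is all zeros
data = Table.from_numpy(domain, X)
data.X = csr_matrix(data.X)     # force the scipy.sparse branch of __call__

out = RemoveSparse(0.5)(data)   # keep columns with >= 50% non-zero values
print([attr.name for attr in out.domain.attributes])   # only "b" survives
```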
39 changes: 38 additions & 1 deletion Orange/widgets/data/owpreprocess.py
@@ -23,7 +23,7 @@
 import Orange.data
 from Orange import preprocess
 from Orange.preprocess import Continuize, ProjectPCA, RemoveNaNRows, \
-    ProjectCUR, Scale as _Scale, Randomize as _Randomize
+    ProjectCUR, Scale as _Scale, Randomize as _Randomize, RemoveSparse
 from Orange.widgets import widget, gui
 from Orange.widgets.settings import Setting
 from Orange.widgets.utils.overlay import OverlayWidget
@@ -250,6 +250,37 @@ def createinstance(params):
     def __repr__(self):
         return self.Continuizers[self.__treatment]

+class RemoveSparseEditor(BaseEditor):
+
+    def __init__(self, parent=None, **kwargs):
+        super().__init__(parent, **kwargs)
+        self.setLayout(QVBoxLayout())
+        self.sparse_thresh = 5
+        form = QFormLayout()
+        self.cspin = QSpinBox(minimum=1, maximum=100, value=self.sparse_thresh)
+        self.cspin.valueChanged[int].connect(self.setThresh)
+        self.cspin.editingFinished.connect(self.edited)
+
+        form.addRow("Min % of nonzero values:", self.cspin)
+        self.layout().addLayout(form)
+
+    def setThresh(self, thresh):
+        if self.sparse_thresh != thresh:
+            self.sparse_thresh = thresh
+            self.cspin.setValue(thresh)
+            self.changed.emit()
+
+    def parameters(self):
+        return {'sparse_thresh': self.sparse_thresh}
+
+    def setParameters(self, params):
+        self.setThresh(params.get('sparse_thresh', 5))
+
+    @staticmethod
+    def createinstance(params):
+        params = dict(params)
+        threshold = params.pop('sparse_thresh', 5)
+        return RemoveSparse(threshold=threshold / 100)

 class ImputeEditor(BaseEditor):
     (NoImputation, Constant, Average,
@@ -922,6 +953,12 @@ def icon_path(basename):
                     icon_path("Random.svg")),
         Randomize
     ),
+    PreprocessAction(
+        "Remove Sparse", "orange.preprocess.remove_sparse", "Feature Selection",
+        Description("Remove Sparse Features",
+                    icon_path("PurgeDomain.svg")),
+        RemoveSparseEditor
+    ),
     PreprocessAction(
         "PCA", "orange.preprocess.pca", "PCA",
         Description("Principal Component Analysis",
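One detail worth noting for reviewers: the spin box stores the threshold as a whole percent, while `RemoveSparse` expects a fraction, so `createinstance` divides by 100. Because `createinstance` is a `@staticmethod`, the mapping can be checked without a running QApplication; a small sketch (the 20% value is only illustrative):

```python
from Orange.widgets.data.owpreprocess import RemoveSparseEditor

# The stored parameter is a whole percent ...
pp = RemoveSparseEditor.createinstance({"sparse_thresh": 20})

# ... while the constructed preprocessor holds the equivalent fraction.
print(type(pp).__name__, pp.threshold)   # RemoveSparse 0.2
```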
32 changes: 31 additions & 1 deletion Orange/widgets/data/tests/test_owpreprocess.py
@@ -4,7 +4,8 @@

 from Orange.data import Table
 from Orange.preprocess import (
-    Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA, ProjectCUR
+    Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA,
+    ProjectCUR, RemoveSparse
 )
 from Orange.preprocess import discretize, impute, fss, score
 from Orange.widgets.data import owpreprocess
@@ -37,6 +38,20 @@ def test_randomize(self):
         np.testing.assert_array_equal(self.zoo.metas, output.metas)
         self.assertFalse(np.array_equal(self.zoo.Y, output.Y))

+    def test_remove_sparse(self):
+        data = Table("iris")
+        idx = int(data.X.shape[0] / 10)
+        data.X[:idx + 1, 0] = np.zeros((idx + 1,))
+        saved = {"preprocessors": [("orange.preprocess.remove_sparse",
+                                    {"sparse_thresh": 90})]}
+        model = self.widget.load(saved)
+
+        self.widget.set_model(model)
+        self.send_signal(self.widget.Inputs.data, data)
+        output = self.get_output(self.widget.Outputs.preprocessed_data)
+        np.testing.assert_array_equal(output.X, data.X[:, 1:])
+        np.testing.assert_array_equal(output.Y, data.Y)
+        np.testing.assert_array_equal(output.metas, data.metas)
+
     def test_normalize(self):
         data = Table("iris")
         saved = {"preprocessors": [("orange.preprocess.scale",
@@ -246,3 +261,18 @@ def test_editor(self):
         self.assertIsInstance(p, ProjectCUR)
         self.assertEqual(p.rank, 5)
         self.assertEqual(p.max_error, 0.5)
+
+class TestRemoveSparseEditor(WidgetTest):
+
+    def test_editor(self):
+        widget = owpreprocess.RemoveSparseEditor()
+        self.assertEqual(widget.parameters(), {"sparse_thresh": 5})
+
+        p = widget.createinstance(widget.parameters())
+        self.assertIsInstance(p, RemoveSparse)
+        self.assertEqual(p.threshold, 0.05)
+
+        widget.setParameters({"sparse_thresh": 90})
+        p = widget.createinstance(widget.parameters())
+        self.assertIsInstance(p, RemoveSparse)
+        self.assertEqual(p.threshold, 0.9)
11 changes: 6 additions & 5 deletions doc/visual-programming/source/widgets/data/preprocess.md
@@ -47,11 +47,12 @@ Preprocessors

 ![](images/Preprocess2-stamped.png)

-1. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
-2. Normalize adjusts values to a common scale. Center values by mean or median or omit centering altogether. Similar for scaling, one can scale by SD (standard deviation), by span or not at all.
-3. Randomize instances. Randomize classes shuffles class values and destroys connection between instances and class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
-4. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
-5. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.
+5. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
+6. Normalize adjusts values to a common scale. Center values by mean or median or omit centering altogether. Similar for scaling, one can scale by SD (standard deviation), by span or not at all.
+7. Randomize instances. Randomize classes shuffles class values and destroys connection between instances and class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
+8. *Remove sparse features* retains features with more than a user-defined threshold percentage of non-zero values and discards the rest.
+9. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
+10. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.

 Examples
 --------