Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Improved Sparsity Handling #2341

Merged
merged 14 commits into from
Nov 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Orange/data/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@ class DomainConversion:
.. attribute:: metas

Indices for meta attributes

.. attribute:: sparse_X

Flag whether the resulting X matrix should be sparse.

.. attribute:: sparse_Y

Flag whether the resulting Y matrix should be sparse.

.. attribute:: sparse_metas

Flag whether the resulting metas matrix should be sparse.
"""

def __init__(self, source, destination):
Expand All @@ -63,6 +75,21 @@ def __init__(self, source, destination):
source.index(var) if var in source
else var.compute_value for var in destination.metas]

def should_be_sparse(feats):
    """
    Decide whether a matrix built from `feats` should be stored sparse.

    The answer is True only when strictly more than 2/3 of the given
    variables have a truthy `sparse` flag and none of them is a string
    variable (Scipy's sparse matrices don't support dtype=object).
    An empty collection of variables is never sparse.
    """
    n_sparse = sum(f.sparse for f in feats)
    has_strings = any(f.is_string for f in feats)
    if has_strings:
        return False
    # max(..., 1) guards against division by zero for an empty `feats`
    return n_sparse / max(len(feats), 1) > 2/3

# check whether X, Y or metas should be sparse
self.sparse_X = should_be_sparse(destination.attributes)
self.sparse_Y = should_be_sparse(destination.class_vars)
self.sparse_metas = should_be_sparse(destination.metas)


def filter_visible(feats):
"""
Expand Down
112 changes: 82 additions & 30 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
Domain, Variable, Storage, StringVariable, Unknown, Value, Instance,
ContinuousVariable, DiscreteVariable, MISSING_VALUES
)
from Orange.data.util import SharedComputeValue, vstack, hstack
from Orange.data.util import SharedComputeValue, vstack, hstack, assure_array_dense, assure_array_sparse, \
assure_column_dense, assure_column_sparse
from Orange.statistics.util import bincount, countnans, contingency, \
stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
sparse_implicit_zero_weights
Expand Down Expand Up @@ -280,44 +281,38 @@ def from_table(cls, domain, source, row_indices=...):

global _conversion_cache

def get_columns(row_indices, src_cols, n_rows, dtype=np.float64,
is_sparse=False):

def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False):
if not len(src_cols):
if is_sparse:
return sp.csr_matrix((n_rows, 0), dtype=source.X.dtype)
else:
return np.zeros((n_rows, 0), dtype=source.X.dtype)

# match density for subarrays
match_density = assure_array_sparse if is_sparse else assure_array_dense
n_src_attrs = len(source.domain.attributes)
if all(isinstance(x, Integral) and 0 <= x < n_src_attrs
for x in src_cols):
return _subarray(source.X, row_indices, src_cols)
return match_density(_subarray(source.X, row_indices, src_cols))
if all(isinstance(x, Integral) and x < 0 for x in src_cols):
arr = _subarray(source.metas, row_indices,
[-1 - x for x in src_cols])
arr = match_density(_subarray(source.metas, row_indices,
[-1 - x for x in src_cols]))
if arr.dtype != dtype:
return arr.astype(dtype)
return arr
if all(isinstance(x, Integral) and x >= n_src_attrs
for x in src_cols):
return _subarray(source._Y, row_indices,
[x - n_src_attrs for x in src_cols])
return match_density(_subarray(
source._Y, row_indices,
[x - n_src_attrs for x in src_cols]))

# initialize final array & set `match_density` for columns
if is_sparse:
a = sp.dok_matrix((n_rows, len(src_cols)), dtype=dtype)
match_density = assure_column_sparse
else:
a = np.empty((n_rows, len(src_cols)), dtype=dtype)

def match_type(x):
""" Assure that matrix and column are both dense or sparse. """
if is_sparse == sp.issparse(x):
return x
elif is_sparse:
x = np.asarray(x)
return sp.csc_matrix(x.reshape(-1, 1).astype(np.float))
else:
return np.ravel(x.toarray())
match_density = assure_column_dense

shared_cache = _conversion_cache
for i, col in enumerate(src_cols):
Expand All @@ -330,22 +325,22 @@ def match_type(x):
col.compute_shared(source)
shared = shared_cache[id(col.compute_shared), id(source)]
if row_indices is not ...:
a[:, i] = match_type(
a[:, i] = match_density(
col(source, shared_data=shared)[row_indices])
else:
a[:, i] = match_type(
a[:, i] = match_density(
col(source, shared_data=shared))
else:
if row_indices is not ...:
a[:, i] = match_type(col(source)[row_indices])
a[:, i] = match_density(col(source)[row_indices])
else:
a[:, i] = match_type(col(source))
a[:, i] = match_density(col(source))
elif col < 0:
a[:, i] = match_type(source.metas[row_indices, -1 - col])
a[:, i] = match_density(source.metas[row_indices, -1 - col])
elif col < n_src_attrs:
a[:, i] = match_type(source.X[row_indices, col])
a[:, i] = match_density(source.X[row_indices, col])
else:
a[:, i] = match_type(
a[:, i] = match_density(
source._Y[row_indices, col - n_src_attrs])

if is_sparse:
Expand All @@ -366,6 +361,8 @@ def match_type(x):
table = cls.from_table_rows(source, row_indices)
# assure resulting domain is the instance passed on input
table.domain = domain
# since sparse flags are not considered when checking for domain equality, fix manually.
table = assure_domain_conversion_sparsity(table, source)
return table

if isinstance(row_indices, slice):
Expand All @@ -382,18 +379,19 @@ def match_type(x):
self.domain = domain
conversion = domain.get_conversion(source.domain)
self.X = get_columns(row_indices, conversion.attributes, n_rows,
is_sparse=sp.issparse(source.X))
is_sparse=conversion.sparse_X)
if self.X.ndim == 1:
self.X = self.X.reshape(-1, len(self.domain.attributes))

self.Y = get_columns(row_indices, conversion.class_vars, n_rows,
is_sparse=sp.issparse(source.Y))
is_sparse=conversion.sparse_Y)

dtype = np.float64
if any(isinstance(var, StringVariable) for var in domain.metas):
dtype = np.object
self.metas = get_columns(row_indices, conversion.metas,
n_rows, dtype,
is_sparse=sp.issparse(source.metas))
is_sparse=conversion.sparse_metas)
if self.metas.ndim == 1:
self.metas = self.metas.reshape(-1, len(self.domain.metas))
if source.has_weights():
Expand Down Expand Up @@ -1651,6 +1649,40 @@ def guessed_var(i, var_name):
self.attributes["old_domain"] = table.domain
return self

def to_sparse(self, sparse_attributes=True, sparse_class=False,
              sparse_metas=False):
    """
    Transform the table into a copy of its domain with variables flagged
    as sparse.

    Args:
        sparse_attributes (bool): flag the domain's attributes as sparse.
        sparse_class (bool): flag the domain's class variables as sparse.
        sparse_metas (bool): flag the domain's metas as sparse.

    Returns:
        The result of `self.transform` on the re-flagged domain copy.
        Parts that are not selected keep whatever flags they had.
    """
    new_domain = self.domain.copy()

    requested = (
        (sparse_attributes, "attributes"),
        (sparse_class, "class_vars"),
        (sparse_metas, "metas"),
    )
    for wanted, part in requested:
        if wanted:
            for var in getattr(new_domain, part):
                var.sparse = True

    return self.transform(new_domain)

def to_dense(self, dense_attributes=True, dense_class=True,
             dense_metas=True):
    """
    Transform the table into a copy of its domain with variables flagged
    as dense (`sparse = False`).

    Args:
        dense_attributes (bool): clear the sparse flag on attributes.
        dense_class (bool): clear the sparse flag on class variables.
        dense_metas (bool): clear the sparse flag on metas.

    Returns:
        The result of `self.transform` on the re-flagged domain copy,
        with `ids` taken from this table.
    """
    new_domain = self.domain.copy()

    requested = (
        (dense_attributes, "attributes"),
        (dense_class, "class_vars"),
        (dense_metas, "metas"),
    )
    for wanted, part in requested:
        if wanted:
            for var in getattr(new_domain, part):
                var.sparse = False

    dense_table = self.transform(new_domain)
    dense_table.ids = self.ids  # preserve indices
    return dense_table


def _check_arrays(*arrays, dtype=None):
checked = []
Expand All @@ -1672,7 +1704,7 @@ def ninstances(array):

if ninstances(array) != shape_1:
raise ValueError("Leading dimension mismatch (%d != %d)"
% (len(array), shape_1))
% (ninstances(array), shape_1))

if sp.issparse(array):
array.data = np.asarray(array.data)
Expand Down Expand Up @@ -1743,3 +1775,23 @@ def _rxc_ix(rows, cols):
else:
r, c = np.ix_(rows, cols)
return np.asarray(r, int), np.asarray(c, int)


def assure_domain_conversion_sparsity(target, source):
    """
    Make the table's matrices follow the sparsity flags of the domain conversion.

    The conversion from the source's domain to the target's domain carries
    `sparse_X`, `sparse_Y` and `sparse_metas` flags; each of the target's
    matrices is converted to sparse or dense storage accordingly.

    Args:
        target (Table): the table to fix; modified in place.
        source (Table): the table whose domain the conversion starts from.

    Returns:
        Table: the same `target` object, with X, Y and metas reassigned.
    """
    conversion = target.domain.get_conversion(source.domain)

    def match_density(array, want_sparse):
        # pick the converter that corresponds to the conversion's flag
        if want_sparse:
            return assure_array_sparse(array)
        return assure_array_dense(array)

    target.X = match_density(target.X, conversion.sparse_X)
    target.Y = match_density(target.Y, conversion.sparse_Y)
    target.metas = match_density(target.metas, conversion.sparse_metas)
    return target
31 changes: 31 additions & 0 deletions Orange/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,34 @@ def hstack(arrays):
return sp.hstack(arrays)
else:
return np.hstack(arrays)


def assure_array_dense(a):
    """
    Return `a` in dense form.

    Scipy sparse matrices are converted with `toarray()`; anything else
    is returned unchanged (the very same object).
    """
    return a.toarray() if sp.issparse(a) else a


def assure_array_sparse(a):
    """
    Return `a` in sparse form.

    Input that is already a Scipy sparse matrix is returned unchanged.
    Anything else is converted to a float64 numpy array and wrapped in a
    `csc_matrix`. Note that one-dimensional input of length n yields a
    matrix of shape (1, n).

    Raises:
        ValueError: if the values cannot be cast to float
            (e.g. non-numeric strings).
    """
    if sp.issparse(a):
        return a
    # `a` can be a plain list, so cast it to np.array first;
    # `a` can come from metas holding strings (dtype=object), so cast to
    # float. `np.float` was only an alias of the builtin float and has been
    # removed from NumPy, hence the explicit np.float64.
    dense = np.asarray(a).astype(np.float64)
    return sp.csc_matrix(dense)


def assure_column_sparse(a):
    """
    Return `a` as a sparse column.

    The value is first made sparse with `assure_array_sparse`. When a
    one-dimensional sequence of shape (n, ) goes through the csc_matrix
    constructor, the result has shape (1, n); any single-row result is
    therefore transposed so that it becomes a column.
    """
    column = assure_array_sparse(a)
    n_rows = column.shape[0]
    return column.T if n_rows == 1 else column


def assure_column_dense(a):
    """
    Return `a` as a flat dense column.

    The value is densified with `assure_array_dense` and then flattened,
    because column assignments need shape (n,) rather than (n, 1).
    """
    dense = assure_array_dense(a)
    flat = np.ravel(dense)
    return flat
2 changes: 1 addition & 1 deletion Orange/preprocess/discretize.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import scipy.sparse as sp

from Orange.data import DiscreteVariable, Domain, Table
from Orange.data import DiscreteVariable, Domain
from Orange.data.sql.table import SqlTable
from Orange.preprocess.util import _RefuseDataInConstructor
from Orange.statistics import distribution, contingency
Expand Down
2 changes: 1 addition & 1 deletion Orange/preprocess/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def __call__(self, data):
column = np.array([float(data[self.variable])])
else:
column = np.array(data.get_column_view(self.variable)[0],
copy=True)
copy=True)

mask = np.isnan(column)
if not np.any(mask):
Expand Down
47 changes: 42 additions & 5 deletions Orange/tests/test_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
import warnings
from time import time
from numbers import Real
from itertools import starmap
from itertools import starmap, chain
import unittest
import pickle

import numpy as np
from numpy.testing import assert_array_equal

from Orange.data import (
ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable,
ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable,
Variable, Domain, Table, DomainConversion)
from Orange.data.domain import filter_visible
from Orange.preprocess import Continuize, Impute
Expand Down Expand Up @@ -165,7 +165,8 @@ def test_from_numpy_values(self):
(0, 2, DiscreteVariable),
(18, 23, ContinuousVariable)]:
n_rows, n_cols, = aran_max - aran_min, 1
d = Domain.from_numpy(np.zeros((1, 1)), np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
d = Domain.from_numpy(np.zeros((1, 1)),
np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
self.assertTrue(d.anonymous)
self.assertIsInstance(d.class_var, vartype)
if isinstance(vartype, DiscreteVariable):
Expand Down Expand Up @@ -402,14 +403,14 @@ def test_conversion(self):
assert_array_equal(y, np.array([0]))
metas_exp = [gender.Unknown, education.Unknown, ssn.Unknown]

def eq(a, b):
def equal(a, b):
if isinstance(a, Real) and isinstance(b, Real) and \
np.isnan(a) and np.isnan(b):
return True
else:
return a == b

self.assertTrue(all(starmap(eq, zip(metas, metas_exp))))
self.assertTrue(all(starmap(equal, zip(metas, metas_exp))))

x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"])
assert_array_equal(x, np.array([42, 13]))
Expand Down Expand Up @@ -502,6 +503,42 @@ def test_copy(self):
self.assertEqual(domain[age].number_of_decimals, 5)
self.assertEqual(new_domain[age].number_of_decimals, 10)

def test_domain_conversion_sparsity(self):
    # DomainConversion should derive sparse_X / sparse_Y / sparse_metas
    # from the sparse flags of the destination's variables.
    destination = Domain(
        attributes=[
            ContinuousVariable(name='a'),
            ContinuousVariable(name='b'),
            ContinuousVariable(name='c'),
        ],
        class_vars=[DiscreteVariable('d', values=['e'])],
        metas=[StringVariable('f')]
    )

    def check_flags(expect_x, expect_y, expect_metas):
        # a fresh conversion from an empty source domain for every check
        source = Domain(attributes=[])
        conversion = DomainConversion(source, destination)
        observed = (conversion.sparse_X,
                    conversion.sparse_Y,
                    conversion.sparse_metas)
        expected = (expect_x, expect_y, expect_metas)
        for flag, wanted in zip(observed, expected):
            check = self.assertTrue if wanted else self.assertFalse
            check(flag)

    # all dense
    check_flags(False, False, False)

    # set destination attributes as sparse
    for var in destination.attributes:
        var.sparse = True
    check_flags(True, False, False)

    # set all destination variables as sparse; metas stay dense
    # because they contain a string variable
    for var in chain(destination.variables, destination.metas):
        var.sparse = True
    check_flags(True, True, False)


class TestDomainFilter(unittest.TestCase):
def setUp(self):
Expand Down
6 changes: 3 additions & 3 deletions Orange/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,11 @@ def test_normalize_transform_by_span_zero_class(self):
def test_normalize_sparse(self):
domain = Domain([ContinuousVariable(str(i)) for i in range(3)])
# pylint: disable=bad-whitespace
X = sp.csr_matrix(np.array([
X = np.array([
[0, -1, -2],
[0, 1, 2],
]))
data = Table.from_numpy(domain, X)
])
data = Table.from_numpy(domain, X).to_sparse()

# pylint: disable=bad-whitespace
solution = sp.csr_matrix(np.array([
Expand Down
Loading