Merge pull request #2341 from nikicc/sparsity-from-domain-conversion

[ENH] Improved Sparsity Handling
biolab · Nov 20, 2017 · 301d13a · 301d13a
2 parents d7252f3 + 4bf6a20
commit 301d13a
Show file tree

Hide file tree

Showing 20 changed files with 298 additions and 96 deletions.
diff --git a/Orange/data/domain.py b/Orange/data/domain.py
@@ -44,6 +44,18 @@ class DomainConversion:
     .. attribute:: metas
 
         Indices for meta attributes
+
+    .. attribute:: sparse_X
+
+        Flag whether the resulting X matrix should be sparse.
+
+    .. attribute:: sparse_Y
+
+        Flag whether the resulting Y matrix should be sparse.
+
+    .. attribute:: sparse_metas
+
+        Flag whether the resulting metas matrix should be sparse.
     """
 
     def __init__(self, source, destination):
@@ -63,6 +75,21 @@ def __init__(self, source, destination):
             source.index(var) if var in source
             else var.compute_value for var in destination.metas]
 
+        def should_be_sparse(feats):
+            """
+            Decide whether the matrix holding `feats` should be sparse.
+
+            Sparse storage is suggested only when strictly more than 2/3 of
+            the columns are flagged as sparse and none of them is a string
+            column, since Scipy's sparse matrices don't support dtype=object.
+            An empty collection of columns is never suggested as sparse.
+            """
+            n_flagged = sum(f.sparse for f in feats)
+            has_strings = any(f.is_string for f in feats)
+            if has_strings:
+                return False
+            # max(..., 1) guards against division by zero for empty `feats`
+            return n_flagged / max(len(feats), 1) > 2/3
+
+        # check whether X, Y or metas should be sparse
+        self.sparse_X = should_be_sparse(destination.attributes)
+        self.sparse_Y = should_be_sparse(destination.class_vars)
+        self.sparse_metas = should_be_sparse(destination.metas)
+
 
 def filter_visible(feats):
     """

diff --git a/Orange/data/table.py b/Orange/data/table.py
@@ -17,7 +17,8 @@
     Domain, Variable, Storage, StringVariable, Unknown, Value, Instance,
     ContinuousVariable, DiscreteVariable, MISSING_VALUES
 )
-from Orange.data.util import SharedComputeValue, vstack, hstack
+from Orange.data.util import SharedComputeValue, vstack, hstack, assure_array_dense, assure_array_sparse, \
+    assure_column_dense, assure_column_sparse
 from Orange.statistics.util import bincount, countnans, contingency, \
     stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
     sparse_implicit_zero_weights
@@ -280,44 +281,38 @@ def from_table(cls, domain, source, row_indices=...):
 
         global _conversion_cache
 
-        def get_columns(row_indices, src_cols, n_rows, dtype=np.float64,
-                        is_sparse=False):
-
+        def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False):
             if not len(src_cols):
                 if is_sparse:
                     return sp.csr_matrix((n_rows, 0), dtype=source.X.dtype)
                 else:
                     return np.zeros((n_rows, 0), dtype=source.X.dtype)
 
+            # match density for subarrays
+            match_density = assure_array_sparse if is_sparse else assure_array_dense
             n_src_attrs = len(source.domain.attributes)
             if all(isinstance(x, Integral) and 0 <= x < n_src_attrs
                    for x in src_cols):
-                return _subarray(source.X, row_indices, src_cols)
+                return match_density(_subarray(source.X, row_indices, src_cols))
             if all(isinstance(x, Integral) and x < 0 for x in src_cols):
-                arr = _subarray(source.metas, row_indices,
-                                [-1 - x for x in src_cols])
+                arr = match_density(_subarray(source.metas, row_indices,
+                                            [-1 - x for x in src_cols]))
                 if arr.dtype != dtype:
                     return arr.astype(dtype)
                 return arr
             if all(isinstance(x, Integral) and x >= n_src_attrs
                    for x in src_cols):
-                return _subarray(source._Y, row_indices,
-                                 [x - n_src_attrs for x in src_cols])
+                return match_density(_subarray(
+                    source._Y, row_indices,
+                    [x - n_src_attrs for x in src_cols]))
 
+            # initialize final array & set `match_density` for columns
             if is_sparse:
                 a = sp.dok_matrix((n_rows, len(src_cols)), dtype=dtype)
+                match_density = assure_column_sparse
             else:
                 a = np.empty((n_rows, len(src_cols)), dtype=dtype)
-
-            def match_type(x):
-                """ Assure that matrix and column are both dense or sparse. """
-                if is_sparse == sp.issparse(x):
-                    return x
-                elif is_sparse:
-                    x = np.asarray(x)
-                    return sp.csc_matrix(x.reshape(-1, 1).astype(np.float))
-                else:
-                    return np.ravel(x.toarray())
+                match_density = assure_column_dense
 
             shared_cache = _conversion_cache
             for i, col in enumerate(src_cols):
@@ -330,22 +325,22 @@ def match_type(x):
                                 col.compute_shared(source)
                         shared = shared_cache[id(col.compute_shared), id(source)]
                         if row_indices is not ...:
-                            a[:, i] = match_type(
+                            a[:, i] = match_density(
                                 col(source, shared_data=shared)[row_indices])
                         else:
-                            a[:, i] = match_type(
+                            a[:, i] = match_density(
                                 col(source, shared_data=shared))
                     else:
                         if row_indices is not ...:
-                            a[:, i] = match_type(col(source)[row_indices])
+                            a[:, i] = match_density(col(source)[row_indices])
                         else:
-                            a[:, i] = match_type(col(source))
+                            a[:, i] = match_density(col(source))
                 elif col < 0:
-                    a[:, i] = match_type(source.metas[row_indices, -1 - col])
+                    a[:, i] = match_density(source.metas[row_indices, -1 - col])
                 elif col < n_src_attrs:
-                    a[:, i] = match_type(source.X[row_indices, col])
+                    a[:, i] = match_density(source.X[row_indices, col])
                 else:
-                    a[:, i] = match_type(
+                    a[:, i] = match_density(
                         source._Y[row_indices, col - n_src_attrs])
 
             if is_sparse:
@@ -366,6 +361,8 @@ def match_type(x):
                     table = cls.from_table_rows(source, row_indices)
                     # assure resulting domain is the instance passed on input
                     table.domain = domain
+                    # since sparse flags are not considered when checking for domain equality, fix manually.
+                    table = assure_domain_conversion_sparsity(table, source)
                     return table
 
                 if isinstance(row_indices, slice):
@@ -382,18 +379,19 @@ def match_type(x):
                 self.domain = domain
                 conversion = domain.get_conversion(source.domain)
                 self.X = get_columns(row_indices, conversion.attributes, n_rows,
-                                     is_sparse=sp.issparse(source.X))
+                                     is_sparse=conversion.sparse_X)
                 if self.X.ndim == 1:
                     self.X = self.X.reshape(-1, len(self.domain.attributes))
+
                 self.Y = get_columns(row_indices, conversion.class_vars, n_rows,
-                                     is_sparse=sp.issparse(source.Y))
+                                     is_sparse=conversion.sparse_Y)
 
                 dtype = np.float64
                 if any(isinstance(var, StringVariable) for var in domain.metas):
                     dtype = np.object
                 self.metas = get_columns(row_indices, conversion.metas,
                                          n_rows, dtype,
-                                         is_sparse=sp.issparse(source.metas))
+                                         is_sparse=conversion.sparse_metas)
                 if self.metas.ndim == 1:
                     self.metas = self.metas.reshape(-1, len(self.domain.metas))
                 if source.has_weights():
@@ -1651,6 +1649,40 @@ def guessed_var(i, var_name):
         self.attributes["old_domain"] = table.domain
         return self
 
+    def to_sparse(self, sparse_attributes=True, sparse_class=False,
+                  sparse_metas=False):
+        """
+        Transform the table into a copy of its domain with sparse-flagged parts.
+
+        Args:
+            sparse_attributes (bool): flag all attributes as sparse.
+            sparse_class (bool): flag all class variables as sparse.
+            sparse_metas (bool): flag all meta attributes as sparse.
+
+        Returns:
+            The result of `self.transform` called with the copied domain in
+            which the variables of every selected part have `sparse = True`.
+        """
+        new_domain = self.domain.copy()
+
+        selected_parts = (
+            (sparse_attributes, "attributes"),
+            (sparse_class, "class_vars"),
+            (sparse_metas, "metas"),
+        )
+        for wanted, part in selected_parts:
+            if not wanted:
+                continue
+            # only the copied domain's variables are touched
+            for feature in getattr(new_domain, part):
+                feature.sparse = True
+        return self.transform(new_domain)
+
+    def to_dense(self, dense_attributes=True, dense_class=True,
+                 dense_metas=True):
+        """
+        Transform the table into a copy of its domain with dense-flagged parts.
+
+        Args:
+            dense_attributes (bool): flag all attributes as not sparse.
+            dense_class (bool): flag all class variables as not sparse.
+            dense_metas (bool): flag all meta attributes as not sparse.
+
+        Returns:
+            The result of `self.transform` called with the copied domain in
+            which the variables of every selected part have `sparse = False`;
+            its `ids` are set to this table's `ids`.
+        """
+        new_domain = self.domain.copy()
+
+        selected_parts = (
+            (dense_attributes, "attributes"),
+            (dense_class, "class_vars"),
+            (dense_metas, "metas"),
+        )
+        for wanted, part in selected_parts:
+            if not wanted:
+                continue
+            # only the copied domain's variables are touched
+            for feature in getattr(new_domain, part):
+                feature.sparse = False
+
+        dense_table = self.transform(new_domain)
+        dense_table.ids = self.ids    # preserve indices
+        return dense_table
+
 
 def _check_arrays(*arrays, dtype=None):
     checked = []
@@ -1672,7 +1704,7 @@ def ninstances(array):
 
         if ninstances(array) != shape_1:
             raise ValueError("Leading dimension mismatch (%d != %d)"
-                             % (len(array), shape_1))
+                             % (ninstances(array), shape_1))
 
         if sp.issparse(array):
             array.data = np.asarray(array.data)
@@ -1743,3 +1775,23 @@ def _rxc_ix(rows, cols):
     else:
         r, c = np.ix_(rows, cols)
         return np.asarray(r, int), np.asarray(c, int)
+
+
+def assure_domain_conversion_sparsity(target, source):
+    """
+    Make the table's matrices follow the sparsity suggested by domain conversion.
+
+    Args:
+        target (Table): the table whose X, Y and metas are adjusted in place.
+        source (Table): the table whose domain is the conversion's source.
+
+    Returns:
+        Table: `target` itself, with X, Y and metas each made sparse or dense
+            according to the `sparse_X`, `sparse_Y` and `sparse_metas` flags of
+            the conversion from the source domain to the target domain.
+    """
+    conversion = target.domain.get_conversion(source.domain)
+
+    def density_fn(want_sparse):
+        # pick the converter that yields the requested density
+        return assure_array_sparse if want_sparse else assure_array_dense
+
+    target.X = density_fn(conversion.sparse_X)(target.X)
+    target.Y = density_fn(conversion.sparse_Y)(target.Y)
+    target.metas = density_fn(conversion.sparse_metas)(target.metas)
+    return target
diff --git a/Orange/data/util.py b/Orange/data/util.py
@@ -89,3 +89,34 @@ def hstack(arrays):
         return sp.hstack(arrays)
     else:
         return np.hstack(arrays)
+
+
+def assure_array_dense(a):
+    """
+    Return `a` in dense form.
+
+    A scipy sparse matrix is converted with `toarray()`; any other input is
+    returned unchanged (the very same object).
+    """
+    return a.toarray() if sp.issparse(a) else a
+
+
+def assure_array_sparse(a):
+    """
+    Return `a` in sparse form.
+
+    A scipy sparse matrix is returned unchanged (the very same object); any
+    other input is cast to a float array and wrapped in a `csc_matrix`.
+    Note that a 1-D input of shape (n,) results in a matrix of shape (1, n).
+    """
+    if sp.issparse(a):
+        return a
+    # since `a` can be a list, cast to np.array
+    # since `a` can come from metas with string, cast to float
+    # (builtin `float` is used because the `np.float` alias was removed
+    # from NumPy; both denote the same 64-bit float dtype)
+    return sp.csc_matrix(np.asarray(a).astype(float))
+
+
+def assure_column_sparse(a):
+    """
+    Return `a` as a sparse column.
+
+    The input is first made sparse with `assure_array_sparse`; a result with
+    a single row is then transposed.
+    """
+    column = assure_array_sparse(a)
+    # if `a` of shape (n, ) is passed to csc_matrix constructor,
+    # the resulting matrix is of shape (1, n) and hence we
+    # need to transpose it to make it a column
+    return column.T if column.shape[0] == 1 else column
+
+
+def assure_column_dense(a):
+    """
+    Return `a` as a flat dense array.
+
+    The input is first made dense with `assure_array_dense` and then
+    flattened with `np.ravel`.
+    """
+    # column assignments must be of shape (n,) and not (n, 1)
+    return np.ravel(assure_array_dense(a))
diff --git a/Orange/preprocess/discretize.py b/Orange/preprocess/discretize.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy.sparse as sp
 
-from Orange.data import DiscreteVariable, Domain, Table
+from Orange.data import DiscreteVariable, Domain
 from Orange.data.sql.table import SqlTable
 from Orange.preprocess.util import _RefuseDataInConstructor
 from Orange.statistics import distribution, contingency

diff --git a/Orange/preprocess/impute.py b/Orange/preprocess/impute.py
@@ -157,7 +157,7 @@ def __call__(self, data):
             column = np.array([float(data[self.variable])])
         else:
             column = np.array(data.get_column_view(self.variable)[0],
-                                 copy=True)
+                              copy=True)
 
         mask = np.isnan(column)
         if not np.any(mask):

diff --git a/Orange/tests/test_domain.py b/Orange/tests/test_domain.py
@@ -3,15 +3,15 @@
 import warnings
 from time import time
 from numbers import Real
-from itertools import starmap
+from itertools import starmap, chain
 import unittest
 import pickle
 
 import numpy as np
 from numpy.testing import assert_array_equal
 
 from Orange.data import (
-    ContinuousVariable, DiscreteVariable,  StringVariable, TimeVariable,
+    ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable,
     Variable, Domain, Table, DomainConversion)
 from Orange.data.domain import filter_visible
 from Orange.preprocess import Continuize, Impute
@@ -165,7 +165,8 @@ def test_from_numpy_values(self):
                                             (0, 2, DiscreteVariable),
                                             (18, 23, ContinuousVariable)]:
             n_rows, n_cols, = aran_max - aran_min, 1
-            d = Domain.from_numpy(np.zeros((1, 1)), np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
+            d = Domain.from_numpy(np.zeros((1, 1)),
+                                  np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
             self.assertTrue(d.anonymous)
             self.assertIsInstance(d.class_var, vartype)
             if isinstance(vartype, DiscreteVariable):
@@ -402,14 +403,14 @@ def test_conversion(self):
         assert_array_equal(y, np.array([0]))
         metas_exp = [gender.Unknown, education.Unknown, ssn.Unknown]
 
-        def eq(a, b):
+        def equal(a, b):
             if isinstance(a, Real) and isinstance(b, Real) and \
                     np.isnan(a) and np.isnan(b):
                 return True
             else:
                 return a == b
 
-        self.assertTrue(all(starmap(eq, zip(metas, metas_exp))))
+        self.assertTrue(all(starmap(equal, zip(metas, metas_exp))))
 
         x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"])
         assert_array_equal(x, np.array([42, 13]))
@@ -502,6 +503,42 @@ def test_copy(self):
         self.assertEqual(domain[age].number_of_decimals, 5)
         self.assertEqual(new_domain[age].number_of_decimals, 10)
 
+    def test_domain_conversion_sparsity(self):
+        # Check that DomainConversion derives its sparse_X / sparse_Y /
+        # sparse_metas flags from the `sparse` flags of the destination
+        # domain's variables.
+        destination = Domain(
+            attributes=[
+                ContinuousVariable(name='a'),
+                ContinuousVariable(name='b'),
+                ContinuousVariable(name='c'),
+            ],
+            class_vars=[DiscreteVariable('d', values=['e'])],
+            metas=[StringVariable('f')]
+        )
+
+        # before any sparse flag is set, all three parts are expected dense
+        source = Domain(attributes=[])
+        conversion = DomainConversion(source, destination)
+        self.assertFalse(conversion.sparse_X)
+        self.assertFalse(conversion.sparse_Y)
+        self.assertFalse(conversion.sparse_metas)
+
+        # set destination attributes as sparse: only X is expected sparse
+        for a in destination.attributes:
+            a.sparse = True
+        source = Domain(attributes=[])
+        conversion = DomainConversion(source, destination)
+        self.assertTrue(conversion.sparse_X)
+        self.assertFalse(conversion.sparse_Y)
+        self.assertFalse(conversion.sparse_metas)
+
+        # set all destination variables (metas included) as sparse: X and Y
+        # are expected sparse, while metas are still expected dense since
+        # the only meta is a StringVariable
+        for a in chain(destination.variables, destination.metas):
+            a.sparse = True
+        source = Domain(attributes=[])
+        conversion = DomainConversion(source, destination)
+        self.assertTrue(conversion.sparse_X)
+        self.assertTrue(conversion.sparse_Y)
+        self.assertFalse(conversion.sparse_metas)
+
 
 class TestDomainFilter(unittest.TestCase):
     def setUp(self):

diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py
@@ -98,11 +98,11 @@ def test_normalize_transform_by_span_zero_class(self):
     def test_normalize_sparse(self):
         domain = Domain([ContinuousVariable(str(i)) for i in range(3)])
         # pylint: disable=bad-whitespace
-        X = sp.csr_matrix(np.array([
+        X = np.array([
             [0, -1, -2],
             [0,  1,  2],
-        ]))
-        data = Table.from_numpy(domain, X)
+        ])
+        data = Table.from_numpy(domain, X).to_sparse()
 
         # pylint: disable=bad-whitespace
         solution = sp.csr_matrix(np.array([