From 120f92e2bdccac4eb31b4fbe346b692b773a6d2a Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 25 May 2017 12:44:47 +0200 Subject: [PATCH] Table.from_table: Obey is_sparse when returning subarrays When we return subarrays, the flag `is_sparse` wasn't considered, but we simply returned the subarray in its original format. Also, make sure subarrays aren't flattened to 1d; flattening is required only for single columns. --- Orange/data/table.py | 56 +++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index cbbc972d8bc..f94a1e74bb4 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -268,6 +268,27 @@ def from_table(cls, domain, source, row_indices=...): def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False): + def match_type(x, force_1d=False): + """ Assure that matrix and column are both dense or sparse. + + Args: + x (np.ndarray, scipy.sparse): data + force_1d (bool): If set, flatten resulting array to 1d. + + Returns: + array of correct density. 
+ """ + if is_sparse == sp.issparse(x): + return x + if is_sparse: + x = np.asarray(x) + return sp.csc_matrix(x.reshape(-1, 1).astype(np.float)) + x = x.toarray() + if force_1d: + x = np.ravel(x) + return x + + match_type_1d = lambda x: match_type(x, force_1d=True) if not len(src_cols): if is_sparse: @@ -278,33 +299,24 @@ def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, n_src_attrs = len(source.domain.attributes) if all(isinstance(x, Integral) and 0 <= x < n_src_attrs for x in src_cols): - return _subarray(source.X, row_indices, src_cols) + return match_type(_subarray(source.X, row_indices, src_cols)) if all(isinstance(x, Integral) and x < 0 for x in src_cols): - arr = _subarray(source.metas, row_indices, - [-1 - x for x in src_cols]) + arr = match_type(_subarray(source.metas, row_indices, + [-1 - x for x in src_cols])) if arr.dtype != dtype: return arr.astype(dtype) return arr if all(isinstance(x, Integral) and x >= n_src_attrs for x in src_cols): - return _subarray(source._Y, row_indices, - [x - n_src_attrs for x in src_cols]) + return match_type(_subarray( + source._Y, row_indices, + [x - n_src_attrs for x in src_cols])) if is_sparse: a = sp.dok_matrix((n_rows, len(src_cols)), dtype=dtype) else: a = np.empty((n_rows, len(src_cols)), dtype=dtype) - def match_type(x): - """ Assure that matrix and column are both dense or sparse. 
""" - if is_sparse == sp.issparse(x): - return x - elif is_sparse: - x = np.asarray(x) - return sp.csc_matrix(x.reshape(-1, 1).astype(np.float)) - else: - return np.ravel(x.toarray()) - shared_cache = _conversion_cache for i, col in enumerate(src_cols): if col is None: @@ -316,22 +328,22 @@ def match_type(x): col.compute_shared(source) shared = shared_cache[id(col.compute_shared), id(source)] if row_indices is not ...: - a[:, i] = match_type( + a[:, i] = match_type_1d( col(source, shared_data=shared)[row_indices]) else: - a[:, i] = match_type( + a[:, i] = match_type_1d( col(source, shared_data=shared)) else: if row_indices is not ...: - a[:, i] = match_type(col(source)[row_indices]) + a[:, i] = match_type_1d(col(source)[row_indices]) else: - a[:, i] = match_type(col(source)) + a[:, i] = match_type_1d(col(source)) elif col < 0: - a[:, i] = match_type(source.metas[row_indices, -1 - col]) + a[:, i] = match_type_1d(source.metas[row_indices, -1 - col]) elif col < n_src_attrs: - a[:, i] = match_type(source.X[row_indices, col]) + a[:, i] = match_type_1d(source.X[row_indices, col]) else: - a[:, i] = match_type( + a[:, i] = match_type_1d( source._Y[row_indices, col - n_src_attrs]) if is_sparse: