Skip to content

Commit

Permalink
Table: Add locking
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Jun 11, 2021
1 parent f02f102 commit 408fa64
Show file tree
Hide file tree
Showing 88 changed files with 1,373 additions and 775 deletions.
30 changes: 19 additions & 11 deletions Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,13 @@ def __init__(self, cat_model, cat_features, domain):
self.cat_model = cat_model
self.cat_features = cat_features

def __call__(self, data, ret=Model.Value):
    """
    Predict for `data`, delegating to the parent class' ``__call__``.

    When `data` is a ``Table``, the parent call runs inside
    ``data.force_unlocked(data.X)``; any other input is passed straight
    through to the parent implementation.

    Args:
        data: a ``Table`` or any other input the parent ``__call__`` accepts.
        ret: forwarded unchanged to the parent ``__call__``
            (defaults to ``Model.Value``).

    Returns:
        Whatever the parent ``__call__`` returns for `data` and `ret`.
    """
    # Inputs that are not tables carry no lock to lift: delegate directly.
    if not isinstance(data, Table):
        return super().__call__(data, ret)

    # NOTE(review): presumably the prediction path needs a writable X
    # (e.g. for the conversion done in `predict`) -- confirm.
    with data.force_unlocked(data.X):
        return super().__call__(data, ret)

def predict(self, X):
if self.cat_features:
X = X.astype(str)
Expand Down Expand Up @@ -824,17 +831,18 @@ def __call__(self, data, progress_callback=None):
return m

def fit_storage(self, data: Table):
    """
    Fit the wrapped learner on `data` and return the wrapped model.

    The whole fit runs inside ``data.force_unlocked(data.X)``. Only this
    variant of the body is kept: the earlier one, which did the same work
    without unlocking, returned first and left this code unreachable.

    Args:
        data: table providing ``domain``, ``X``, ``Y`` and, when present,
            the weights ``W``.

    Returns:
        ``self.__returns__(cat_model, cat_features, domain)``, i.e. the
        fitted model together with the indices of the discrete attributes
        and the domain of `data`.
    """
    # NOTE(review): presumably the wrapped `fit` needs a writable X when
    # X is passed through without the `astype` copy below -- confirm.
    with data.force_unlocked(data.X):
        domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
        # Weights are forwarded only if the learner supports them and the
        # table actually has them; otherwise sample_weight stays None.
        if self.supports_weights and data.has_weights():
            W = data.W.reshape(-1)
        # pylint: disable=not-callable
        clf = self.__wraps__(**self.params)
        # Column indices of discrete attributes, passed to `fit` as the
        # categorical features.
        cat_features = [i for i, attr in enumerate(domain.attributes)
                        if attr.is_discrete]
        if cat_features:
            # Convert the whole matrix to strings when any attribute is
            # discrete (`astype` returns a new array).
            X = X.astype(str)
        cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
        return self.__returns__(cat_model, cat_features, domain)

def __getattr__(self, item):
try:
Expand Down
51 changes: 32 additions & 19 deletions Orange/classification/_tree_scorers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cdef extern from "numpy/npy_math.h":
cpdef enum:
NULL_BRANCH = -1

def contingency(double[:] x, int nx, double[:] y, int ny):
def contingency(const double[:] x, int nx, const double[:] y, int ny):
cdef:
np.ndarray[np.uint32_t, ndim=2] cont = np.zeros((ny, nx), dtype=np.uint32)
int n = len(x), yi, xi
Expand All @@ -28,7 +28,8 @@ def contingency(double[:] x, int nx, double[:] y, int ny):
cont[yi, xi] += 1
return cont

def find_threshold_entropy(double[:] x, double[:] y, np.intp_t[:] idx,
def find_threshold_entropy(const double[:] x, const double[:] y,
const np.intp_t[:] idx,
int n_classes, int min_leaf):
"""
Find the threshold for continuous attribute values that maximizes
Expand Down Expand Up @@ -89,8 +90,9 @@ def find_threshold_entropy(double[:] x, double[:] y, np.intp_t[:] idx,
return (class_entro - best_entro) / N / log(2), x[idx[best_idx]]


def find_binarization_entropy(double[:, :] cont, double[:] class_distr,
double[:] val_distr, int min_leaf):
def find_binarization_entropy(const double[:, :] cont,
const double[:] class_distr,
const double[:] val_distr, int min_leaf):
"""
Find the split of discrete values into two groups that optimizes information
gain.
Expand Down Expand Up @@ -187,7 +189,9 @@ def find_binarization_entropy(double[:, :] cont, double[:] class_distr,
return (class_entro - best_entro) / N / log(2), best_mapping


def find_threshold_MSE(double[:] x, double[:] y, np.intp_t[:] idx, int min_leaf):
def find_threshold_MSE(const double[:] x,
const double[:] y,
const np.intp_t[:] idx, int min_leaf):
"""
Find the threshold for continuous attribute values that minimizes MSE.
Expand Down Expand Up @@ -232,7 +236,8 @@ def find_threshold_MSE(double[:] x, double[:] y, np.intp_t[:] idx, int min_leaf)
return (best_inter - (sum * sum) / N) / N, x[idx[best_idx]]


def find_binarization_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
def find_binarization_MSE(const double[:] x,
const double[:] y, int n_values, int min_leaf):
"""
Find the split of discrete values into two groups that minimizes the MSE.
Expand Down Expand Up @@ -315,7 +320,9 @@ def find_binarization_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
return (best_inter - start_inter) / x.shape[0], best_mapping


def compute_grouped_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
def compute_grouped_MSE(const double[:] x,
const double[:] y,
int n_values, int min_leaf):
"""
Compute the MSE decrease of the given split into groups.
Expand Down Expand Up @@ -371,8 +378,10 @@ def compute_grouped_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
return (inter - sum * sum / n) / x.shape[0]


def compute_predictions(double[:, :] X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions(const double[:, :] X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Return the values (distributions, means and variances) stored in the nodes
to which the tree classifies the rows in X.
Expand Down Expand Up @@ -419,8 +428,10 @@ def compute_predictions(double[:, :] X, int[:] code,
return np.asarray(predictions)


def compute_predictions_csr(X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions_csr(X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Same as compute_predictions except for sparse data
"""
Expand All @@ -431,9 +442,9 @@ def compute_predictions_csr(X, int[:] code,
double[: ,:] predictions = np.empty(
(X.shape[0], values.shape[1]), dtype=np.float64)

double[:] data = X.data
np.int32_t[:] indptr = X.indptr
np.int32_t[:] indices = X.indices
const double[:] data = X.data
const np.int32_t[:] indptr = X.indptr
const np.int32_t[:] indices = X.indices
int ind, attr, n_rows

n_rows = X.shape[0]
Expand Down Expand Up @@ -463,8 +474,10 @@ def compute_predictions_csr(X, int[:] code,
predictions[i, j] = values[node_idx, j]
return np.asarray(predictions)

def compute_predictions_csc(X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions_csc(X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Same as compute_predictions except for sparse data
"""
Expand All @@ -475,9 +488,9 @@ def compute_predictions_csc(X, int[:] code,
double[: ,:] predictions = np.empty(
(X.shape[0], values.shape[1]), dtype=np.float64)

double[:] data = X.data
np.int32_t[:] indptr = X.indptr
np.int32_t[:] indices = X.indices
const double[:] data = X.data
const np.int32_t[:] indptr = X.indptr
const np.int32_t[:] indices = X.indices
int ind, attr, n_rows

n_rows = X.shape[0]
Expand Down
2 changes: 1 addition & 1 deletion Orange/classification/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def _score_disc():
cont_entr = np.sum(cont * np.log(cont))
score = (class_entr - attr_entr + cont_entr) / n / np.log(2)
score *= n / len(data) # punishment for missing values
branches = col_x
branches = col_x.copy()
branches[np.isnan(branches)] = -1
if score == 0:
return REJECT_ATTRIBUTE
Expand Down
8 changes: 6 additions & 2 deletions Orange/data/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@ def __init__(self, domain, data=None, id=None):
self._weight = 1
elif isinstance(data, Instance) and data.domain == domain:
self._x = np.array(data._x)
self._y = np.array(data._y)
self._y = np.atleast_1d(np.array(data._y))
self._metas = np.array(data._metas)
self._weight = data._weight
else:
self._x, self._y, self._metas = domain.convert(data)
self._y = np.atleast_1d(self._y)
self._weight = 1

if id is not None:
Expand Down Expand Up @@ -116,7 +117,10 @@ def __getitem__(self, key):
if 0 <= idx < len(self._domain.attributes):
value = self._x[idx]
elif idx >= len(self._domain.attributes):
value = self._y[idx - len(self.domain.attributes)]
if self._y.ndim == 0:
value = self._y
else:
value = self._y[idx - len(self.domain.attributes)]
else:
value = self._metas[-1 - idx]
var = self._domain[idx]
Expand Down
4 changes: 4 additions & 0 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ def vars_from_df(df, role=None, force_nominal=False):
for var, col, expr in zip(Avars, Acols, Aexpr)]).T
XYM.append(A)

# Let the tables share memory with pandas frame
if XYM[1] is not None and XYM[1].ndim == 2 and XYM[1].shape[1] == 1:
XYM[1] = XYM[1][:, 0]

return XYM, Domain(attrs, class_vars, metas)


Expand Down
3 changes: 2 additions & 1 deletion Orange/data/sql/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ def _filter_values(self, f):
return t2

@classmethod
def from_table(cls, domain, source, row_indices=...):
def from_table(cls, domain, source, row_indices=..., *, copy=False):
# pylint: disable=unused-argument
assert row_indices is ...

table = source.copy()
Expand Down
Loading

0 comments on commit 408fa64

Please sign in to comment.