Skip to content

Commit

Permalink
Naive Bayes: Ignore existing classes in Laplacian smoothing
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Feb 2, 2019
1 parent 5eba886 commit 325575e
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 28 deletions.
59 changes: 32 additions & 27 deletions Orange/classification/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,23 @@ def fit_storage(self, table):
cont = contingency.get_contingencies(table)
class_freq = np.array(np.diag(
contingency.get_contingency(table, table.domain.class_var)))
class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))
nclss = (class_freq != 0).sum()
if not nclss:
raise ValueError("Data has no defined target values")

# Laplacian smoothing considers only classes that appear in the data,
# in part to avoid cases where the probabilities are affected by empty
# (or completely spurious) classes that appear because of Orange's reuse
# of variables. See GH-2943.
# The elements of class_prob that belong to absent classes are set to zero
# only after their mock non-zero values have been used in the computation
# of log_cont_prob, to prevent division by zero.
class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
log_cont_prob = [np.log(
(np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] +
c.shape[0]) / class_prob[:, None])
for c in cont]
(np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] + nclss)
/ class_prob[:, None])
for c in cont]
class_prob[class_freq == 0] = 0
return NaiveBayesModel(log_cont_prob, class_prob, table.domain)


Expand All @@ -58,35 +70,30 @@ def predict_storage(self, data):
else:
isnan = np.isnan
zeros = np.zeros_like(self.class_prob)
probs = np.atleast_2d(np.exp(
np.log(self.class_prob) +
np.array([
zeros if isnan(ins.x).all() else
sum(attr_prob[:, int(attr_val)]
for attr_val, attr_prob in zip(ins, self.log_cont_prob)
if not isnan(attr_val))
for ins in data])))
probs = self.class_prob * np.exp(np.array([
zeros if isnan(ins.x).all() else
sum(attr_prob[:, int(attr_val)]
for attr_val, attr_prob in zip(ins, self.log_cont_prob)
if not isnan(attr_val))
for ins in data]))
probs /= probs.sum(axis=1)[:, None]
values = probs.argmax(axis=1)
return values, probs

def predict(self, X):
if not self.log_cont_prob:
probs = self._priors(X)
elif sp.issparse(X):
probs = self._sparse_probs(X)
else:
probs = self._dense_probs(X)
probs = np.exp(probs)
probs = np.zeros((X.shape[0], self.class_prob.shape[0]))
if self.log_cont_prob is not None:
if sp.issparse(X):
self._sparse_probs(X, probs)
else:
self._dense_probs(X, probs)
np.exp(probs, probs)
probs *= self.class_prob
probs /= probs.sum(axis=1)[:, None]
values = probs.argmax(axis=1)
return values, probs

def _priors(self, data):
return np.tile(np.log(self.class_prob), (data.shape[0], 1))

def _dense_probs(self, data):
probs = self._priors(data)
def _dense_probs(self, data, probs):
zeros = np.zeros((1, probs.shape[1]))
for col, attr_prob in zip(data.T, self.log_cont_prob):
col = col.copy()
Expand All @@ -96,9 +103,7 @@ def _dense_probs(self, data):
probs += probs0[col]
return probs

def _sparse_probs(self, data):
probs = self._priors(data)

def _sparse_probs(self, data, probs):
n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
log_prob = np.zeros((len(self.log_cont_prob),
n_vals,
Expand Down
105 changes: 104 additions & 1 deletion Orange/tests/test_naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,21 @@ def test_compare_results_of_predict_and_predict_storage(self):

def test_predictions(self):
# sparse=None leaves the helper's arrays unconverted; run both the regular
# scenario and the one in which a class declared in the domain is absent
# from the training data.
self._test_predictions(sparse=None)
self._test_predictions_with_absent_class(sparse=None)

def test_predictions_csr_matrix(self):
# Run both prediction scenarios with sp.csr_matrix as the converter, while
# ignoring PendingDeprecationWarning messages that mention
# "the matrix subclass".
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
self._test_predictions(sparse=sp.csr_matrix)
self._test_predictions_with_absent_class(sparse=sp.csr_matrix)

def test_predictions_csc_matrix(self):
# Run both prediction scenarios with sp.csc_matrix as the converter, while
# ignoring PendingDeprecationWarning messages that mention
# "the matrix subclass".
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
self._test_predictions(sparse=sp.csc_matrix)
self._test_predictions_with_absent_class(sparse=sp.csc_matrix)

def _test_predictions(self, sparse):
x = np.array([
Expand Down Expand Up @@ -205,6 +208,107 @@ def _test_predictions(self, sparse):
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

def _test_predictions_with_absent_class(self, sparse):
"""Empty classes should not affect predictions"""
# `sparse` is either None or a callable (e.g. sp.csr_matrix) that is
# applied to the training and test arrays before they are used.
x = np.array([
[1, 0, 0],
[0, np.nan, 0],
[0, 1, 0],
[0, 0, 0],
[1, 2, 0],
[1, 1, 0],
[1, 2, 0],
[0, 1, 0]])
if sparse is not None:
x = sparse(x)

# The class variable declares four values ("abcd"), but index 1 never
# occurs in y: this is the absent class the test is about.
y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
domain = Domain(
[DiscreteVariable("a", values="ab"),
DiscreteVariable("b", values="abc"),
DiscreteVariable("c", values="a")],
DiscreteVariable("y", values="abcd"))
data = Table.from_numpy(domain, x, y)

model = self.learner(data)
# Expected priors: class counts are 3, 0, 3, 2 over 8 instances and three
# classes occur, so present classes get (count + 1) / (8 + 3) and the
# absent class gets 0.
np.testing.assert_almost_equal(
model.class_prob,
[4/11, 0, 4/11, 3/11]
)
# Multiplying exp(log_cont_prob) by the priors must give an all-zero row
# for the absent class (second row of each expected matrix).
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
[[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
[[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
[[4/11], [0], [4/11], [3/11]])

# All six combinations of the first two attributes; the third is always 0.
test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
# Classifiers reject csc matrices in the base class.
# The naive Bayesian classifier supports them if predict is called
# directly, which we do at the end of this test; until then the test data
# is kept dense in the csc case.
if sparse is not None and sparse is not sp.csc_matrix:
test_x = sparse(test_x)
test_y = np.full((6, ), np.nan)
# The following was computed manually, too
# (one row per test instance, one column per class; the column of the
# absent class is zero throughout)
exp_probs = np.array([
[0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
[0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
[0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
[0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
[0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
[0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
])

# Test the faster algorithm for Table (numpy matrices)
test_data = Table.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test the slower algorithm for non-Table data (iteration in Python)
test_data = NotATable.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test prediction directly on numpy
probs = model(test_x, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_x)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_x, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test prediction on instances
for inst, exp_prob in zip(test_data, exp_probs):
np.testing.assert_almost_equal(
model(inst, ret=model.Probs)[0],
exp_prob)
self.assertEqual(model(inst), np.argmax(exp_prob))
value, prob = model(inst, ret=model.ValueProbs)
np.testing.assert_almost_equal(prob[0], exp_prob)
self.assertEqual(value, np.argmax(exp_prob))

# Test prediction by directly calling predict. This is needed to test
# csc_matrix, but doesn't hurt others
if sparse is sp.csc_matrix:
test_x = sparse(test_x)
values, probs = model.predict(test_x)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

def test_no_attributes(self):
y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
domain = Domain([], DiscreteVariable("y", values="abc"))
Expand All @@ -215,6 +319,5 @@ def test_no_attributes(self):
[[4/11, 4/11, 3/11]] * 5
)


# Run the tests through unittest when this file is executed as a script.
if __name__ == "__main__":
unittest.main()

0 comments on commit 325575e

Please sign in to comment.