TargetEncoder.py (forked from XuegongLab/neoguider)
#!/usr/bin/env python
import collections
import logging

import numpy as np
import scipy.stats
class TargetEncoder:
    """ Target-encode categorical features as smoothed positive-class probabilities,
        with optional binomial resampling at training time """
    def __init__(self, excluded_cols=(), pseudocount=0.5, random_state=0):
        """ Initialize with a smoothing pseudocount and a seed for the resampling RNG """
        self.excluded_cols = excluded_cols  # kept for API compatibility, currently unused
        self.pseudocount = pseudocount
        self.random_state = random_state
        self.n_neg_training_examples = -1
        self.n_pos_training_examples = -1
        self.colidxto01counter = None
def set_random_state(self, random_state):
self.random_state = random_state
    @staticmethod
    def _abbrevshow(alist, anum=5):
        """ Abbreviate a long list to its first and last anum elements (for logging) """
        if len(alist) <= anum*2: return [alist]
        else: return [alist[0:anum], alist[(len(alist)-anum):len(alist)]]
def _assert_input(self, X, y):
for label in y: assert label in [0, 1], F'Label {label} is not binary'
X0 = self.X0 = X[y==0,:]
X1 = self.X1 = X[y==1,:]
        assert X.shape[1] == X0.shape[1] == X1.shape[1], 'Internal error: X, X0, and X1 must have the same number of columns'
assert X0.shape[0] > 1, 'At least two negative examples should be provided'
assert X1.shape[0] > 1, 'At least two positive examples should be provided'
assert X1.shape[0] < X0.shape[0], 'The number of positive examples should be less than the number of negative examples'
    def fit_transform(self, X1, y1, is_training=True):
        """
        X1: categorical features
        y1: binary response
        return: numerically encoded features using the WOE (weight of evidence) encoding
            (https://letsdatascience.com/target-encoding/) with resampling
        Note: this method (which uses resampling) is supposed to outperform
            sklearn.preprocessing.TargetEncoder.fit_transform (which uses cross-fitting)
        """
self.fit(X1, y1)
return self.transform(X1, is_training=is_training)
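    # Worked example (illustration only, using the toy data from test_1 below):
    # with y = [1,1,0,1,1,0,0,0,0,0], category 'B' occurs 3 times among positives and
    # once among negatives; with pseudocount 0.5, ppc = 0.5 and npc = 0.5 * 6/4 = 0.75,
    # so its inference-time encoding is (3 + 0.5) / (3 + 1 + 0.5 + 0.75) = 3.5 / 5.25 ~ 0.667.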
    def fit(self, X1, y1):
        X = np.array(X1)
        y = np.array(y1)
        self._assert_input(X, y)
        X0 = self.X0  # negative examples, set by _assert_input
        X1 = self.X1  # positive examples, set by _assert_input
        self.n_neg_training_examples = X0.shape[0]
        self.n_pos_training_examples = X1.shape[0]
        # For each column, count how often each category occurs among negatives (x0counter)
        # and among positives (x1counter)
        self.colidxto01counter = []
        for colidx in range(X.shape[1]):
            x0counter = collections.Counter(X0[:, colidx])
            x1counter = collections.Counter(X1[:, colidx])
            self.colidxto01counter.append((x0counter, x1counter))
        return self
    def transform(self, X1, is_training=False):
        X = np.array(X1)
        assert X.shape[1] == len(self.colidxto01counter), (
            F'{X} and {self.colidxto01counter} have different numbers of features '
            F'({X.shape[1]} != {len(self.colidxto01counter)})')
        running_rand = np.random.default_rng(self.random_state)
        # The pseudocounts are split so that the implied prior matches the training class
        # ratio: npc/ppc == n_neg/n_pos, hence an unseen category is encoded as
        # n_pos / (n_pos + n_neg)
        npc = self.pseudocount * self.n_neg_training_examples / self.n_pos_training_examples
        ppc = self.pseudocount
        n_training_examples = (self.n_neg_training_examples + self.n_pos_training_examples)
        ret = []
        for colidx in range(X.shape[1]):
            x0counter, x1counter = self.colidxto01counter[colidx]
            # Smoothed per-category probability of the positive class
            ps = [float(x1counter[k]+ppc)/(x0counter[k]+x1counter[k]+ppc+npc) for k in X[:,colidx]]
            if is_training:
                # At training time, resample each probability through a binomial draw to add
                # noise, regularizing the encoding in lieu of sklearn-style cross-fitting
                binom_rvs = scipy.stats.binom.rvs(n=n_training_examples, p=ps, random_state=running_rand)
                targets = [(rv/float(n_training_examples)) for rv in binom_rvs]
            else:
                targets = ps
            ret.append(targets)
        return np.array(ret).transpose()
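    # Back-of-the-envelope check (illustration only): a training-time encoding is
    # rv / n with rv ~ Binomial(n, p), so it has mean p and standard deviation
    # sqrt(p * (1 - p) / n); larger training sets therefore yield less noisy encodings.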
def test_1():
    logging.basicConfig(format='TargetEncoder %(asctime)s - %(message)s', level=logging.DEBUG)
X = np.array([
[ 1, 'A'],
[ 1, 'B'],
[ 1, 'B'],
[ 1, 'B'],
[ 1, 'B'],
[ 2, 'C'],
[ 2, 'C'],
[ 2, 'C'],
[ 2, 'C'],
[ 2, 'D']
])
y = np.array([1,1,0,1,1,0,0,0,0,0])
encoder = TargetEncoder()
X2 = encoder.fit_transform(X, y)
logging.info(F'Encoded={X2}')
encoder.set_random_state(997)
X3 = encoder.fit_transform(X, y)
logging.info(F'Encoded={X3}')
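def test_2():
    """ A minimal usage sketch (not from the original repo; the toy data here is
        illustrative only): the same inputs get a noisy, resampled encoding at
        training time and a deterministic, smoothed probability at inference time """
    logging.basicConfig(format='TargetEncoder %(asctime)s - %(message)s', level=logging.DEBUG)
    X = np.array([['A'], ['A'], ['B'], ['B'], ['B']])
    y = np.array([1, 1, 0, 0, 0])
    encoder = TargetEncoder(random_state=42)
    encoder.fit(X, y)
    train_enc = encoder.transform(X, is_training=True)   # binomially resampled
    infer_enc = encoder.transform(X, is_training=False)  # deterministic smoothed probabilities
    logging.info(F'Training-time encoding={train_enc}')
    logging.info(F'Inference-time encoding={infer_enc}')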
if __name__ == '__main__':
    test_1()
    test_2()