"""AMSGrad and Adam update rules for Lasagne/Theano networks."""
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
from lasagne import utils
from lasagne.updates import get_or_compute_grads


def amsgrad(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
            beta2=0.999, epsilon=1e-8, bias_correction=True):
    """AMSGrad updates (Reddi et al., 2018).

    Identical to Adam except that the denominator uses the running
    elementwise maximum of the second-moment estimate, so the effective
    per-coordinate step size never increases.
    """
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1
    if bias_correction:
        # Fold both bias-correction terms into a single step size.
        a_t = learning_rate*T.sqrt(one-beta2**t)/(one-beta1**t)
    else:
        a_t = learning_rate

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_hat_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        m_t = beta1*m_prev + (one-beta1)*g_t       # first moment (momentum)
        v_t = beta2*v_prev + (one-beta2)*g_t**2    # second moment
        v_hat_t = T.maximum(v_hat_prev, v_t)       # non-decreasing maximum
        step = a_t*m_t/(T.sqrt(v_hat_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[v_hat_prev] = v_hat_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
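

# Usage sketch (an illustrative addition, not part of the original file):
# how `amsgrad` might be dropped into a standard Lasagne training loop in
# place of `lasagne.updates.adam`. The names `network`, `X`, and `y` are
# hypothetical placeholders for an existing model and its symbolic inputs.
#
#   import lasagne
#   prediction = lasagne.layers.get_output(network, X)
#   loss = lasagne.objectives.categorical_crossentropy(prediction, y).mean()
#   params = lasagne.layers.get_all_params(network, trainable=True)
#   train_fn = theano.function([X, y], loss,
#                              updates=amsgrad(loss, params,
#                                              learning_rate=1e-3))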


def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
         beta2=0.999, epsilon=1e-8, bias_correction=True):
    """Adam updates (Kingma & Ba, 2015) with optional bias correction."""
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1
    if bias_correction:
        # Fold both bias-correction terms into a single step size.
        a_t = learning_rate*T.sqrt(one-beta2**t)/(one-beta1**t)
    else:
        a_t = learning_rate

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1*m_prev + (one-beta1)*g_t       # first moment (momentum)
        v_t = beta2*v_prev + (one-beta2)*g_t**2    # second moment
        step = a_t*m_t/(T.sqrt(v_t) + epsilon)     # no running maximum here

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
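

if __name__ == "__main__":
    # Minimal smoke test (an illustrative addition, not part of the original
    # file): fit a linear-regression weight vector with each update rule and
    # print the loss before and after training. Every name below (`X_val`,
    # `w`, ...) is a hypothetical placeholder, not an assumed project API.
    rng = np.random.RandomState(0)
    X_val = rng.randn(100, 5).astype(theano.config.floatX)
    w_true = rng.randn(5).astype(theano.config.floatX)
    y_val = X_val.dot(w_true)

    X = T.matrix('X')
    y = T.vector('y')
    w = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='w')
    loss = T.mean((T.dot(X, w) - y) ** 2)

    for name, opt in (('amsgrad', amsgrad), ('adam', adam)):
        w.set_value(np.zeros(5, dtype=theano.config.floatX))
        train = theano.function([X, y], loss, updates=opt(loss, [w]))
        history = [train(X_val, y_val) for _ in range(200)]
        print('%s: %.4f -> %.4f' % (name, history[0], history[-1]))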