
Commit ff11d81

Morvan Zhou authored and committed
update and fix bug
1 parent 0e326ee commit ff11d81

3 files changed: +38 −41 lines changed

experiments/2D_car/DDPG.py

+15 −16
@@ -27,14 +27,14 @@
 np.random.seed(1)
 tf.set_random_seed(1)

-MAX_EPISODES = 225
+MAX_EPISODES = 500
 MAX_EP_STEPS = 600
-LR_A = 1e-3  # learning rate for actor
-LR_C = 1e-3  # learning rate for critic
-GAMMA = 0.95  # reward discount
+LR_A = 1e-4  # learning rate for actor
+LR_C = 1e-4  # learning rate for critic
+GAMMA = 0.9  # reward discount
 REPLACE_ITER_A = 800
 REPLACE_ITER_C = 700
-MEMORY_CAPACITY = 5000
+MEMORY_CAPACITY = 2000
 BATCH_SIZE = 16
 VAR_MIN = 0.1
 RENDER = True
@@ -49,8 +49,6 @@
 # all placeholder for tf
 with tf.name_scope('S'):
     S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
-with tf.name_scope('A'):
-    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
 with tf.name_scope('R'):
     R = tf.placeholder(tf.float32, [None, 1], name='r')
 with tf.name_scope('S_'):
@@ -92,8 +90,8 @@ def _build_net(self, s, scope, trainable):
                 scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
         return scaled_a

-    def learn(self, s, a):   # batch update
-        self.sess.run(self.train_op, feed_dict={S: s, A: a})
+    def learn(self, s):   # batch update
+        self.sess.run(self.train_op, feed_dict={S: s})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -107,12 +105,12 @@ def add_grad_to_graph(self, a_grads):
             self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

         with tf.variable_scope('A_train'):
-            opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE)  # (- learning rate) for ascent policy, div to take mean
+            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
             self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))


 class Critic(object):
-    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
         self.sess = sess
         self.s_dim = state_dim
         self.a_dim = action_dim
@@ -123,7 +121,8 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_
 
         with tf.variable_scope('Critic'):
             # Input (s, a), output q
-            self.q = self._build_net(S, A, 'eval_net', trainable=True)
+            self.a = a
+            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

             # Input (s_, a_), output q_ for q_target
             self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)   # target_q is based on a_ from Actor's target_net
@@ -141,7 +140,7 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_
             self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

         with tf.variable_scope('a_grad'):
-            self.a_grads = tf.gradients(self.q, A)[0]   # tensor of gradients of each sample (None, a_dim)
+            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

     def _build_net(self, s, a, scope, trainable):
         with tf.variable_scope(scope):
@@ -162,7 +161,7 @@ def _build_net(self, s, a, scope, trainable):
         return q

     def learn(self, s, a, r, s_):
-        self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -190,7 +189,7 @@ def sample(self, n):

 # Create actor and critic.
 actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
-critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
 actor.add_grad_to_graph(critic.a_grads)

 M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
@@ -230,7 +229,7 @@ def train():
                 b_s_ = b_M[:, -STATE_DIM:]

                 critic.learn(b_s, b_a, b_r, b_s_)
-                actor.learn(b_s, b_a)
+                actor.learn(b_s)

             s = s_
             ep_step += 1
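The change repeated across all three scripts is structural: the standalone A placeholder is removed, the critic's eval net is built directly on the actor's output tensor (actor.a), and tf.gradients(self.q, a) then gives dQ/da, so actor.learn no longer needs the sampled actions. A minimal sketch of that wiring, assuming TF1.x and using simplified layer sizes and names that are not from these files:

# Sketch only: TF1.x-style graph showing why feeding the actor output into the
# critic removes the need for a separate action placeholder in the actor update.
import tensorflow as tf

STATE_DIM, ACTION_DIM, LR_A = 4, 2, 1e-4

S = tf.placeholder(tf.float32, [None, STATE_DIM], name='s')

with tf.variable_scope('Actor'):
    hidden = tf.layers.dense(S, 32, tf.nn.relu)
    a = tf.layers.dense(hidden, ACTION_DIM, tf.nn.tanh)          # plays the role of actor.a

with tf.variable_scope('Critic'):
    q_in = tf.concat([S, a], axis=1)                             # critic eval net consumes actor.a
    q = tf.layers.dense(tf.layers.dense(q_in, 32, tf.nn.relu), 1)

a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
a_grads = tf.gradients(q, a)[0]                                  # dQ/da, shape (None, ACTION_DIM)
policy_grads = tf.gradients(ys=a, xs=a_params, grad_ys=a_grads)  # chain rule: dQ/dtheta
train_actor = tf.train.RMSPropOptimizer(-LR_A).apply_gradients(zip(policy_grads, a_params))

Critic training can still use stored batch actions by keying the feed_dict on the actor tensor itself (feed_dict={S: b_s, a: b_a, ...}), which is what the new self.a: a entry does; TF1 sessions allow feeding values into non-placeholder tensors.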

experiments/Robot_arm/DDPG.py

+13 −14
@@ -31,10 +31,10 @@
 MAX_EP_STEPS = 200
 LR_A = 1e-4  # learning rate for actor
 LR_C = 1e-4  # learning rate for critic
-GAMMA = 0.999  # reward discount
+GAMMA = 0.9  # reward discount
 REPLACE_ITER_A = 1100
 REPLACE_ITER_C = 1000
-MEMORY_CAPACITY = 10000
+MEMORY_CAPACITY = 5000
 BATCH_SIZE = 16
 VAR_MIN = 0.1
 RENDER = True
@@ -50,8 +50,6 @@
 # all placeholder for tf
 with tf.name_scope('S'):
     S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
-with tf.name_scope('A'):
-    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
 with tf.name_scope('R'):
     R = tf.placeholder(tf.float32, [None, 1], name='r')
 with tf.name_scope('S_'):
@@ -96,8 +94,8 @@ def _build_net(self, s, scope, trainable):
                 scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
         return scaled_a

-    def learn(self, s, a):   # batch update
-        self.sess.run(self.train_op, feed_dict={S: s, A: a})
+    def learn(self, s):   # batch update
+        self.sess.run(self.train_op, feed_dict={S: s})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -111,12 +109,12 @@ def add_grad_to_graph(self, a_grads):
             self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

         with tf.variable_scope('A_train'):
-            opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE)  # (- learning rate) for ascent policy, div to take mean
+            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
             self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))


 class Critic(object):
-    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
         self.sess = sess
         self.s_dim = state_dim
         self.a_dim = action_dim
@@ -127,7 +125,8 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_

         with tf.variable_scope('Critic'):
             # Input (s, a), output q
-            self.q = self._build_net(S, A, 'eval_net', trainable=True)
+            self.a = a
+            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

             # Input (s_, a_), output q_ for q_target
             self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)   # target_q is based on a_ from Actor's target_net
@@ -145,7 +144,7 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_
             self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

         with tf.variable_scope('a_grad'):
-            self.a_grads = tf.gradients(self.q, A)[0]   # tensor of gradients of each sample (None, a_dim)
+            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

     def _build_net(self, s, a, scope, trainable):
         with tf.variable_scope(scope):
@@ -169,7 +168,7 @@ def _build_net(self, s, a, scope, trainable):
         return q

     def learn(self, s, a, r, s_):
-        self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -197,7 +196,7 @@ def sample(self, n):

 # Create actor and critic.
 actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
-critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
 actor.add_grad_to_graph(critic.a_grads)

 M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
@@ -230,15 +229,15 @@ def train():
             M.store_transition(s, a, r, s_)

             if M.pointer > MEMORY_CAPACITY:
-                var = max([var*.99995, VAR_MIN])    # decay the action randomness
+                var = max([var*.9999, VAR_MIN])    # decay the action randomness
                 b_M = M.sample(BATCH_SIZE)
                 b_s = b_M[:, :STATE_DIM]
                 b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
                 b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
                 b_s_ = b_M[:, -STATE_DIM:]

                 critic.learn(b_s, b_a, b_r, b_s_)
-                actor.learn(b_s, b_a)
+                actor.learn(b_s)

             s = s_
             ep_reward += r
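Besides mirroring the placeholder change from the 2D_car script, this file lowers GAMMA from 0.999 to 0.9, halves MEMORY_CAPACITY, and speeds up the exploration-noise decay from .99995 to .9999 per update. A quick check of what the decay change means, assuming the variance starts at 2.0 (a value this hunk does not show):

# Rough estimate of how many learning steps each decay factor needs to bring
# the exploration variance from an assumed start of 2.0 down to VAR_MIN = 0.1.
import math

VAR_MIN, var0 = 0.1, 2.0
steps = lambda decay: math.log(VAR_MIN / var0) / math.log(decay)
print(round(steps(0.99995)))  # ~59913 steps with the old factor
print(round(steps(0.9999)))   # ~29956 steps with the new factor

Under that assumption, exploration anneals roughly twice as fast after this commit.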

experiments/Solve_BipedalWalker/DDPG.py

+10 −11
@@ -39,8 +39,6 @@
 # all placeholder for tf
 with tf.name_scope('S'):
     S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
-with tf.name_scope('A'):
-    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
 with tf.name_scope('R'):
     R = tf.placeholder(tf.float32, [None, 1], name='r')
 with tf.name_scope('S_'):
@@ -82,8 +80,8 @@ def _build_net(self, s, scope, trainable):
                 scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
         return scaled_a

-    def learn(self, s, a):   # batch update
-        self.sess.run(self.train_op, feed_dict={S: s, A: a})
+    def learn(self, s):   # batch update
+        self.sess.run(self.train_op, feed_dict={S: s})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -101,14 +99,14 @@ def add_grad_to_graph(self, a_grads):
             self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

         with tf.variable_scope('A_train'):
-            opt = tf.train.AdamOptimizer(-self.lr/BATCH_SIZE)  # (- learning rate) for ascent policy
+            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
             self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP)


 ############################### Critic ####################################

 class Critic(object):
-    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
         self.sess = sess
         self.s_dim = state_dim
         self.a_dim = action_dim
@@ -119,7 +117,8 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_

         with tf.variable_scope('Critic'):
             # Input (s, a), output q
-            self.q = self._build_net(S, A, 'eval_net', trainable=True)
+            self.a = a
+            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

             # Input (s_, a_), output q_ for q_target
             self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)   # target_q is based on a_ from Actor's target_net
@@ -140,7 +139,7 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_
             self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)

         with tf.variable_scope('a_grad'):
-            self.a_grads = tf.gradients(self.q, A)[0]   # tensor of gradients of each sample (None, a_dim)
+            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

     def _build_net(self, s, a, scope, trainable):
         with tf.variable_scope(scope):
@@ -162,7 +161,7 @@ def _build_net(self, s, a, scope, trainable):
         return q

     def learn(self, s, a, r, s_, ISW):
-        _, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, A: a, R: r, S_: s_, self.ISWeights: ISW})
+        _, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, self.a: a, R: r, S_: s_, self.ISWeights: ISW})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -309,7 +308,7 @@ def _get_priority(self, error):

 # Create actor and critic.
 actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
-critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
 actor.add_grad_to_graph(critic.a_grads)

 M = Memory(MEMORY_CAPACITY)
@@ -357,7 +356,7 @@ def _get_priority(self, error):
             b_s_ = b_M[:, -STATE_DIM:]

             abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
-            actor.learn(b_s, b_a)
+            actor.learn(b_s)
             for i in range(len(tree_idx)):  # update priority
                 idx = tree_idx[i]
                 M.update(idx, abs_td[i])
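The BipedalWalker variant differs in that its critic also returns a per-sample |TD error| used to refresh priorities in the replay memory (the loop over tree_idx above). A simplified, self-contained sketch of that part of the critic objective, assuming the prioritized-replay setup this diff only hints at; in the script itself q and q_ come from the eval and target networks, not placeholders:

# Sketch of an importance-weighted TD loss plus the per-sample |TD error|
# that a prioritized replay buffer can use to update transition priorities.
import tensorflow as tf

GAMMA = 0.9
q = tf.placeholder(tf.float32, [None, 1], name='q_eval')        # stand-in for the eval net output
q_ = tf.placeholder(tf.float32, [None, 1], name='q_target_net') # stand-in for the target net output
R = tf.placeholder(tf.float32, [None, 1], name='r')
ISWeights = tf.placeholder(tf.float32, [None, 1], name='is_weights')

q_target = R + GAMMA * q_
abs_td = tf.abs(q_target - q)                                   # returned by critic.learn(...)
loss = tf.reduce_mean(ISWeights * tf.square(q_target - q))      # importance-sampling weighted MSE

After each update the training loop calls M.update(tree_idx[i], abs_td[i]), so transitions with large TD error are sampled more often on later batches.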
