@@ -31,10 +31,10 @@
 MAX_EP_STEPS = 200
 LR_A = 1e-4  # learning rate for actor
 LR_C = 1e-4  # learning rate for critic
-GAMMA = 0.999  # reward discount
+GAMMA = 0.9  # reward discount
 REPLACE_ITER_A = 1100
 REPLACE_ITER_C = 1000
-MEMORY_CAPACITY = 10000
+MEMORY_CAPACITY = 5000
 BATCH_SIZE = 16
 VAR_MIN = 0.1
 RENDER = True
@@ -50,8 +50,6 @@
 # all placeholder for tf
 with tf.name_scope('S'):
     S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
-with tf.name_scope('A'):
-    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
 with tf.name_scope('R'):
     R = tf.placeholder(tf.float32, [None, 1], name='r')
 with tf.name_scope('S_'):
@@ -96,8 +94,8 @@ def _build_net(self, s, scope, trainable):
                 scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
         return scaled_a

-    def learn(self, s, a):   # batch update
-        self.sess.run(self.train_op, feed_dict={S: s, A: a})
+    def learn(self, s):   # batch update
+        self.sess.run(self.train_op, feed_dict={S: s})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -111,12 +109,12 @@ def add_grad_to_graph(self, a_grads):
             self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

         with tf.variable_scope('A_train'):
-            opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE)  # (- learning rate) for ascent policy, div to take mean
+            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
             self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))


 class Critic(object):
-    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
         self.sess = sess
         self.s_dim = state_dim
         self.a_dim = action_dim
@@ -127,7 +125,8 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_

         with tf.variable_scope('Critic'):
             # Input (s, a), output q
-            self.q = self._build_net(S, A, 'eval_net', trainable=True)
+            self.a = a
+            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

             # Input (s_, a_), output q_ for q_target
             self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)   # target_q is based on a_ from Actor's target_net
@@ -145,7 +144,7 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_
             self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

         with tf.variable_scope('a_grad'):
-            self.a_grads = tf.gradients(self.q, A)[0]   # tensor of gradients of each sample (None, a_dim)
+            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

     def _build_net(self, s, a, scope, trainable):
         with tf.variable_scope(scope):
@@ -169,7 +168,7 @@ def _build_net(self, s, a, scope, trainable):
         return q

     def learn(self, s, a, r, s_):
-        self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
         if self.t_replace_counter % self.t_replace_iter == 0:
             self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
         self.t_replace_counter += 1
@@ -197,7 +196,7 @@ def sample(self, n):

 # Create actor and critic.
 actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
-critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
 actor.add_grad_to_graph(critic.a_grads)

 M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
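For context: the main change in this commit is that the critic's eval network is now built directly on the actor's output tensor (actor.a is passed into Critic) rather than on a separate A placeholder, so tf.gradients(q, a) yields dQ/da and actor.add_grad_to_graph can chain it into the actor's parameter gradients. A minimal, self-contained sketch of that wiring in TF1 follows; the single dense layers and dimensions here are stand-ins for illustration, not this repo's actual networks:

import tensorflow as tf

# Stand-in dimensions for illustration only; the script defines its own STATE_DIM / ACTION_DIM / LR_A.
STATE_DIM, ACTION_DIM, LR_A = 3, 1, 1e-4

S = tf.placeholder(tf.float32, [None, STATE_DIM], name='s')

with tf.variable_scope('Actor'):
    a = tf.layers.dense(S, ACTION_DIM, activation=tf.nn.tanh)        # actor eval output, analogous to actor.a
with tf.variable_scope('Critic'):
    q = tf.layers.dense(tf.concat([S, a], axis=1), 1)                # critic eval Q built on that same tensor

a_grads = tf.gradients(q, a)[0]                                      # dQ/da, shape (None, ACTION_DIM)
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')

# Deterministic policy gradient: chain dQ/da into da/dtheta, then "ascend" via a negative learning rate.
policy_grads = tf.gradients(ys=a, xs=actor_vars, grad_ys=a_grads)
train_actor = tf.train.RMSPropOptimizer(-LR_A).apply_gradients(zip(policy_grads, actor_vars))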
@@ -230,15 +229,15 @@ def train():
             M.store_transition(s, a, r, s_)

             if M.pointer > MEMORY_CAPACITY:
-                var = max([var * .99995, VAR_MIN])    # decay the action randomness
+                var = max([var * .9999, VAR_MIN])    # decay the action randomness
                 b_M = M.sample(BATCH_SIZE)
                 b_s = b_M[:, :STATE_DIM]
                 b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
                 b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
                 b_s_ = b_M[:, -STATE_DIM:]

                 critic.learn(b_s, b_a, b_r, b_s_)
-                actor.learn(b_s, b_a)
+                actor.learn(b_s)

             s = s_
             ep_reward += r
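A side note on the batch slicing in the hunk above: it assumes each stored transition row is laid out as [s, a, r, s_], matching dims = 2 * STATE_DIM + ACTION_DIM + 1 in the Memory constructor. A small NumPy check of that layout, with stand-in dimensions chosen only for illustration:

import numpy as np

STATE_DIM, ACTION_DIM = 3, 1                       # stand-in sizes for illustration only
s, a, r, s_ = np.ones(STATE_DIM), np.full(ACTION_DIM, 2.), 3., np.full(STATE_DIM, 4.)

row = np.hstack((s, a, [r], s_))                   # one transition, width 2*STATE_DIM + ACTION_DIM + 1
b_M = row[np.newaxis, :]                           # a "batch" of one row, like M.sample(BATCH_SIZE)

b_s  = b_M[:, :STATE_DIM]                          # columns 0..STATE_DIM-1          -> s
b_a  = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]   # next ACTION_DIM columns         -> a
b_r  = b_M[:, -STATE_DIM - 1: -STATE_DIM]          # single reward column, kept 2-D  -> r
b_s_ = b_M[:, -STATE_DIM:]                         # last STATE_DIM columns          -> s_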