
Commit 4abdde1

improve performance
1 parent 601990e commit 4abdde1


2 files changed: 27 additions & 59 deletions


contents/12_Proximal_Policy_Optimization/DPPO.py

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ def work(self):
                 s = s_
                 ep_r += r
 
-                GLOBAL_UPDATE_COUNTER += 1      # count to minimum batch size, no need to wait other workers
+                GLOBAL_UPDATE_COUNTER += 1      # count to minimum batch size, no need to wait other workers
                 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                     v_s_ = self.ppo.get_v(s_)
                     discounted_r = []           # compute discounted reward

contents/12_Proximal_Policy_Optimization/discrete_DPP0.py renamed to contents/12_Proximal_Policy_Optimization/discrete_DPPO.py

Lines changed: 26 additions & 58 deletions
@@ -10,15 +10,14 @@
 View more on my tutorial website: https://morvanzhou.github.io/tutorials
 
 Dependencies:
-tensorflow r1.3
+tensorflow 1.8.0
 gym 0.9.2
 """
 
 import tensorflow as tf
 import numpy as np
 import matplotlib.pyplot as plt
 import gym, threading, queue
-import time
 
 EP_MAX = 1000
 EP_LEN = 500
@@ -27,7 +26,7 @@
 A_LR = 0.0001                # learning rate for actor
 C_LR = 0.0001                # learning rate for critic
 MIN_BATCH_SIZE = 64          # minimum batch size for updating PPO
-UPDATE_STEP = 10             # loop update operation n-steps
+UPDATE_STEP = 15             # loop update operation n-steps
 EPSILON = 0.2                # for clipping surrogate objective
 GAME = 'CartPole-v0'

@@ -51,22 +50,18 @@ def __init__(self):
         self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
 
         # actor
-        self.pi, self.pi_params = self._build_anet('pi', trainable=True)
+        self.pi, pi_params = self._build_anet('pi', trainable=True)
         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
 
-        self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(self.pi_params, oldpi_params)]
+        self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
 
-        self.tfa = tf.placeholder(tf.int32, [None,], 'action')
-
+        self.tfa = tf.placeholder(tf.int32, [None, ], 'action')
         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
 
-        #debug
-        self.val1 = tf.reduce_sum(self.pi * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
-        self.val2 = tf.reduce_sum(oldpi * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
-        #debug
-
-        ratio = self.val1/self.val2
-
+        a_indices = tf.stack([tf.range(tf.shape(self.tfa)[0], dtype=tf.int32), self.tfa], axis=1)
+        pi_prob = tf.gather_nd(params=self.pi, indices=a_indices)       # shape=(None, )
+        oldpi_prob = tf.gather_nd(params=oldpi, indices=a_indices)      # shape=(None, )
+        ratio = pi_prob/oldpi_prob
         surr = ratio * self.tfadv                                       # surrogate loss
 
         self.aloss = -tf.reduce_mean(tf.minimum(                        # clipped surrogate objective
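Aside, a minimal NumPy sketch (not part of the commit; the probabilities and actions below are made up) of what the new gather_nd lines compute, i.e. picking the probability of the taken action out of each softmax row, which the old code did with a one-hot mask and reduce_sum:

import numpy as np

probs = np.array([[0.2, 0.8],
                  [0.6, 0.4],
                  [0.5, 0.5]])                          # softmax output, shape (batch, A_DIM)
actions = np.array([1, 0, 1])                           # actions taken, shape (batch,)

picked = probs[np.arange(len(actions)), actions]        # new style: gather one value per (row, action) pair
one_hot = np.eye(probs.shape[1])[actions]
masked = (probs * one_hot).sum(axis=1)                  # old style: one-hot mask + sum over actions
assert np.allclose(picked, masked)                      # both give [0.8, 0.6, 0.5]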
@@ -82,20 +77,10 @@ def update(self):
             if GLOBAL_EP < EP_MAX:
                 UPDATE_EVENT.wait()                     # wait until get batch of data
                 self.sess.run(self.update_oldpi_op)     # copy pi to old pi
-                s, a, r = [],[],[]
-                for iter in range(QUEUE.qsize()):
-                    data = QUEUE.get()
-                    if iter == 0:
-                        s = data['bs']
-                        a = data['ba']
-                        r = data['br']
-                    else:
-                        s = np.append(s, data['bs'], axis=0)
-                        a = np.append(a, data['ba'], axis=0)
-                        r = np.append(r, data['br'], axis=0)
-
+                data = [QUEUE.get() for _ in range(QUEUE.qsize())]      # collect data from all workers
+                data = np.vstack(data)
+                s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + 1].ravel(), data[:, -1:]
                 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
-
                 # update actor and critic in a update loop
                 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
                 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
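Aside, a minimal sketch (not part of the commit; S_DIM = 4 is assumed here for CartPole) of how the vstack-plus-slicing above recovers the three batches from rows laid out as [state | action | discounted return]:

import numpy as np

S_DIM = 4                                                                            # assumed observation size
batch1 = np.hstack([np.zeros((3, S_DIM)), np.zeros((3, 1)), np.ones((3, 1))])        # fake rows from worker 1
batch2 = np.hstack([np.ones((2, S_DIM)), np.ones((2, 1)), 2 * np.ones((2, 1))])      # fake rows from worker 2
data = np.vstack([batch1, batch2])                       # shape (5, S_DIM + 2), stacked queue contents
s = data[:, :S_DIM]                                      # states,  shape (5, S_DIM)
a = data[:, S_DIM: S_DIM + 1].ravel()                    # actions, shape (5,)
r = data[:, -1:]                                         # returns, shape (5, 1)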
@@ -104,16 +89,14 @@ def update(self):
                 ROLLING_EVENT.set()                     # set roll-out available
 
     def _build_anet(self, name, trainable):
-        w_init = tf.random_normal_initializer(0., .1)
-
         with tf.variable_scope(name):
             l_a = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
-            a_prob = tf.layers.dense(l_a, A_DIM, tf.nn.softmax, trainable=trainable, name='ap')
+            a_prob = tf.layers.dense(l_a, A_DIM, tf.nn.softmax, trainable=trainable)
         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
         return a_prob, params
 
     def choose_action(self, s):                         # run by a local
-        prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[np.newaxis, :]})
+        prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[None, :]})
         action = np.random.choice(range(prob_weights.shape[1]),
                                   p=prob_weights.ravel())   # select action w.r.t the actions prob
         return action
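Aside, a tiny sketch (not part of the commit; the probability vector is made up) of the sampling step in choose_action, which draws an action index according to the policy's softmax output:

import numpy as np

prob_weights = np.array([[0.3, 0.7]])                    # shape (1, A_DIM), stand-in for sess.run(self.pi, ...)
action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())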
@@ -132,28 +115,26 @@ def __init__(self, wid):
     def work(self):
         global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
         while not COORD.should_stop():
-            s = self.env.reset()    #new episode
+            s = self.env.reset()
             ep_r = 0
             buffer_s, buffer_a, buffer_r = [], [], []
             for t in range(EP_LEN):
                 if not ROLLING_EVENT.is_set():                  # while global PPO is updating
                     ROLLING_EVENT.wait()                        # wait until PPO is updated
                     buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer, use new policy to collect data
-
                 a = self.ppo.choose_action(s)
                 s_, r, done, _ = self.env.step(a)
-                if done: r = -5
+                if done: r = -10
                 buffer_s.append(s)
                 buffer_a.append(a)
-                buffer_r.append((r + 8) / 8)    # normalize reward, find to be useful
+                buffer_r.append(r-1)            # 0 for not down, -11 for down. Reward engineering
                 s = s_
                 ep_r += r
 
-                GLOBAL_UPDATE_COUNTER += 1      # count to minimum batch size, no need to wait other workers
+                GLOBAL_UPDATE_COUNTER += 1      # count to minimum batch size, no need to wait other workers
                 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
-
                     if done:
-                        v_s_ = 0                #episode ends
+                        v_s_ = 0                # end of episode
                     else:
                         v_s_ = self.ppo.get_v(s_)
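Aside, a toy sketch (not part of the commit) of the values the new reward shaping stores in buffer_r, given that CartPole-v0 returns r = 1 on every step:

for done in (False, True):
    r = 1.0                                              # env reward for the step
    if done: r = -10                                     # as in the new code
    print(done, r - 1)                                   # stored value: 0.0 while up, -11.0 on the fall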

@@ -162,33 +143,25 @@ def work(self):
                         v_s_ = r + GAMMA * v_s_
                         discounted_r.append(v_s_)
                     discounted_r.reverse()
-
-                    bs, ba, br = np.vstack(buffer_s), np.array(buffer_a), np.array(discounted_r)[:, np.newaxis]
-
-                    buffer_s, buffer_a, buffer_r = [], [], []
-
-                    q_in = dict([('bs', bs), ('ba', ba), ('br', br)])
-                    # q_in = dict([('bs', list(bs)), ('ba', list(ba)), ('br', list(br))])
-
-                    QUEUE.put(q_in)
 
+                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, None]
+                    buffer_s, buffer_a, buffer_r = [], [], []
+                    QUEUE.put(np.hstack((bs, ba, br)))          # put data in the queue
                     if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                         ROLLING_EVENT.clear()                   # stop collecting data
                         UPDATE_EVENT.set()                      # globalPPO update
-
+
                     if GLOBAL_EP >= EP_MAX:                     # stop training
                         COORD.request_stop()
                         break
 
-                    if done:break
+                    if done: break
 
             # record reward changes, plot later
             if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
             else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
             GLOBAL_EP += 1
-            print("EP", GLOBAL_EP,'|W%i' % self.wid, '|step %i' %t, '|Ep_r: %.2f' % ep_r,)
-            np.save("Global_return",GLOBAL_RUNNING_R)
-            np.savez("PI_PARA",self.ppo.sess.run(GLOBAL_PPO.pi_params))
+            print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
 
 
 if __name__ == '__main__':
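Aside, a minimal sketch (not part of the commit; GAMMA = 0.9 and the reward values are assumed) of the backward sweep kept by this hunk, which bootstraps from v_s_ and turns the buffered rewards into discounted returns:

GAMMA = 0.9                                              # assumed discount factor
buffer_r = [0.0, 0.0, -11.0]                             # toy shaped rewards for an episode that ends after 3 steps
v_s_ = 0                                                 # value of the terminal state

discounted_r = []
for r in buffer_r[::-1]:
    v_s_ = r + GAMMA * v_s_
    discounted_r.append(v_s_)
discounted_r.reverse()                                   # [-8.91, -9.9, -11.0]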
@@ -197,9 +170,7 @@ def work(self):
     UPDATE_EVENT.clear()            # not update now
     ROLLING_EVENT.set()             # start to roll out
     workers = [Worker(wid=i) for i in range(N_WORKER)]
-
-    start = time.time()
-
+
     GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
     GLOBAL_RUNNING_R = []
     COORD = tf.train.Coordinator()
@@ -214,9 +185,6 @@ def work(self):
     threads[-1].start()
     COORD.join(threads)
 
-    end = time.time()
-    print "Total time ", (end - start)
-
     # plot reward change and test
     plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
     plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
