
Commit 91218c0

Update PPO with curriculum

1 parent 07d6828 commit 91218c0

File tree

6 files changed: +443 -0 lines

PPO2/ResultLogger.py

+50
@@ -0,0 +1,50 @@
import tensorflow as tf


class ResultLogger:
    def __init__(self, writer):
        """
        :param writer: tensorboardX SummaryWriter
        """
        self.writer = writer
        self.score = []
        self.mean = []
        self.episode = 0

    def log_result(self, total_reward, winnum, drawnum, episode):
        """
        Log per-episode results.
        :param total_reward:
        :param winnum:
        :param drawnum:
        :param episode:
        :return:
        """
        self.episode = episode
        self.score.append(total_reward)
        self.writer.add_scalar('total_reward', total_reward, episode)
        mean_reward = sum(self.score[-100:]) / 100
        self.mean.append(mean_reward)
        self.writer.add_scalar('mean_reward', mean_reward, episode)

        if episode % 100:
            self.writer.add_scalar('win_rate', winnum / 100, episode)    # for the adversarial (game) environment
            self.writer.add_scalar('draw_rate', drawnum / 100, episode)  # for the adversarial (game) environment

    def log_parameter(self, info_dict=None):
        """
        Log hyper-parameters.
        :param info_dict:
        :return:
        """
        if info_dict and isinstance(info_dict, dict):
            for k, v in info_dict.items():
                self.writer.add_scalar(k, v, self.episode)

    def log_gaes(self, gae, episode):
        self.writer.add_scalar('GAE', gae, episode)
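A minimal usage sketch for ResultLogger, assuming tensorboardX is installed; the log directory and the logged values below are illustrative, not taken from this commit:

from tensorboardX import SummaryWriter
from ResultLogger import ResultLogger

writer = SummaryWriter('log/train/example')   # illustrative log directory
logger = ResultLogger(writer)

# log one episode's result and a couple of hyper-parameters
logger.log_result(total_reward=1.5, winnum=42, drawnum=7, episode=100)
logger.log_parameter({'gamma': 0.95, 'ppo_lr': 1e-4})
writer.close()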

PPO2/algo/ppo.py

+142
@@ -0,0 +1,142 @@
import tensorflow as tf
import copy


class PPOTrain:
    def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.01, logger=None, args=None):
        """
        :param Policy:
        :param Old_Policy:
        :param gamma:
        :param clip_value:
        :param c_1: coefficient of the value-function loss
        :param c_2: coefficient of the entropy bonus
        :param logger: result/hyper-parameter logger (ResultLogger)
        :param args: parsed command-line arguments (provides ppo_lr)
        """

        self.Policy = Policy
        self.Old_Policy = Old_Policy
        self.gamma = gamma
        self.logger = logger
        self.args = args

        pi_trainable = self.Policy.get_trainable_variables()
        old_pi_trainable = self.Old_Policy.get_trainable_variables()

        # assign operations copying current policy parameters to the old policy
        with tf.variable_scope('assign_op'):
            self.assign_ops = []
            for v_old, v in zip(old_pi_trainable, pi_trainable):
                self.assign_ops.append(tf.assign(v_old, v))

        # inputs for train_op
        with tf.variable_scope('train_inp'):
            self.actions = tf.placeholder(dtype=tf.int32, shape=[None], name='actions')
            self.rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards')
            self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next')
            self.gaes = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes')

        act_probs = self.Policy.act_probs
        act_probs_old = self.Old_Policy.act_probs

        # probabilities of the taken actions under the current policy
        act_probs = act_probs * tf.one_hot(indices=self.actions, depth=act_probs.shape[1])
        act_probs = tf.reduce_sum(act_probs, axis=1)

        # probabilities of the taken actions under the old policy
        act_probs_old = act_probs_old * tf.one_hot(indices=self.actions, depth=act_probs_old.shape[1])
        act_probs_old = tf.reduce_sum(act_probs_old, axis=1)

        with tf.variable_scope('PPO_loss'):
            """
            Clipped surrogate (policy) objective
            """
            # ratios = tf.divide(act_probs, act_probs_old)
            # r_t(θ) = π/π_old. To avoid division by zero, clip the probabilities
            # and compute exp(log π - log π_old) instead of a direct division.
            ratios = tf.exp(tf.log(tf.clip_by_value(act_probs, 1e-10, 1.0)) - tf.log(tf.clip_by_value(act_probs_old, 1e-10, 1.0)))
            # L_CLIP: clip the probability ratio to [1 - ε, 1 + ε]
            clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - clip_value, clip_value_max=1 + clip_value)
            self.loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios))
            self.loss_clip = tf.reduce_mean(self.loss_clip)

            """
            Entropy of the policy
            """
            # entropy of the new policy πθ: S = -Σ p log(p); clip to avoid log(0)
            self.entropy = -tf.reduce_sum(self.Policy.act_probs * tf.log(tf.clip_by_value(self.Policy.act_probs, 1e-10, 1.0)), axis=1)
            self.entropy = tf.reduce_mean(self.entropy, axis=0)  # mean entropy of pi(obs)

            """
            Value-function objective
            """
            # L_vf = [(r + γ V(s_{t+1})) - V(s_t)]^2
            v_preds = self.Policy.v_preds
            self.loss_vf = tf.squared_difference(self.rewards + self.gamma * self.v_preds_next, v_preds)
            self.loss_vf = tf.reduce_mean(self.loss_vf)

            # construct the computation graph for the loss
            # L(θ) = E_hat[L_CLIP(θ) - c1 L_VF(θ) + c2 S[πθ](s)]
            # L = clipped policy objective - weighted value loss + weighted entropy bonus
            self.loss = self.loss_clip - c_1 * self.loss_vf + c_2 * self.entropy
            # minimizing -loss == maximizing loss
            self.loss = -self.loss

        optimizer = tf.train.RMSPropOptimizer(learning_rate=args.ppo_lr, epsilon=1e-5)
        self.gradients = optimizer.compute_gradients(self.loss, var_list=pi_trainable)
        self.train_op = optimizer.minimize(self.loss, var_list=pi_trainable)

    def train(self, obs, actions, gaes, rewards, v_preds_next):
        tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs,
                                                               self.Old_Policy.obs: obs,
                                                               self.actions: actions,
                                                               self.rewards: rewards,
                                                               self.v_preds_next: v_preds_next,
                                                               self.gaes: gaes})

    def log_parameter(self, obs, actions, gaes, rewards, v_preds_next):
        lc, ent, lvf, loss = tf.get_default_session().run([self.loss_clip, self.entropy, self.loss_vf, self.loss],
                                                          feed_dict={self.Policy.obs: obs,
                                                                     self.Old_Policy.obs: obs,
                                                                     self.actions: actions,
                                                                     self.rewards: rewards,
                                                                     self.v_preds_next: v_preds_next,
                                                                     self.gaes: gaes})

        log_dict = {
            'ppo_loss_clip': lc,
            'ppo_entropy': ent,
            'ppo_value_difference': lvf,
            'ppo_total = (Lclip+Lvf+S)': loss
        }

        self.logger.log_parameter(log_dict)

    def assign_policy_parameters(self):
        # assign current policy parameter values to the old policy
        return tf.get_default_session().run(self.assign_ops)

    def get_gaes(self, rewards, v_preds, v_preds_next):
        """
        Generalized Advantage Estimation (GAE)
        :param rewards: r(t)
        :param v_preds: V(s_t)
        :param v_preds_next: V(s_{t+1})
        :return:
        """
        deltas = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]

        # generalized advantage estimator with lambda = 1, see PPO paper eq. (11)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):  # t runs from T-1 down to 0, where T is the rollout length
            gaes[t] = gaes[t] + self.gamma * gaes[t + 1]
        return gaes

    def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs,
                                                                       self.Old_Policy.obs: obs,
                                                                       self.actions: actions,
                                                                       self.rewards: rewards,
                                                                       self.v_preds_next: v_preds_next,
                                                                       self.gaes: gaes})
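For reference, with λ = 1 the backward recursion in get_gaes reduces to a discounted sum of TD residuals, A_t = Σ_{l≥0} γ^l δ_{t+l}. A small NumPy check, independent of the TensorFlow graph and using made-up reward and value arrays, showing that the recursion and the direct sum agree:

import numpy as np

gamma = 0.95
rewards      = np.array([1.0, 0.0, 0.5, 1.0])
v_preds      = np.array([0.9, 0.8, 0.7, 0.6])
v_preds_next = np.array([0.8, 0.7, 0.6, 0.0])   # terminal state has value 0

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = rewards + gamma * v_preds_next - v_preds

# backward recursion as in PPOTrain.get_gaes (lambda = 1)
gaes = deltas.copy()
for t in reversed(range(len(gaes) - 1)):
    gaes[t] += gamma * gaes[t + 1]

# direct discounted sum of residuals gives the same values
direct = np.array([sum(gamma ** l * deltas[t + l] for l in range(len(deltas) - t))
                   for t in range(len(deltas))])
assert np.allclose(gaes, direct)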

PPO2/main.py

+145
@@ -0,0 +1,145 @@
#!/usr/bin/python3
import argparse
import gym
import numpy as np
import utils
import tensorflow as tf
from environment import Environment
from network.policy_net import Policy_net
from algo.ppo import PPOTrain
from tensorboardX import SummaryWriter
from ResultLogger import ResultLogger


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='log directory', default='log/train/ppo_curriculum_without_misslerew')
    parser.add_argument('--savedir', help='save directory', default='trained_models/ppo_curriculum_without_misslerew')
    parser.add_argument('--gamma', default=0.95, type=float)
    parser.add_argument('--ppo_lr', help='ppo learning rate', default=1e-4, type=float)
    parser.add_argument('--episode', default=int(10e4), type=int)
    parser.add_argument('--continue_train', default=False, type=bool, help='whether to continue training from the previous model.')
    parser.add_argument('--continue_meta', type=str, default='./trained_models/ppo_curriculum/model.ckpt.meta',
                        help='meta file of the previously trained model.')
    parser.add_argument('--continue_modeldir', type=str, default='./trained_models/ppo_curriculum/',
                        help='directory of the previously trained model.')
    return parser.parse_args()


def main(args):
    writer = SummaryWriter(args.logdir)
    logger = ResultLogger(writer)

    env = Environment()  # custom environment
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, args=args, logger=logger)
    saver = tf.train.Saver()

    if args.continue_train:
        tf.reset_default_graph()
        tf.train.import_meta_graph(args.continue_meta)

    with tf.Session() as sess:
        if args.continue_train:
            saver.restore(sess, args.continue_modeldir)
        sess.run(tf.global_variables_initializer())
        reward = 0
        winnum = 0
        drawnum = 0
        for episode in range(args.episode):

            observations = []
            actions = []
            v_preds = []
            rewards = []

            run_policy_steps = 0

            total_reward = 0
            obs = env.reset()
            while True:  # roll out the current policy until the episode ends
                run_policy_steps += 1

                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                next_obs, reward, sparse_rew, done, info = env.step(act)
                if reward < -1000:
                    reward = -10

                reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0, run_policy_steps)
                # if episode == 1:
                #     print(reward)

                obs = next_obs
                if done:
                    total_reward = sum(rewards)
                    total_reward /= run_policy_steps
                    total_reward += reward
                    v_preds_next = v_preds[1:] + [0]  # the state after the terminal state has value 0

                    reward = -1
                    if info == 3:
                        winnum += 1
                    if info == 2:
                        drawnum += 1

                    break

            if episode % 100 == 0:
                winnum = 0
                drawnum = 0

            logger.log_result(total_reward, winnum, drawnum, episode)
            print(episode, total_reward)
            if episode % 1000 == 0:
                saver.save(sess, args.savedir + '/model.ckpt')

            ####
            ## GAE
            ####
            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # convert the lists into numpy arrays that fit the tf.placeholder shapes
            observations = np.reshape(observations, newshape=(-1, ob_space))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            logger.log_gaes(gaes.mean(), episode)
            PPO.log_parameter(observations, actions, gaes, rewards, v_preds_next)
            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            # train
            for epoch in range(2):
                # sample indices from [low, high)
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])


if __name__ == '__main__':
    args = argparser()
    main(args)
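utils.get_curriculum_reward is defined in utils.py, which is not among the files shown in this commit, so only its call signature is visible above. Purely as an illustration of that call site, and not the repository's implementation, a curriculum reward with this signature could blend the dense shaped reward and the sparse outcome reward:

def curriculum_reward_sketch(reward, sparse_rew, weight, step):
    """Hypothetical stand-in for utils.get_curriculum_reward (illustration only).

    Blends the dense shaped reward with the sparse outcome reward; `weight`
    (passed as 1.0 in the loop above) keeps the full dense signal early in
    the curriculum. `step` matches the call signature but is unused here.
    """
    return weight * reward + (1.0 - weight) * sparse_rew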

0 commit comments
