This post implements the policy gradient method in TensorFlow 2.0 with a custom loss function, where the custom loss is -log(prob) * Vt.
At the moment the best training score reaches 193, but training is still unstable and the code is still being revised; discussion is welcome.
Parts of the code are adapted from Morvan Zhou (莫烦)'s tutorials.
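To make the loss concrete: at each step t the policy network outputs action probabilities, and that step contributes -log(probability of the action actually taken) * Vt to the loss, where Vt is the discounted return from step t onward. A small numeric sketch of this per-step term (the probabilities and return below are made-up illustration values):

import numpy as np

probs = np.array([0.7, 0.3])   # example policy output over the two CartPole actions
action = 1                     # the action that was actually sampled
Vt = 2.5                       # example discounted return from this step onward

step_loss = -np.log(probs[action]) * Vt
print(step_loss)               # -log(0.3) * 2.5 ≈ 3.01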
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, optimizers

action_dim = 2  # number of discrete actions (push cart left / push cart right)
state_dim = 4   # observation dimension (cart position, cart velocity, pole angle, pole angular velocity)
env = gym.make('CartPole-v0')
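Rather than hard-coding these two numbers, they can also be read off the environment itself, which is a quick way to confirm they match:

print(env.observation_space.shape[0])  # 4 -> state_dim
print(env.action_space.n)              # 2 -> action_dim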
class PGModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = layers.Dense(128, input_dim=state_dim, activation='relu')
        self.dropout = layers.Dropout(0.1)        # light regularization on the hidden layer
        self.all_acts = layers.Dense(units=action_dim)
        self.x = 0                                # cache of the raw logits from the last forward pass

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dropout(x)
        x = self.all_acts(x)
        self.x = x                                # keep the pre-softmax logits around
        output = tf.nn.softmax(x)                 # action probabilities
        return output
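A quick way to sanity-check the model is to run it on a dummy state and confirm the softmax output is a valid probability distribution over the two actions:

model = PGModel()
dummy_state = np.zeros((1, state_dim), dtype=np.float32)   # placeholder observation, only for this check
probs = model(dummy_state).numpy()[0]
print(probs, probs.sum())                                   # two probabilities summing to (approximately) 1.0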
class PG():
    def __init__(self):
        self.model = PGModel()

    def choose_action(self, s):
        # Predict action probabilities for a single state and sample from them,
        # e.g. prob = [0.8, 0.2] picks action 0 about 80% of the time
        prob = self.model.predict(np.array([s]))[0]
        return np.random.choice(len(prob), p=prob)
    def discount_reward(self, rewards, gamma=0.95):
        # Compute the discounted return for every step by working backwards from the
        # last reward: G[i] = r[i] + gamma * G[i+1]
        # e.g. rewards [1, 1, 1] with gamma=0.95 give returns [2.8525, 1.95, 1.0] before scaling
        out = np.zeros_like(rewards)
        dis_reward = 0
        for i in reversed(range(len(rewards))):
            dis_reward = rewards[i] + gamma * dis_reward
            out[i] = dis_reward
        # Scale by the standard deviation to keep the sample weights in a reasonable range
        return out / np.std(out - np.mean(out))
    def all_actf(self):
        # Return the raw logits cached by the model's last forward pass (kept for debugging)
        return self.model.x

    def reca_batch(self, a_batch):
        # Pass-through helper for the one-hot action batch
        return a_batch

    def def_loss(self, label, logit):
        # Custom loss: Keras calls this with (y_true, y_pred) and multiplies the result
        # by sample_weight (the discounted return Vt), giving -log(prob) * Vt per step.
        # Note that y_pred is the softmax output of call(), i.e. probabilities rather
        # than raw logits, even though softmax_cross_entropy_with_logits expects logits.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logit)
        return neg_log_prob
    def train(self, records):
        # One policy-gradient update per episode; records is a list of (state, action, reward) tuples
        s_batch = np.array([record[0] for record in records])                # all states of the episode
        a_batch = np.array([[1 if record[1] == i else 0 for i in range(action_dim)]
                            for record in records])                          # one-hot encoding of the taken actions
        self.reca_batch(a_batch)
        prob_batch = self.model.predict(s_batch) * a_batch                   # predicted probabilities masked by the taken actions
        r_batch = self.discount_reward([record[2] for record in records])    # discounted returns, used as sample weights
        self.model.compile(loss=self.def_loss, optimizer=optimizers.Adam(0.001))
        self.model.fit(s_batch, prob_batch, sample_weight=r_batch, verbose=1)
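The compile/fit route above leans on Keras to apply sample_weight, and because compile() is called inside train() it also recreates the Adam optimizer every episode. If you want the loss to be literally -log(prob of taken action) * Vt and the optimizer state to persist across episodes, a manual update with tf.GradientTape is a common alternative. This is only a sketch of that variant; the function name and arguments are illustrative, not part of the code above:

def train_step(model, optimizer, s_batch, a_batch, r_batch):
    # One manual policy-gradient update: minimize the mean of -log(prob of taken action) * Vt
    a_batch = tf.cast(a_batch, tf.float32)
    r_batch = tf.cast(r_batch, tf.float32)
    with tf.GradientTape() as tape:
        probs = model(s_batch, training=True)                     # softmax output, shape (batch, action_dim)
        prob_of_action = tf.reduce_sum(a_batch * probs, axis=1)   # probability of the action actually taken
        loss = tf.reduce_mean(-tf.math.log(prob_of_action + 1e-8) * r_batch)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss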
episodes = 2000
score_list = []
pg = PG()
for i in range(episodes):
    score = 0
    records = []                           # (state, action, reward) tuples collected during this episode
    s = env.reset()
    while True:
        a = pg.choose_action(s)
        next_s, r, done, _ = env.step(a)
        records.append((s, a, r))
        s = next_s
        score += r
        if done:
            pg.train(records)              # update the policy once the episode is finished
            score_list.append(score)
            print("episode:", i, "score:", score, "maxscore:", max(score_list))
            break
    if np.mean(score_list[-10:]) > 195:    # stop once the last 10 episodes average above 195
        # Subclassed Keras models cannot be saved whole to HDF5, so only the weights are saved
        pg.model.save_weights('CartPoleModel.h5')
        break
env.close()
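Once training stops, the saved weights can be loaded into a fresh PGModel and the policy evaluated greedily (always taking the most probable action). A minimal evaluation sketch; eval_env and eval_model are illustrative names, and the model has to be run once so its weights exist before load_weights:

eval_env = gym.make('CartPole-v0')
eval_model = PGModel()
eval_model(np.zeros((1, state_dim), dtype=np.float32))      # build the weights before loading
eval_model.load_weights('CartPoleModel.h5')

s = eval_env.reset()
score = 0
while True:
    probs = eval_model(np.array([s], dtype=np.float32)).numpy()[0]
    s, r, done, _ = eval_env.step(int(np.argmax(probs)))     # greedy action instead of sampling
    score += r
    if done:
        break
print("evaluation score:", score)
eval_env.close()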