Network structure
import keras
from keras.layers import Input, Dense, Dot

STATE_SHAPE = (2, )  # MountainCar-v0 observation: (position, velocity)
ACTION_DIM = 3       # three discrete actions: push left, no-op, push right

def agent_net():
    x = Input(shape=STATE_SHAPE)
    action = Input(shape=[ACTION_DIM, ])
    h = Dense(100, activation="relu")(x)
    # Q-values of all actions for the given state
    out1 = Dense(ACTION_DIM, activation=None)(h)
    # dot with the one-hot action picks out Q(s, a)
    out2 = Dot(-1)([action, out1])
    model = keras.Model([x, action], out2)
    model.compile(optimizer="adam", loss="mse")
    model.summary()
    return model
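The Dot layer is what lets a scalar-output model be trained with plain MSE against the Q-learning target: multiplying the per-action Q-values by the one-hot action vector and summing selects Q(s, a) for the action actually taken. A minimal sketch of that selection in NumPy (all numbers here are made up for illustration):

import numpy as np

q_values = np.array([[0.3, -0.1, 0.8]])      # hypothetical Q-values for one state
action_onehot = np.array([[0.0, 0.0, 1.0]])  # the agent took action 2
q_sa = (q_values * action_onehot).sum(-1)    # -> array([0.8]), i.e. Q(s, a=2)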
Training
net = agent_net()
# save the weights after every epoch
callback_function = keras.callbacks.ModelCheckpoint("model/weight.{epoch:02d}.h5")
# func maps a batch of states to the Q-values of all actions
# (the output of the Dense layer just before the Dot layer)
func = keras.backend.function(net.input[0], net.layers[-2].output)
data = Data(func)
net.fit(data.next_data(), epochs=200, steps_per_epoch=200, callbacks=[callback_function])
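After training, the same func can be used on its own to run a greedy policy, since it returns the Q-values of every action for a batch of states. A rough usage sketch, assuming the same Gym API version as the rest of this code (reset returns the raw state):

import gym
import numpy as np

game = gym.make("MountainCar-v0")
state = game.reset()
q = func(state[np.newaxis, :])   # shape (1, ACTION_DIM): Q-values for this state
action = int(q.argmax(-1)[0])    # greedy action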
Data generation
The Data class generates the data needed for training. func is the function the agent uses to choose actions: it maps a state to the Q-values of all actions. The deque D inside the Data class stores tuples of (state, next state, action, reward, terminal flag). In this article the reward is set to the following values (see the sketch after this list):

During the game: 0.1
Failure (the maximum number of steps is reached): -1
Success (the car reaches the top within the maximum number of steps): 1
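A minimal sketch of this reward scheme, matching the logic used inside next_data below; relying on a non-empty info dict to detect a timeout is the author's convention for this Gym version, and the helper name shape_reward is only illustrative:

def shape_reward(terminal, info):
    # replace the environment reward with the scheme above
    if not terminal:
        return 0.1        # still playing
    if len(info) > 0:     # info carries a TimeLimit entry -> episode timed out
        return -1         # failure
    return 1              # reached the top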
import sys
import random
from collections import deque

import gym
import numpy as np


class Data:
    def __init__(self, func):
        self.func = func
        self.state_shape = STATE_SHAPE
        self.action_dim = ACTION_DIM
        self.max_length = 2000
        # replay buffer of (state, next_state, action, reward, terminal)
        self.D = deque(maxlen=self.max_length)

    def display(self):
        # run the generator without training, e.g. to watch the agent play
        for _ in self.next_data():
            pass

    def act(self, state, epsilon=0.1):
        # epsilon-greedy action selection
        if np.random.random() < epsilon:
            return np.random.randint(0, self.action_dim)
        return self.func(state[np.newaxis, :]).argmax(-1).astype("int")[0]

    def next_data(self):
        epsilon = 0.1
        game = gym.make("MountainCar-v0")
        state = game.reset()
        reward = 0.1
        while True:
            # |reward| > 0.1 means the previous step ended an episode
            state = game.reset() if abs(reward) > 0.1 else state
            game.render()
            action = self.act(state, epsilon)
            next_state, reward, terminal, info = game.step(action)
            # replace the environment reward with the scheme above
            reward = 0.1
            if terminal:
                # a non-empty info dict means the time limit was hit (failure)
                reward = -1 if len(info) > 0 else 1
            terminal = int(reward < 0)
            self.D.append((state, next_state, action, reward, terminal))
            state = next_state
            batch = self.batch_data()
            if batch is None:
                continue
            # slowly anneal the exploration rate
            epsilon -= 0.0002
            yield batch
    def batch_data(self, batch_size=32):
        gamma = 0.95
        # wait until the replay buffer is full before training
        if len(self.D) < self.max_length:
            sys.stdout.write("\r num of sample is : %d/%d" % (len(self.D), self.max_length))
            sys.stdout.flush()
            return None
        # sample a random minibatch from the buffer for training
        batch = random.sample(self.D, batch_size)
        batch = [np.stack(elem) for elem in zip(*batch)]
        state, next_state, action, reward, terminal = batch
        # Q-learning target: r + gamma * max_a' Q(s', a'), cut off at terminal states
        out = self.func(next_state).max(-1)
        batch_y = reward + gamma * out * (1 - terminal)
        # one-hot encode the actions for the Dot layer
        action = np.eye(self.action_dim)[action]
        return [state, action], batch_y
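Putting the pieces together, one call to batch_data yields exactly what the network from agent_net expects: [state, one-hot action] as inputs and the bootstrapped target as the label. A toy check of the target computation with made-up numbers, no environment or model involved:

import numpy as np

gamma = 0.95
reward = np.array([0.1, -1.0])      # one ordinary step, one failure
next_q_max = np.array([0.6, 0.3])   # hypothetical max_a' Q(s', a')
terminal = np.array([0, 1])         # bootstrap only for non-terminal samples

batch_y = reward + gamma * next_q_max * (1 - terminal)
print(batch_y)                       # -> [0.67, -1.0]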