[Deep Reinforcement Learning in Practice] Training MuZero to Play Tic-Tac-Toe with TensorFlow 2.x
References:
[1] ColinFred. Monte Carlo Tree Search (MCTS) Code Explained in Detail (Python). 2019-03-23.
[2] 饼干Japson, Deep Reinforcement Learning Lab. In-Depth Paper Study Report: The MuZero Algorithm Explained. 2021-01-19.
[3] Tangarf. MuZero Algorithm Study Report. 2020-08-31.
[4] 带带弟弟好吗. AlphaGo, Version Three: MuZero. 2020-08-30.
[5] Original Google paper: Schrittwieser et al., Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model.
[6] Reference GitHub code 1.
[7] Reference GitHub code 2.

- Because the Gomoku program from the previous post is hard to train, a tic-tac-toe program is provided here to make learning easier.
- The code structure is the same as that of the earlier Gomoku code.
resnet_model.py
import tensorflow as tf
import numpy as np

num_blocks = 6
assert num_blocks >= 1, "the number of residual blocks must be at least 1"
l2 = 1e-4  # L2 regularization coefficient


def hidden_state_norm(x):
    # Min-max normalize the hidden state per sample so it stays in [0, 1],
    # keeping it on the same scale as the action planes fed to the dynamics network.
    # The small epsilons avoid division by zero when min == max.
    s_min = tf.reduce_min(x, axis=(1, 2), keepdims=True) - 1e-6
    s_max = tf.reduce_max(x, axis=(1, 2), keepdims=True) + 1e-6
    hs_norm = (x - s_min) / (s_max - s_min)
    return hs_norm
class ResidualBlock(tf.keras.Model):
    expansion = 1

    def __init__(self, in_channels, out_channels, strides=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = tf.keras.layers.Conv2D(out_channels, kernel_size=3, strides=strides,
                                            padding="same", use_bias=False,
                                            kernel_regularizer=tf.keras.regularizers.l2(l2))
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv2D(out_channels, kernel_size=3, strides=1,
                                            padding="same", use_bias=False,
                                            kernel_regularizer=tf.keras.regularizers.l2(l2))
        self.bn2 = tf.keras.layers.BatchNormalization()
        # Add a shortcut between the input and the residual branch and merge them with "sum".
        # A 1x1 convolution is needed when the input and output shapes differ.
        if strides != 1 or in_channels != self.expansion * out_channels:
            self.shortcut = tf.keras.Sequential([
                tf.keras.layers.Conv2D(self.expansion * out_channels, kernel_size=1,
                                       strides=strides, use_bias=False,
                                       kernel_regularizer=tf.keras.regularizers.l2(l2)),
                tf.keras.layers.BatchNormalization()])
        else:
            self.shortcut = lambda x, training=False: x

    def call(self, x, training=False):
        out = tf.nn.relu(self.bn1(self.conv1(x), training=training))
        out = self.bn2(self.conv2(out), training=training)
        out += self.shortcut(x, training=training)
        return tf.nn.relu(out)
class representation:
    """Representation network h: maps the raw board observation to a hidden state."""
    def __init__(self, observation_shape, hidden_state_channel):
        observation = tf.keras.Input(shape=observation_shape)
        x = ResidualBlock(
            in_channels=observation_shape[-1],
            out_channels=hidden_state_channel,
        )(observation)
        for _ in range(num_blocks - 1):
            x = ResidualBlock(
                in_channels=hidden_state_channel,
                out_channels=hidden_state_channel,
            )(x)
        hidden_state = hidden_state_norm(x)
        self.model = tf.keras.Model(inputs=observation, outputs=hidden_state)
        self.trainable_variables = self.model.trainable_variables

    def predict(self, observation):
        # Run a single (unbatched) observation through the network.
        observation = np.array([observation])
        hidden_state = np.array(self.model(observation)[0])
        return hidden_state
class dynamics:
    """Dynamics network g: maps (hidden state, action) to the next hidden state."""
    def __init__(self, hidden_state_shape, hidden_state_channel, num_chess):
        self.num_chess = num_chess
        hidden_state = tf.keras.Input(shape=hidden_state_shape)
        # The action is encoded as a one-hot board plane and concatenated with the hidden state.
        action = tf.keras.Input(shape=(num_chess, num_chess, 1))
        x = tf.keras.layers.concatenate([hidden_state, action])
        x = ResidualBlock(
            in_channels=hidden_state_channel + 1,
            out_channels=hidden_state_channel
        )(x)
        for _ in range(num_blocks - 1):
            x = ResidualBlock(
                in_channels=hidden_state_channel,
                out_channels=hidden_state_channel
            )(x)
        next_hidden_state = hidden_state_norm(x)
        self.model = tf.keras.Model(inputs=[hidden_state, action], outputs=next_hidden_state)
        self.trainable_variables = self.model.trainable_variables

    def predict(self, hidden_state, action):
        hidden_state = np.array([hidden_state])
        # Turn the scalar action index into a one-hot plane of shape (num_chess, num_chess, 1).
        action = np.array([1 if i == action else 0 for i in range(self.num_chess ** 2)])
        action = np.reshape(action, newshape=(1, self.num_chess, self.num_chess, 1))
        next_hidden_state = self.model([hidden_state, action])
        next_hidden_state = np.array(next_hidden_state[0])
        return next_hidden_state
class prediction:
    """Prediction network f: maps a hidden state to a policy distribution and a value."""
    def __init__(self, hidden_state_shape, hidden_state_channel, num_chess):
        hidden_state = tf.keras.Input(shape=hidden_state_shape)
        x = hidden_state
        for _ in range(num_blocks):
            x = ResidualBlock(
                in_channels=hidden_state_channel,
                out_channels=hidden_state_channel,
            )(x)
        # Policy head: softmax over the num_chess * num_chess board positions.
        policy = tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=1,
                                        padding="same", use_bias=False,
                                        kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
        policy = tf.keras.layers.BatchNormalization()(policy)
        policy = tf.keras.layers.Activation('relu')(policy)
        policy = tf.keras.layers.Flatten()(policy)
        policy = tf.keras.layers.Dense(units=1024, activation='relu',
                                       kernel_regularizer=tf.keras.regularizers.l2(l2))(policy)
        policy = tf.keras.layers.Dense(units=num_chess ** 2, activation='softmax',
                                       kernel_regularizer=tf.keras.regularizers.l2(l2))(policy)
        # Value head: a single scalar in [-1, 1] via tanh.
        value = tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=1,
                                       padding="same", use_bias=False,
                                       kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
        value = tf.keras.layers.BatchNormalization()(value)
        value = tf.keras.layers.Activation('relu')(value)
        value = tf.keras.layers.Flatten()(value)
        value = tf.keras.layers.Dense(units=1024, activation='relu',
                                      kernel_regularizer=tf.keras.regularizers.l2(l2))(value)
        value = tf.keras.layers.Dense(units=1, activation='tanh',
                                      kernel_regularizer=tf.keras.regularizers.l2(l2))(value)
        self.model = tf.keras.Model(inputs=hidden_state, outputs=[policy, value])
        self.trainable_variables = self.model.trainable_variables

    def predict(self, hidden_state):
        # NOTE: the original listing cuts off here; the body below is an assumed
        # completion that mirrors representation.predict and dynamics.predict.
        hidden_state = np.array([hidden_state])
        policy, value = self.model(hidden_state)
        policy = np.array(policy[0])
        value = np.array(value[0])
        return policy, value
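As a quick check that the three networks wire together, here is a minimal usage sketch for a 3x3 board. The observation shape (3, 3, 2) and the 64-channel hidden state are illustrative assumptions, not values taken from the original post:

# Minimal wiring check (shapes here are assumptions for illustration only).
import numpy as np

num_chess = 3                      # 3x3 tic-tac-toe board
obs_shape = (3, 3, 2)              # assumed: one plane per player
hs_channel = 64                    # assumed hidden-state channel count

repr_net = representation(obs_shape, hs_channel)
dyn_net = dynamics((3, 3, hs_channel), hs_channel, num_chess)
pred_net = prediction((3, 3, hs_channel), hs_channel, num_chess)

obs = np.zeros(obs_shape, dtype=np.float32)   # empty board
s0 = repr_net.predict(obs)                    # initial hidden state
p0, v0 = pred_net.predict(s0)                 # prior policy and value at the root
s1 = dyn_net.predict(s0, action=4)            # imagined state after playing the center
print(p0.shape, v0.shape, s1.shape)           # (9,), (1,), (3, 3, 64)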

This post shows how to build the MuZero algorithm with TensorFlow 2.x and apply it to a tic-tac-toe environment, covering the ResNet model implementation, the use of Monte Carlo Tree Search (MCTS), and the model training process. The example code walks through the complete pipeline from environment initialization to self-play.
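Although the MCTS and training code is not reproduced here, the way MuZero composes the three networks during search can be sketched as follows. This is a simplified illustration of the algorithm described in reference [5], not the original post's implementation; node bookkeeping, Dirichlet noise, and the UCB selection formula are omitted.

# Sketch of how the three networks are used inside one MCTS simulation
# (simplified; follows the MuZero paper [5], not the post's exact code).

def initial_inference(repr_net, pred_net, observation):
    # h: observation -> hidden state, then f: hidden state -> (policy, value).
    s = repr_net.predict(observation)
    p, v = pred_net.predict(s)
    return s, p, v

def recurrent_inference(dyn_net, pred_net, hidden_state, action):
    # g: (hidden state, action) -> next hidden state, then f on that state.
    # Board games have no intermediate reward, so only the value is used here.
    s_next = dyn_net.predict(hidden_state, action)
    p, v = pred_net.predict(s_next)
    return s_next, p, v

During search, initial_inference expands the root from the real observation, and every deeper node is expanded with recurrent_inference, so the lookahead tree is built entirely inside the learned latent space without ever calling the real environment.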