[Deep Reinforcement Learning in Practice] Training MuZero to Play Tic-Tac-Toe with TensorFlow 2.x

This article shows how to build the MuZero algorithm with TensorFlow 2.x and apply it to a tic-tac-toe environment, covering the ResNet-based model, the use of Monte Carlo tree search (MCTS), and the training procedure. Example code walks through the complete pipeline, from environment initialization to self-play.





  • Because the earlier Gomoku (five-in-a-row) program was hard to train, a tic-tac-toe program is provided here to make learning easier.
  • The code structure is the same as that of the earlier Gomoku code; the three networks defined in resnet_model.py are outlined below.
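The three classes in resnet_model.py correspond to MuZero's three learned functions. As a rough sketch of the intent (notation only, not part of the original code): the representation network h maps an observation to an initial hidden state, s_0 = h(o); the dynamics network g maps a hidden state and an action plane to the next hidden state, s_{k+1} = g(s_k, a_k); and the prediction network f maps a hidden state to a move policy and a value, (p_k, v_k) = f(s_k). Note that in this board-game version the dynamics network has no reward head, presumably because the outcome is only known at the end of a game and is carried by the value head.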

resnet_model.py

import tensorflow as tf
import numpy as np

num_blocks = 6
assert num_blocks >= 1, "the number of residual blocks must be at least 1"
l2 = 1e-4

def hidden_state_norm(x):
    # min-max normalize each sample's hidden state over its spatial dimensions,
    # so every plane lies in (0, 1); the 1e-6 offsets avoid division by zero
    x_min = tf.reduce_min(x, axis=(1, 2), keepdims=True) - 1e-6
    x_max = tf.reduce_max(x, axis=(1, 2), keepdims=True) + 1e-6
    hs_norm = (x - x_min) / (x_max - x_min)
    return hs_norm

class ResidualBlock(tf.keras.Model):
    expansion = 1

    def __init__(self, in_channels, out_channels, strides=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = tf.keras.layers.Conv2D(out_channels, kernel_size=3, strides=strides,
                                            padding="same", use_bias=False,
                                            kernel_regularizer=tf.keras.regularizers.l2(l2))
        self.bn1 = tf.keras.layers.BatchNormalization()

        self.conv2 = tf.keras.layers.Conv2D(out_channels, kernel_size=3, strides=1,
                                            padding="same", use_bias=False,
                                            kernel_regularizer=tf.keras.regularizers.l2(l2))
        self.bn2 = tf.keras.layers.BatchNormalization()

        """
        Adds a shortcut between input and residual block and merges them with "sum"
        """
        if strides != 1 or in_channels != self.expansion * out_channels:
            self.shortcut = tf.keras.Sequential([
                    tf.keras.layers.Conv2D(self.expansion*out_channels, kernel_size=1,
                                           strides=strides, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(l2)),
                    tf.keras.layers.BatchNormalization()]
                    )
        else:
            self.shortcut = lambda x,_: x

    def call(self, x, training=False):
        # if training: print("=> training network ... ")
        out = tf.nn.relu(self.bn1(self.conv1(x), training=training))
        out = self.bn2(self.conv2(out), training=training)
        out += self.shortcut(x, training)
        return tf.nn.relu(out)

class representation:
    def __init__(self, observation_shape, hidden_state_channel):
        observation = tf.keras.Input(shape=observation_shape)

        x = ResidualBlock(
                in_channels=observation_shape[-1],
                out_channels=hidden_state_channel,
            )(observation)

        for _ in range(num_blocks - 1):
            x = ResidualBlock(
                    in_channels=hidden_state_channel,
                    out_channels=hidden_state_channel,
                )(x)

        hidden_state = hidden_state_norm(x)

        self.model = tf.keras.Model(inputs=observation, outputs=hidden_state)
        self.trainable_variables = self.model.trainable_variables

    def predict(self, observation):
        # add a batch dimension, run the representation network, and return
        # the hidden state for this single observation
        observation = np.array([observation])
        hidden_state = np.array(self.model(observation)[0])
        return hidden_state

class dynamics:
    def __init__(self, hidden_state_shape, hidden_state_channel, num_chess):
        self.num_chess = num_chess
        hidden_state = tf.keras.Input(shape=hidden_state_shape)
        action = tf.keras.Input(shape=(num_chess, num_chess, 1))
        
        x = tf.keras.layers.concatenate([hidden_state, action])
        # print(x.shape)
        x = ResidualBlock(
            in_channels=hidden_state_channel + 1,
            out_channels=hidden_state_channel
        )(x)

        for _ in range(num_blocks - 1):
            x = ResidualBlock(
                    in_channels=hidden_state_channel,
                    out_channels=hidden_state_channel
                )(x)
        next_hidden_state = hidden_state_norm(x)

        self.model = tf.keras.Model(inputs=[hidden_state, action], outputs=next_hidden_state)
        self.trainable_variables = self.model.trainable_variables

    def predict(self, hidden_state, action):
        hidden_state = np.array([hidden_state])
        # encode the scalar action index as a one-hot board plane so it can be
        # concatenated with the hidden state inside the network
        action = np.array([1 if i == action else 0 for i in range(self.num_chess ** 2)])
        action = np.reshape(action, newshape=(1, self.num_chess, self.num_chess, 1))
        next_hidden_state = self.model([hidden_state, action])
        next_hidden_state = np.array(next_hidden_state[0])
        return next_hidden_state

class prediction:
    def __init__(self, hidden_state_shape, hidden_state_channel, num_chess):
        hidden_state = tf.keras.Input(shape=hidden_state_shape)
        x = hidden_state
        for _ in range(num_blocks):
            x = ResidualBlock(
                    in_channels=hidden_state_channel,
                    out_channels=hidden_state_channel,
                )(x)

        policy = tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=1,
                          padding="SAME", use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
        policy = tf.keras.layers.BatchNormalization()(policy)
        policy = tf.keras.layers.Activation('relu')(policy)
        policy = tf.keras.layers.Flatten()(policy)
        policy = tf.keras.layers.Dense(units=1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2))(policy)
        policy = tf.keras.layers.Dense(units=num_chess ** 2, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(l2))(policy)

        value = tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=1,
                          padding="SAME", use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
        value = tf.keras.layers.BatchNormalization()(value)
        value = tf.keras.layers.Activation('relu')(value)
        value = tf.keras.layers.Flatten()(value)
        value = tf.keras.layers.Dense(units=1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2))(value)
        value = tf.keras.layers.Dense(units=1, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2))(value)
        self.model = tf.keras.Model(inputs=hidden_state, outputs=[policy, value])
        self.trainable_variables = self.model.trainable_variables

    def predict(self, hidden_state):
        # add a batch dimension, evaluate the prediction network, and strip the
        # batch dimension again, following the pattern of the other predict methods
        hidden_state = np.array([hidden_state])
        policy, value = self.model(hidden_state)
        return np.array(policy[0]), np.array(value[0])
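To show how these classes fit together, here is a minimal usage sketch of one imagined search step, as MCTS would perform it. This is not from the original post: obs_channels=2 and hidden_channels=32 are illustrative assumptions; only num_chess=3 follows from the 3x3 tic-tac-toe board.

import numpy as np
from resnet_model import representation, dynamics, prediction

num_chess = 3                                          # 3x3 tic-tac-toe board
obs_channels = 2                                       # assumption: one plane per player
hidden_channels = 32                                   # assumption: hidden-state width
obs_shape = (num_chess, num_chess, obs_channels)
hs_shape = (num_chess, num_chess, hidden_channels)

h = representation(obs_shape, hidden_channels)         # h: observation -> hidden state
g = dynamics(hs_shape, hidden_channels, num_chess)     # g: (state, action) -> next state
f = prediction(hs_shape, hidden_channels, num_chess)   # f: state -> (policy, value)

obs = np.zeros(obs_shape, dtype=np.float32)            # an empty board
s0 = h.predict(obs)                                    # s_0 = h(o)
policy, value = f.predict(s0)                          # policy over 9 moves, value in [-1, 1]
a = int(np.argmax(policy))                             # pick one move (index 0..8)
s1 = g.predict(s0, a)                                  # s_1 = g(s_0, a), purely in latent space
policy1, value1 = f.predict(s1)                        # evaluate the imagined position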