- Paper link.
- The policy network's loss function was not written as in the original paper. (X)
- Update: the policy network's loss function has been updated to follow the original paper.
- Update: added an adaptive temperature parameter alpha to control the policy entropy (see the sketch after this list).
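A minimal sketch of the two updates above (the paper's policy loss and the adaptive temperature loss), assuming twin critic networks q1_network / q2_network that are called on [state, action]; the function names here are illustrative, not the repository's exact implementation.

import tensorflow as tf
from tensorflow_probability.python.distributions import Normal

def sample_action(policy_network, state):
    # Reparameterized sample from a tanh-squashed Gaussian policy.
    mean, log_std = policy_network(state)
    dist = Normal(loc=mean, scale=tf.exp(log_std))
    z = dist.sample()                     # gradient flows through the sample (Normal is reparameterized)
    action = tf.tanh(z)                   # squash the action into (-1, 1)
    # Change-of-variables correction for the tanh squashing.
    log_prob = dist.log_prob(z) - tf.math.log(1.0 - tf.square(action) + 1e-6)
    return action, tf.reduce_sum(log_prob, axis=-1, keepdims=True)

def policy_and_alpha_losses(policy_network, q1_network, q2_network,
                            log_alpha, target_entropy, state):
    action, log_prob = sample_action(policy_network, state)
    alpha = tf.stop_gradient(tf.exp(log_alpha))          # temperature is a constant for the policy loss
    q_min = tf.minimum(q1_network([state, action]),      # pessimistic twin-critic estimate
                       q2_network([state, action]))
    # Policy objective from the SAC paper: E[ alpha * log pi(a|s) - Q(s, a) ].
    policy_loss = tf.reduce_mean(alpha * log_prob - q_min)
    # Temperature objective: keep the policy entropy near target_entropy.
    alpha_loss = -tf.reduce_mean(log_alpha * tf.stop_gradient(log_prob + target_entropy))
    return policy_loss, alpha_loss

In a training step, each loss would be computed under its own tf.GradientTape and minimized with the matching optimizer from __init__ (policy_OPT for the policy weights, alpha_OPT for log_alpha), with target_entropy playing the role of mini_entropy.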
requirements.txt:
tensorflow-gpu==2.6.0
gym[all]==0.21.0
tensorflow_probability==0.14.0
keras==2.6.0
matplotlib==3.5.1
from tensorflow.keras import layers, models, Input, optimizers, losses
from tensorflow_probability.python.distributions import Normal
from collections import deque
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import copy
import gym
class SoftActorCritic:
    def __init__(self, state_shape, action_dim):
        # Polyak (exponential moving average) tracker for target-network updates.
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)
        # Experience replay buffer and discount factor.
        self.replay_buffer = deque(maxlen=10000)
        self.gamma = 0.997
        # Learnable log of the entropy temperature alpha (float32 to match the networks).
        self.log_alpha = tf.Variable(np.random.normal(), dtype=tf.float32, trainable=True, name="EntropyTemperature")
        # Target (minimum) policy entropy used by the temperature loss.
        self.mini_entropy = 0.1
        # One optimizer per trainable component: policy, twin critics, value network and temperature.
        self.policy_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q1_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q2_OPT = optimizers.RMSprop(learning_rate=1e-3)
        self.value_OPT = optimizers.Adam(learning_rate=1e-3)
        self.alpha_OPT = optimizers.SGD(learning_rate=1e-3)
        # Gaussian policy network: maps a state to the action mean and a clipped log standard deviation.
        policy_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(policy_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        policy_mean = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std_clipped = tf.clip_by_value(log_policy_std, -10, 2)
        self.policy_network = models.Model(inputs=policy_input, outputs=[policy_mean, log_policy_std_clipped])
        # State-value network V(s).
        value_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(value_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        value_output = layers.Dense(units=1, activation='linear')(x)
        self.value_network = models.Model(inputs=value_input, outputs=value_output)
self.<