import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque
# Hyperparameters
GAMMA = 0.95          # discount factor
LEARNING_RATE = 0.01  # optimizer step size

class Actor():  # the policy network PI(a|s)
    def __init__(self, env, sess):
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # build the policy (softmax) network
        self.create_softmax_network()
        # init session
        self.session = sess
        self.session.run(tf.global_variables_initializer())
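
    # weight_variable / bias_variable are not shown in this listing; the two
    # helpers below are a sketch assumed from how they are called in
    # create_softmax_network (truncated-normal weights, small constant biases
    # are the usual choice for this kind of network).
    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)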

    def create_softmax_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # actions taken, one-hot encoded (sized by action_dim rather than a hardcoded 2)
        self.tf_acts = tf.placeholder(tf.int32, [None, self.action_dim], name="actions_num")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD error from the critic
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # softmax layer (logits)
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output: action probabilities PI(a|s)
        self.all_act_prob = tf.nn.softmax(self.softmax_input)
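
    # A minimal sketch (not part of the original listing) of how this policy
    # head is typically used: evaluate the softmax output for one state and
    # sample an action from it. The method name choose_action is an
    # assumption; only state_input and all_act_prob come from the network
    # definition above.
    def choose_action(self, observation):
        prob_weights = self.session.run(
            self.all_act_prob,
            feed_dict={self.state_input: observation[np.newaxis, :]})
        # sample an action index in proportion to the policy's probabilities
        return np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())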