Problem Description
1. Implementing the policy gradient method in a Gym environment
The lunar lander's goal is to land correctly on the Moon (the pad between the two flags marks the target site). Using gradient descent and repeated gradient updates, the policy learns to produce actions with different probabilities depending on the rewards that follow them, as in the sketch below.
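The key quantity behind this update is the discounted return of each action: actions followed by high cumulative reward are made more probable. A minimal sketch of the discount-and-normalize step used by REINFORCE-style agents (the function name discount_and_normalize_rewards and the normalization epsilon are illustrative assumptions, not part of the listing that follows):

import numpy as np

def discount_and_normalize_rewards(episode_rewards, gamma=0.95):
    # accumulate rewards backwards: G_t = r_t + gamma * G_{t+1}
    discounted = np.zeros(len(episode_rewards))
    cumulative = 0.0
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted[t] = cumulative
    # normalize so the gradient updates are better conditioned
    discounted -= np.mean(discounted)
    discounted /= (np.std(discounted) + 1e-8)
    return discounted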
2. Implementing the deep deterministic policy gradient (DDPG) method in a Gym environment
This algorithm builds two networks, an actor network and a critic network, and additionally defines a target network for each of them; the simulation is then driven by the deep deterministic policy gradient updates. The target copies are refreshed softly, as in the sketch below.
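Both target copies track their online networks slowly, which keeps the critic's bootstrapped targets stable. A minimal sketch of that soft (Polyak) update, assuming the parameters are held as NumPy arrays (the names soft_update, target_params, online_params, and the value tau=0.001 are illustrative, not taken from the code in this post):

import numpy as np

def soft_update(target_params, online_params, tau=0.001):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_params, target_params)]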
Code Implementation
Method ①
# Policy gradient
import tensorflow as tf
import numpy as np
import gym
import time
# define the PolicyGradient agent
class PolicyGradient:
    def __init__(self, n_x, n_y, learning_rate=0.01, reward_decay=0.95):
        '''Initialize all variables.'''
        # number of states in the environment
        self.n_x = n_x
        # number of actions in the environment
        self.n_y = n_y
        # learning rate of the network
        self.lr = learning_rate
        # discount factor
        self.gamma = reward_decay
        # initialize the lists for storing observations,
        # actions, and rewards
        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []
        # build the network
        self.build_network()
        # store the cost, i.e., the loss
        self.cost_history = []
        # initialize the TensorFlow session
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
    def store_transition(self, s, a, r):
        '''Store a transition, that is, state, action, and reward.'''
        self.episode_observations.append(s)
        self.episode_rewards.append(r)
        # store the action as a one-hot array
        action = np.zeros(self.n_y)
        action[a] = 1
        self.episode_actions.append(action)
    def choose_action(self, observation):
        '''Choose an action given the state.'''
        # reshape observation to (num_features, 1)
        observation = observation[:, np.newaxis]
        # run forward propagation to get the softmax probabilities
        prob_weights = self.sess.run(self.outputs_softmax, feed_dict={self.X: observation})
        # sample an action according to the softmax probabilities
        action = np.random.choice(range(len(prob_weights.ravel())), p=prob_weights.ravel())
        return action
    def build_network(self):
        '''Build the neural network.'''
        # placeholders for input X and output Y
        tf.compat.v1.disable_eager_execution()
        self.X = tf.compat.v1.placeholder(tf.float32, shape=(self.n_x, None), name="X")
        self.Y = tf.compat.v1.placeholder(tf.float32, shape=(self.n_y, None), name="Y")
        # placeholder for the discounted, normalized episode rewards
        self.discounted_episode_rewards_norm = tf.compat.v1.placeholder(tf.float32, [None, ], name="actions_value")
        # build 3 layers
        units_layers_1 = 10
        units_layers_2 = 10
        # number of neurons in the output layer
        units_output_layers = self.n_y
        # initialize weights and biases using Xavier (Glorot) initialization
        # (assumed completion: the original listing is cut off at W1)
        W1 = tf.compat.v1.get_variable("W1", [units_layers_1, self.n_x],
                                       initializer=tf.compat.v1.glorot_normal_initializer(seed=1))