import gym
import numpy as np


def policy_evaluation(env, policy, gamma=0.9, threshold=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    # Initialize the value function
    values = np.zeros(num_states)
    while True:
        delta = 0
        for state in range(num_states):
            v = values[state]
            # Update the value function according to the Bellman equation
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:
                    q_values[action] += prob * (reward + gamma * values[next_state])
            # Update the value of the current state under the given policy
            values[state] = np.sum(policy[state] * q_values)
            delta = max(delta, np.abs(v - values[state]))
        if delta < threshold:
            break
    return values

# Write the required code between the Begin-End markers below, following the hints in the comments
########## Begin ##########
def policy_iteration(env, gamma=0.9, max_iterations=10000):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    # Step 1: initialize the policy (uniform random over all actions)
    policy = np.ones((num_states, num_actions)) / num_actions
    for _ in range(max_iterations):
        # Policy evaluation
        values = policy_evaluation(env, policy, gamma)
        policy_stable = True
        for state in range(num_states):
            # Step 2: record the old action for the current state
            old_action = np.argmax(policy[state])
            # Step 3: initialize the action-value function (q_values) as a zero vector
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:
                    # Step 4: compute the Q-value of the current action via the Bellman equation
                    q_values[action] += prob * (reward + gamma * values[next_state])
            # Step 5: choose the action with the largest Q-value as the new action for this state
            new_action = np.argmax(q_values)
            if old_action != new_action:
                policy_stable = False
            policy[state] = np.eye(num_actions)[new_action]
        if policy_stable:
            break
    return values

def value_iteration(env, gamma=0.9, max_iterations=10000):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    # Initialize the value function
    values = np.zeros(num_states)
    for _ in range(max_iterations):
        delta = 0
        for state in range(num_states):
            # Step 6: record the old value of the current state
            v = values[state]
            # Update the value function
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:
                    q_values[action] += prob * (reward + gamma * values[next_state])
            # Step 7: update the value function with the maximum Q-value
            values[state] = np.max(q_values)
            # Step 8: track the largest change of the value function
            delta = max(delta, np.abs(v - values[state]))
        if delta < 1e-6:
            break
    # Derive the optimal (greedy) policy from the final value function
    policy = np.zeros((num_states, num_actions))
    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            for prob, next_state, reward, _ in env.P[state][action]:
                q_values[action] += prob * (reward + gamma * values[next_state])
        best_action = np.argmax(q_values)
        policy[state][best_action] = 1.0
    return values

########## End ##########

env = gym.make('FrozenLake-v0')
# Run policy iteration
v1 = policy_iteration(env)
# Run value iteration
v2 = value_iteration(env)
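
# Optional sanity check (not part of the original exercise): for the same MDP and
# discount factor, policy iteration and value iteration should converge to the same
# optimal state values, so the two result vectors are expected to agree closely.
print("Policy iteration values:\n", v1)
print("Value iteration values:\n", v2)
print("Values agree:", np.allclose(v1, v2, atol=1e-4))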