Level 3: Policy Iteration and Value Iteration

import gym
import numpy as np

def policy_evaluation(env, policy, gamma=0.9, threshold=1e-6):
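    """Iterative policy evaluation: compute V(s) for the given stochastic policy on a tabular env."""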
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Initialize the value function to zeros
    values = np.zeros(num_states)

    while True:
        delta = 0

        for state in range(num_states):
            v = values[state]

            # Compute the Q-value of each action with a Bellman expectation backup
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:
                    q_values[action] += prob * (reward + gamma * values[next_state])

            # Update the state value as the policy-weighted sum of Q-values
            values[state] = np.sum(policy[state] * q_values)

            delta = max(delta, np.abs(v - values[state]))

        if delta < threshold:
            break

    return values
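
For reference, the update implemented in policy_evaluation is the Bellman expectation backup, applied in place over all states until the largest change falls below the threshold:

$$V(s) \leftarrow \sum_a \pi(a \mid s) \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma V(s')\bigr]$$

Here the transition tuples (prob, next_state, reward, done) are read from env.P[state][action].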

# Write the correct code between Begin and End below, following the hints given in the comments
########## Begin ##########

def policy_iteration(env, gamma=0.9, max_iterations=10000):
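    """Policy iteration: alternate policy evaluation and greedy policy improvement until the policy is stable."""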
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Step 1: initialize the policy as a uniform distribution over actions
    policy = np.ones((num_states, num_actions)) / num_actions


    for _ in range(max_iterations):
        # Policy evaluation: compute the value function of the current policy
        values = policy_evaluation(env, policy, gamma)

        policy_stable = True

        for state in range(num_states):

            # Step 2: record the current (old) greedy action for this state
            old_action = np.argmax(policy[state])


            # Step 3: initialize the action-value vector (q_values) to zeros
            q_values = np.zeros(num_actions)
            

            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:

                    # Step 4: accumulate the Q-value of this action via the Bellman equation
                    q_values[action] += prob * (reward + gamma * values[next_state])


            # Step 5: pick the action with the largest Q-value as the new action for this state
            new_action = np.argmax(q_values)


            if old_action != new_action:
                policy_stable = False

            policy[state] = np.eye(num_actions)[new_action]

        if policy_stable:
            break

    return values


def value_iteration(env, gamma=0.9, max_iterations=10000):
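    """Value iteration: repeatedly apply the Bellman optimality backup, then read off a greedy policy."""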
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Initialize the value function to zeros
    values = np.zeros(num_states)

    for _ in range(max_iterations):
        delta = 0

        for state in range(num_states):

            # Step 6: record the old value of this state
            v = values[state]


            # Compute the Q-value of each action under the current value estimate
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, _ in env.P[state][action]:
                    q_values[action] += prob * (reward + gamma * values[next_state])

            # Step 7: update the state value with the maximum Q-value
            values[state] = np.max(q_values)


            # Step 8: track the largest change in the value function
            delta = max(delta, np.abs(v - values[state]))


        if delta < 1e-6:
            break

    # Derive the greedy (optimal) policy from the final value function
    # (built for completeness; only the values are returned, matching the calls below)
    policy = np.zeros((num_states, num_actions))
    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            for prob, next_state, reward, _ in env.P[state][action]:
                q_values[action] += prob * (reward + gamma * values[next_state])
        best_action = np.argmax(q_values)
        policy[state][best_action] = 1.0
    return values

########## End ##########
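
Both functions rest on the same two updates. Policy iteration's improvement step replaces the old action with the greedy one,

$$\pi'(s) = \arg\max_a \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma V^{\pi}(s')\bigr],$$

while value iteration applies the Bellman optimality backup directly to the value estimates,

$$V(s) \leftarrow \max_a \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma V(s')\bigr].$$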

env = gym.make('FrozenLake-v0')  # 'FrozenLake-v0' exists in older gym releases; newer gym/gymnasium registers 'FrozenLake-v1'

# Run policy iteration
v1 = policy_iteration(env)

# Run value iteration
v2 = value_iteration(env)
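
As a quick sanity check (a minimal sketch, assuming the default 4x4 map and that both functions return the value array as above), the two methods should converge to nearly the same values:

# Hypothetical check: both algorithms should agree up to the stopping threshold
print(np.allclose(v1, v2, atol=1e-4))
# Reshape into FrozenLake's 4x4 grid of state values for easier inspection
print(v1.reshape(4, 4))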



