强化学习(Sarsa与Q-learning)代码示例

本文介绍了Sarsa和Q-learning两种强化学习算法。Sarsa是on-policy算法,其探索相对保守。示例代码显示,Sarsa的agent会避开陷阱。而Q-learning是off-policy算法,探索更积极,其agent紧贴陷阱行走,最终效果出色。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

 一、Sarsa算法

        Sarsa算法是on-policy的:行为策略与目标策略是同一个策略,即它优化的正是自己实际执行的(带探索的)策略,所以使用Sarsa算法的agent在探索时显得有点“胆小”。

上Sarsa的代码:

import time
import numpy as np
import gym
class SarsaAgent(object):
    """Tabular SARSA agent with an epsilon-greedy behaviour policy.

    The action-value table ``Q`` has shape ``(obs_n, act_n)`` and starts at
    zero; observations and actions are used directly as indices into it.
    """

    def __init__(self, obs_n, act_n, e_greed=0.1, gamma=0.9, learning_rate=0.01):
        """Store the hyper-parameters and allocate the zero-filled Q-table.

        Args:
            obs_n: Number of rows of the Q-table (one per observation).
            act_n: Number of columns of the Q-table (one per action).
            e_greed: Exploration rate used by ``sample``.
            gamma: Discount factor applied to the next state-action value.
            learning_rate: Step size of the update performed by ``learn``.
        """
        self.act_n = act_n
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = e_greed
        self.Q = np.zeros((obs_n, act_n))

    def sample(self, obs):
        """Return an epsilon-greedy action for ``obs``.

        One uniform draw from [0, 1) decides the branch: above
        ``1 - epsilon`` a uniformly random action is returned, otherwise
        the greedy action from ``predict``.
        """
        explore = np.random.uniform(0, 1) > (1.0 - self.epsilon)
        if not explore:
            return self.predict(obs)
        return np.random.choice(self.act_n)

    def predict(self, obs):
        """Return a greedy action for ``obs``, breaking ties at random."""
        row = self.Q[obs, :]
        # Every action whose value equals the row maximum is a candidate.
        best_actions = np.flatnonzero(row == row.max())
        return np.random.choice(best_actions)

    def learn(self, obs, action, next_obs, next_action, reward, done):
        """Apply one SARSA update to ``Q[obs, action]``.

        The target is ``reward`` alone when ``done`` is true, otherwise
        ``reward + gamma * Q[next_obs, next_action]``.
        """
        current = self.Q[obs, action]
        if done:
            # Terminal transition: there is no successor value to bootstrap from.
            target = reward
        else:
            target = reward + self.gamma * self.Q[next_obs, next_action]
        self.Q[obs, action] += self.lr * (target - current)

def run_episode(env, agent, render=False):
    """Run one SARSA training episode and return ``(total_reward, total_steps)``.

    Works with both environment API generations:

    * legacy gym: ``reset() -> obs`` and
      ``step() -> (obs, reward, done, info)``;
    * gym >= 0.26 / gymnasium: ``reset() -> (obs, info)`` and
      ``step() -> (obs, reward, terminated, truncated, info)``.

    Args:
        env: Environment exposing ``reset``, ``step`` and (if ``render`` is
            true) ``render``.
        agent: Object exposing ``sample(obs)`` and
            ``learn(obs, action, next_obs, next_action, reward, done)``.
        render: When true, ``env.render()`` is called after every step.

    Returns:
        Tuple of the summed reward and the number of steps taken.
    """
    total_reward = 0
    total_steps = 0

    reset_out = env.reset()
    # The newer API returns an (obs, info-dict) pair; the legacy one the obs alone.
    if (isinstance(reset_out, tuple) and len(reset_out) == 2
            and isinstance(reset_out[1], dict)):
        obs = reset_out[0]
    else:
        obs = reset_out
    action = agent.sample(obs)

    while True:
        step_out = env.step(action)
        if len(step_out) == 5:
            next_obs, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_obs, reward, done, _ = step_out
            terminated = done

        # SARSA is on-policy: the next action is chosen before the update
        # and is the one actually executed on the following iteration.
        next_action = agent.sample(next_obs)
        # Only a true terminal state cuts off bootstrapping; a time-limit
        # truncation ends the episode but the successor value still counts.
        agent.learn(obs, action, next_obs, next_action, reward, terminated)

        total_reward += reward
        total_steps += 1
        obs = next_obs
        action = next_action
        if render:
            env.render()
        if done:
            break
    return total_reward, total_steps

def test_episode(env, agent, render=True, step_delay=0.5):
    """Run one greedy (exploration-free) episode and return its total reward.

    Actions come from ``agent.predict`` only and the agent is not updated.
    Both the legacy gym API (``reset() -> obs``, 4-tuple ``step``) and the
    gym >= 0.26 / gymnasium API (``reset() -> (obs, info)``, 5-tuple
    ``step``) are accepted.

    Args:
        env: Environment exposing ``reset``, ``step`` and (if ``render`` is
            true) ``render``.
        agent: Object exposing ``predict(obs)``.
        render: When true (the default), ``env.render()`` is called after
            every step.
        step_delay: Seconds to pause after every step so the rendering can
            be followed; ``0`` disables the pause.

    Returns:
        The sum of the rewards collected during the episode.
    """
    total_reward = 0

    reset_out = env.reset()
    # The newer API returns an (obs, info-dict) pair; the legacy one the obs alone.
    if (isinstance(reset_out, tuple) and len(reset_out) == 2
            and isinstance(reset_out[1], dict)):
        obs = reset_out[0]
    else:
        obs = reset_out

    while True:
        action = agent.predict(obs)
        step_out = env.step(action)
        if len(step_out) == 5:
            obs, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            obs, reward, done, _ = step_out
        total_reward += reward
        if step_delay > 0:
            time.sleep(step_delay)
        if render:
            env.render()
        if done:
            break
    return total_reward


# The name matches pytest's ``test_*`` pattern; opt out of collection so this
# helper is not picked up as a test case when imported into a test module.
test_episode.__test__ = False




if __name__ == "__main__":
    # Train a SARSA agent on CliffWalking for 500 episodes, then replay one
    # greedy episode. Guarded so that importing this module does not start a
    # training run as a side effect.
    env = gym.make('CliffWalking-v0')
    try:
        agent = SarsaAgent(obs_n=env.observation_space.n,
                           act_n=env.action_space.n,
                           learning_rate=0.1,
                           gamma=0.9,
                           e_greed=0.1)
        for episode in range(500):
            ep_reward, ep_step = run_episode(env, agent, False)
            print("episode:{}, step:{}, reward:{}".format(episode, ep_step, ep_reward))

        test_reward = test_episode(env, agent)
        print("test_reward:{}".format(test_reward))
    finally:
        # Release the environment's resources even if training is interrupted.
        env.close()

结果:agent在前往终点的路途上,会远远地绕过陷阱

episode:0, step:63, reward:-261
episode:1, step:837, reward:-2124
episode:2, step:225, reward:-324
episode:3, step:133, reward:-133
episode:4, step:283, reward:-481
episode:5, step:213, reward:-312
episode:6, step:74, reward:-74
episode:7, step:290, reward:-389
episode:8, step:185, reward:-284
episode:9, step:221, reward:-419
episode:10, step:196, reward:-295
episode:11, step:36, reward:-36
episode:12, step:212, reward:-410
episode:13, step:117, reward:-216
episode:14, step:79, reward:-79
episode:15, step:96, reward:-96
episode:16, step:47, reward:-47
episode:17, step:75, reward:-75
episode:18, step:79, reward:-79
episode:19, step:170, reward:-170
episode:20, step:55, reward:-55
episode:21, step:75, reward:-273
episode:22, step:99, reward:-198
episode:23, step:53, reward:-152
episode:24, step:53, reward:-53
episode:25, step:108, reward:-207
episode:26, step:136, reward:-334
episode:27, step:74, reward:-74
episode:28, step:61, reward:-61
episode:29, step:76, reward:-76
episode:30, step:81, reward:-81
episode:31, step:72, reward:-171
episode:32, step:104, reward:-104
episode:33, step:58, reward:-58
episode:34, step:48, reward:-48
episode:35, step:31, reward:-31
episode:36, step:74, reward:-74
episode:37, step:117, reward:-216
episode:38, step:38, reward:-38
episode:39, step:68, reward:-167
episode:40, step:46, reward:-46
episode:41, step:61, reward:-61
episode:42, step:93, reward:-291
episode:43, step:170, reward:-368
episode:44, step:67, reward:-67
episode:45, step:139, reward:-238
episode:46, step:174, reward:-372
episode:47, step:37, reward:-37
episode:48, step:174, reward:-273
episode:49, step:39, reward:-39
episode:50, step:58, reward:-58
episode:51, step:79, reward:-178
episode:52, step:55, reward:-55
episode:53, step:86, reward:-86
episode:54, step:74, reward:-74
episode:55, step:67, reward:-67
episode:56, step:67, reward:-67
episode:57, step:57, reward:-57
episode:58, step:71, reward:-71
episode:59, step:123, reward:-123
episode:60, step:66, reward:-66
episode:61, step:85, reward:-85
episode:62, step:50, reward:-50
episode:63, step:80, reward:-80
episode:64, step:72, reward:-72
episode:65, step:50, reward:-50
episode:66, step:73, reward:-73
episode:67, step:38, reward:-38
episode:68, step:72, reward:-72
episode:69, step:41, reward:-41
episode:70, step:107, reward:-206
episode:71, step:129, reward:-228
episode:72, step:27, reward:-27
episode:73, step:67, reward:-67
episode:74, step:73, reward:-172
episode:75, step:56, reward:-56
episode:76, step:47, reward:-47
episode:77, step:67, reward:-67
episode:78, step:34, reward:-34
episode:79, step:51, reward:-150
episode:80, step:68, reward:-68
episode:81, step:74, reward:-173
episode:82, step:38, reward:-38
episode:83, step:35, reward:-35
episode:84, step:61, reward:-61
episode:85, step:64, reward:-163
episode:86, step:49, reward:-49
episode:87, step:63, reward:-63
episode:88, step:49, reward:-49
episode:89, step:51, reward:-51
episode:90, step:49, reward:-49
episode:91, step:46, reward:-46
episode:92, step:50, reward:-50
episode:93, step:31, reward:-31
episode:94, step:63, reward:-162
episode:95, step:40, reward:-40
episode:96, step:53, reward:-53
episode:97, step:33, reward:-33
episode:98, step:58, reward:-157
episode:99, step:43, reward:-43
episode:100, step:44, reward:-44
episode:101, step:38, reward:-38
episode:102, step:27, reward:-27
episode:103, step:33, reward:-33
episode:104, step:29, reward:-29
episode:105, step:24, reward:-24
episode:106, step:38, reward:-38
episode:107, step:35, reward:-35
episode:108, step:65, reward:-65
episode:109, step:25, reward:-25
episode:110, step:21, reward:-120
episode:111, step:25, reward:-25
episode:112, step:44, reward:-143
episode:113, step:31, reward:-130
episode:114, step:33, reward:-33
episode:115, step:24, reward:-24
episode:116, step:23, reward:-23
episode:117, step:47, reward:-47
episode:118, step:40, reward:-40
episode:119, step:31, reward:-31
episode:120, step:65, reward:-164
episode:121, step:21, reward:-21
episode:122, step:29, reward:-29
episode:123, step:19, reward:-19
episode:124, step:31, reward:-31
episode:125, step:50, reward:-149
episode:126, step:35, reward:-35
episode:127, step:33, reward:-33
episode:128, step:32, reward:-32
episode:129, step:58, reward:-58
episode:130, step:31, reward:-31
episode:131, step:21, reward:-21
episode:132, step:53, reward:-53
episode:133, step:34, reward:-34
episode:134, step:24, reward:-24
episode:135, step:24, reward:-24
episode:136, step:40, reward:-40
episode:137, step:25, reward:-25
episode:138, step:41, reward:-41
episode:139, step:21, reward:-21
episode:140, step:23, reward:-23
episode:141, step:42, reward:-42
episode:142, step:23, reward:-23
episode:143, step:34, reward:-34
episode:144, step:29, reward:-29
episode:145, step:31, reward:-31
episode:146, step:23, reward:-23
episode:147, step:27, reward:-27
episode:148, step:25, reward:-25
episode:149, step:31, reward:-130
episode:150, step:22, reward:-22
episode:151, step:21, reward:-21
episode:152, step:25, reward:-25
episode:153, step:26, reward:-26
episode:154, step:27, reward:-27
episode:155, step:21, reward:-21
episode:156, step:26, reward:-26
episode:157, step:17, reward:-17
episode:158, step:30, reward:-129
episode:159, step:21, reward:-21
episode:160, step:29, reward:-128
episode:161, step:21, reward:-21
episode:162, step:21, reward:-21
episode:163, step:21, reward:-21
episode:164, step:24, reward:-24
episode:165, step:19, reward:-19
episode:166, step:15, reward:-15
episode:167, step:21, reward:-21
episode:168, step:33, reward:-33
episode:169, step:23, reward:-23
episode:170, step:23, reward:-23
episode:171, step:29, reward:-29
episode:172, step:22, reward:-22
episode:173, step:27, reward:-27
episode:174, step:27, reward:-27
episode:175, step:24, reward:-24
episode:176, step:30, reward:-30
episode:177, step:31, reward:-31
episode:178, step:24, reward:-321
episode:179, step:20, reward:-20
episode:180, step:28, reward:-28
episode:181, step:29, reward:-29
episode:182, step:37, reward:-37
episode:183, step:23, reward:-23
episode:184, step:30, reward:-30
episode:185, step:21, reward:-21
episode:186, step:17, reward:-17
episode:187, step:19, reward:-19
episode:188, step:23, reward:-23
episode:189, step:17, reward:-17
episode:190, step:21, reward:-21
episode:191, step:23, reward:-23
episode:192, step:30, reward:-30
episode:193, step:29,
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值