Simple Tic-Tac-Toe with Reinforcement Learning

A value-based tic-tac-toe agent: on each turn it evaluates every legal successor state and moves to the one with the highest learned state value, playing a random move with probability ε for exploration.
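
The learning rule used by the agent is a TD(0)-style state-value update: after choosing the next board state (ε-greedy over all legal successors), the value of the previous state is nudged toward the discounted value of the new one, V(s) ← V(s) + α·(γ·V(s') − V(s)), while terminal states are later pinned to 1 / −1 / 0.5 in the training loop. Below is a minimal illustrative sketch of just that update; the names td_update and the dict value are hypothetical stand-ins for Agent.move and the 3^9 value table in the full program that follows.

# Minimal sketch of the TD(0)-style update used inside Agent.move below
# (a dict stands in for the full 3^9 numpy value table of the real program).
alpha, gamma = 0.1, 1.0      # learning rate and discount, matching the Agent defaults
value = {}                   # board state (a 9-tuple of 0/1/2) -> estimated value

def td_update(prev_state, next_state):
    v_prev = value.get(prev_state, 0.0)
    v_next = value.get(next_state, 0.0)
    # V(s) <- V(s) + alpha * (gamma * V(s') - V(s))
    value[prev_state] = v_prev + alpha * (gamma * v_next - v_prev)

# Example: if the successor state is already known to be a win (value 1),
# the previous state's value moves 10% of the way toward it.
empty = (0,) * 9
after_center = (0, 0, 0, 0, 1, 0, 0, 0, 0)
value[after_center] = 1.0
td_update(empty, after_center)
print(value[empty])   # 0.1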

import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import tkinter as tk

class Agent:
    def __init__(self, _index, Alpha, Epsilon, Gamma):
        self.index = _index        # this agent's mark on the board (1 or 2)
        self.alpha = Alpha         # learning rate
        self.epsilon = Epsilon     # exploration probability for epsilon-greedy moves
        self.gamma = Gamma         # discount factor
        self.States = np.zeros(9).astype(np.int8)            # last board state this agent produced
        self.value = np.zeros((3, 3, 3, 3, 3, 3, 3, 3, 3))   # state-value table over all 3^9 boards

    def reset(self):
        self.States = np.zeros(9).astype(np.int8)
    
    def move(self, state):
        Outcome = state.copy()

        mask = np.where(Outcome == 0)[0]   # indices of the empty cells

        if np.random.binomial(n=1, p=self.epsilon):
            # explore: play a random empty cell
            Outcome[mask[np.random.randint(0, len(mask))]] = self.index
        else:
            # exploit: evaluate every legal successor state and pick the most valuable one
            temp_value = np.zeros(len(mask))

            for i in range(len(mask)):
                Cnt_state = state.copy()
                Cnt_state[mask[i]] = self.index

                temp_value[i] = self.value[tuple(Cnt_state)]

            choose = np.where(temp_value == np.max(temp_value))[0]   # break ties randomly
            Outcome[mask[choose[np.random.randint(0, len(choose))]]] = self.index

        # TD(0)-style update: V(s) <- V(s) + alpha * (gamma * V(s') - V(s))
        Error = self.value[tuple(Outcome)] * self.gamma - self.value[tuple(self.States)]
        self.value[tuple(self.States)] += self.alpha * Error
        self.States = Outcome.copy()

        return Outcome
    
    def Judge(self, state):
        # board indices:
        # 0 1 2
        # 3 4 5
        # 6 7 8
        win_patterns = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # three rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # three columns
            [0, 4, 8], [2, 4, 6]              # two diagonals
        ]

        if any(all(state[i] == 1 for i in pattern) for pattern in win_patterns):
            return 1   # player 1 wins
        elif any(all(state[i] == 2 for i in pattern) for pattern in win_patterns):
            return 2   # player 2 wins
        if 0 not in state:
            return 3   # draw: board full with no winner
        return 0       # game still in progress

def is_empty_file(file_path):
    # treat a missing file the same as an empty one so fresh agents are created instead
    return (not os.path.exists(file_path)) or os.stat(file_path).st_size == 0

# main
mode = int(input("Select mode: 0 train from scratch, 1 memory mode (continue training from saved models), 2 play against the AI\n"))

# reuse previously trained agents when valid model files exist; otherwise start fresh
if (mode == 1 or mode == 2) and (not is_empty_file("./model1.pkl")) and (not is_empty_file("./model2.pkl")):
    with open("./model1.pkl", "rb") as f1:
        agent1 = pickle.load(f1)
    with open("./model2.pkl", "rb") as f2:
        agent2 = pickle.load(f2)
else:
    agent1 = Agent(_index=1, Alpha=0.1, Epsilon=0.1, Gamma=1)
    agent2 = Agent(_index=2, Alpha=0.1, Epsilon=0.1, Gamma=1)

if mode == 0 or mode == 1:
    trial = 30000
    Winners = np.zeros(trial).astype(np.int8)

    for i in range(trial):
        agent1.reset()
        agent2.reset()

        # switch off exploration for the last 10000 games so the learned policies settle
        if i == 20000:
            agent1.epsilon = 0
            agent2.epsilon = 0

        winner = 0
        State = np.zeros(9).astype(np.int8)

        while winner == 0:
            Outcome = agent1.move(State)
            winner = agent1.Judge(Outcome)

            # terminal values: win = 1, loss = -1, draw = 0.5
            if winner == agent1.index:
                agent1.value[tuple(Outcome)] = 1
                agent2.value[tuple(State)] = -1
            elif winner == 3:
                agent1.value[tuple(Outcome)] = 0.5
                agent2.value[tuple(State)] = 0.5
            else:
                State = agent2.move(Outcome)
                winner = agent2.Judge(State)

                if winner == agent2.index:
                    agent2.value[tuple(State)] = 1
                    agent1.value[tuple(Outcome)] = -1
                elif winner == 3:
                    agent1.value[tuple(Outcome)] = 0.5
                    agent2.value[tuple(State)] = 0.5

        Winners[i] = winner

    # save the trained agents
    try:
        if mode == 1 or mode == 0:
            with open("./model1.pkl", "wb") as f1:
                pickle.dump(agent1, f1)
            with open("./model2.pkl", "wb") as f2:
                pickle.dump(agent2, f2)
    except Exception as e:
        print(f"Error saving models: {e}")

    # win/draw rates over a sliding window of `duration` games, advanced `step` games at a time
    step = 250
    duration = 500

    def Rate(Winner):
        Rate1 = np.zeros(int((trial - duration) / step) + 1)
        Rate2 = np.zeros(int((trial - duration) / step) + 1)
        Rate3 = np.zeros(int((trial - duration) / step) + 1)
        for i in range(len(Rate1)):
            Rate1[i] = np.sum(Winner[step * i:duration + step * i] == 1) / duration   # player 1 wins
            Rate2[i] = np.sum(Winner[step * i:duration + step * i] == 2) / duration   # player 2 wins
            Rate3[i] = np.sum(Winner[step * i:duration + step * i] == 3) / duration   # draws
        return Rate1, Rate2, Rate3

    Rate1, Rate2, Rate3 = Rate(Winners)

    plt.figure(figsize=(10, 6))
    plt.plot(Rate1, marker='.', label="Rate 1 (Player 1)")
    plt.plot(Rate2, marker='.', label="Rate 2 (Player 2)")
    plt.plot(Rate3, marker='.', label="Rate 3 (Draw)")
    # relabel the x axis from window index to thousands of games (40 windows * 250 games per step = 10000)
    plt.xticks(np.arange(0, 121, 40), np.arange(0, 32, 10), fontsize=30)
    plt.yticks(np.arange(0, 1.1, 0.2), np.round(np.arange(0, 1.1, 0.2), 2), fontsize=30)
    # add title, axis labels and legend
    plt.title("Winning Rate of Players Over Time")
    plt.xlabel("Games (thousands)")
    plt.ylabel("Winning Rate")
    plt.legend()
    plt.grid(True)

    # show the plot
    plt.show()
else:
    def play_game():
        global canvas
        current_sign = ["X"]   # "X" always moves first; the sign flips after every draw() call
        AI_code = 0
        Player_code = 0

        if np.random.binomial(1, p=0.5):   # AI moves first
            AI_code = 1
            Player_code = 2
            current_player = [1]
            agent = agent1
        else:
            AI_code = 2
            Player_code = 1
            current_player = [1]
            agent = agent2

        agent.reset()   # start the agent from the empty board for this game
        State = np.zeros(9).astype(np.int8)

        canvas.delete("all")

        # draw the 3x3 grid
        for i in range(1, 3):
            canvas.create_line(i * 100, 0, i * 100, 300, width=3)
            canvas.create_line(0, i * 100, 300, i * 100, width=3)

        def draw(x, y, player):
            if player == "X":
                canvas.create_line(x - 30, y - 30, x + 30, y + 30, width=3, fill="red")
                canvas.create_line(x + 30, y - 30, x - 30, y + 30, width=3, fill="red")
            else:
                canvas.create_oval(x - 30, y - 30, x + 30, y + 30, width=3, outline="blue")

            current_sign[0] = "O" if current_sign[0] == "X" else "X"

        def on_click(event):
            if current_player[0] == AI_code:
                return None

            row, col = event.y // 100, event.x // 100

            # only allow a move on a cell that is still empty in the game state
            # (checking State instead of a separate board prevents clicking over the AI's cells)
            if State[row * 3 + col] == 0:
                current_player[0] = AI_code
                State[row * 3 + col] = Player_code
                x, y = col * 100 + 50, row * 100 + 50

                draw(x, y, current_sign[0])

                # check whether the player has won (or the board is full)
                winner = agent.Judge(State)
                if winner:
                    end_game(winner)
                    return

                root.after(500, ai_move)  # let the AI move after 0.5 s

        def check_ai_turn():
            # if the AI was chosen to move first, schedule its opening move
            if current_player[0] == AI_code:
                root.after(500, ai_move)  # AI moves after a 0.5 s delay

        def ai_move():
            if current_player[0] == Player_code:
                return  # it is the player's turn, the AI must not move

            Outcome = agent.move(State)
            mask = np.where(Outcome != State)[0]   # the one cell the AI just filled

            if len(mask) > 0:
                x = mask[0] % 3 * 100 + 50
                y = mask[0] // 3 * 100 + 50
                draw(x, y, current_sign[0])

            # check whether the AI has won (or the board is full)
            winner = agent.Judge(Outcome)
            if winner:
                end_game(winner)
                return

            State[:] = Outcome                 # update the game state
            current_player[0] = Player_code    # back to the player

        def end_game(winner):
            canvas.unbind("<Button-1>")   # stop accepting clicks once the game is over
            if winner == Player_code:
                label.config(text="Player wins!")
            elif winner == AI_code:
                label.config(text="AI wins!")
            else:
                label.config(text="Draw!")

        canvas.bind("<Button-1>", on_click)

        check_ai_turn()
            
    root = tk.Tk()
    root.geometry("500x500")
    canvas = tk.Canvas(root, width=300, height=300, bg="white")
    canvas.pack()

    root.title("AI")
    label = tk.Label(root, text="Click Start to play", font=("Ubuntu", 20))
    label.pack(pady=20)
    button = tk.Button(root, text="Start", command=play_game, font=("Ubuntu", 20))
    button.pack(pady=10)
    reset_button = tk.Button(root, text="Restart", command=play_game, font=("Ubuntu", 20))
    reset_button.pack(pady=10)

    root.mainloop()
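
Once training has been run at least once (mode 0 or 1), the pickled agents can be sanity-checked outside the GUI. Below is a minimal sketch, assuming model1.pkl exists in the working directory and that the snippet runs where the Agent class is defined (pickle needs the class to rebuild the instance), e.g. placed right after the model-saving block or run in the same interactive session:

import pickle
import numpy as np

# load the trained first-player agent written by the training branch above
with open("./model1.pkl", "rb") as f:
    trained = pickle.load(f)

empty = tuple(np.zeros(9, dtype=np.int8))
print("Value of the empty board for player 1:", trained.value[empty])

# learned value of each possible opening move for player 1
for cell in range(9):
    board = np.zeros(9, dtype=np.int8)
    board[cell] = 1
    print("opening at cell", cell, "->", trained.value[tuple(board)])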
