import numpy as np
import pandas as pd
import time
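# Tabular Q-learning demo: an agent ('o') walks a 1-D corridor and learns to
# reach the treasure ('T') at the right end, rendered as e.g.  o----T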
np.random.seed(2)  # reproducible runs

N_STATES = 6                 # length of the 1-D world
ACTIONS = ["left", "right"]  # available actions
EPSILON = 0.9                # greedy policy: exploit with probability 0.9
ALPHA = 0.1                  # learning rate
GAMMA = 0.9                  # discount factor
MAX_EPISODES = 13            # number of training episodes
FRESH_TIME = 0.1             # seconds between display refreshes
def build_q_table(n_states, actions):
    """Create an all-zero Q-table: one row per state, one column per action."""
    table = pd.DataFrame(np.zeros((n_states, len(actions))), columns=actions)
    return table
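# With the defaults above, build_q_table(N_STATES, ACTIONS) yields a 6x2
# DataFrame of zeros whose columns are 'left' and 'right'; the values are
# filled in as the agent trains.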
def choose_action(state, table):
    """Epsilon-greedy action selection."""
    state_actions = table.iloc[state, :]
    # Explore when the random draw exceeds EPSILON or the state is still
    # unvisited (all Q-values zero); otherwise exploit the best-known action.
    if np.random.uniform() > EPSILON or (state_actions == 0).all():
        action = np.random.choice(ACTIONS)
    else:
        action = state_actions.idxmax()
    return action
def env_feedback(S, A):
    """Apply action A in state S; return the next state and the reward."""
    if A == "right":
        if S == N_STATES - 2:   # one step left of the treasure
            S_ = "terminal"
            reward = 1
        else:
            S_ = S + 1
            reward = 0
    else:                       # move left
        S_ = max(S - 1, 0)      # can't walk off the left edge
        reward = 0
    return S_, reward
def print_env(S, episode, step_counter):
    """Render the 1-D world, e.g. 'o----T', or report stats when S is terminal."""
    env_list = ['-'] * (N_STATES - 1) + ['T']
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r' + ' ' * len(interaction), end='')  # wipe the line
    else:
        env_list[S] = 'o'
        print('\r{}'.format(''.join(env_list)), end='')
        time.sleep(FRESH_TIME)
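# Q-learning update applied below (off-policy TD control):
#   Q(S, A) <- Q(S, A) + ALPHA * (R + GAMMA * max_a Q(S_, a) - Q(S, A))
# with the bootstrap term GAMMA * max_a Q(S_, a) dropped when S_ is terminal.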
def q_learning():
    """Train for MAX_EPISODES episodes and return the learned Q-table."""
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        is_terminated = False
        step_counter = 0
        S = 0  # every episode starts at the left end
        print_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, r = env_feedback(S, A)
            q_predict = q_table.loc[S, A]
            if S_ == "terminal":
                q_target = r  # no future reward beyond the terminal state
                is_terminated = True
                print(q_table)
                print('Episode ' + str(episode + 1))
                print('Total steps: ' + str(step_counter + 1))
            else:
                q_target = r + GAMMA * q_table.iloc[S_, :].max()
                print('Q target: ' + str(q_target))
                print('Max Q of next state: ' + str(q_table.iloc[S_, :].max()))
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # TD update
            S = S_
            step_counter += 1
            print_env(S, episode, step_counter)
            print('\nStep ' + str(step_counter))
            print('Updated Q-table:' + '\n' + str(q_table))
    return q_table
if __name__ == "__main__":
    q_table = q_learning()
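    # Expected behavior: after 13 episodes the 'right' column should carry the
    # larger Q-values in every state, so the greedy policy walks straight to 'T'.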