import numpy as np
# Hyperparameters for a toy tabular Q-learning demo.
state_size = 5  # number of discrete states (rows of the Q-table)
action_size = 1  # number of actions; NOTE(review): with only 1 action, exploration/argmax are degenerate — confirm intent
learning_rate = 0.5  # step size (alpha) in the TD update
discount_factor = 0.95  # gamma: weight on the best next-state value
epsilon = 0.1  # probability of taking a random action (epsilon-greedy)
max_iterations = 10000  # number of training episodes
# Q-value table, shape (state_size, action_size), zero-initialized.
q_table = np.zeros((state_size, action_size))
print(q_table)
def q_learning():
    """Train the module-level q_table with epsilon-greedy tabular Q-learning.

    Runs max_iterations episodes; each episode starts from a uniformly random
    state and steps until the current state matches a terminal state that is
    re-drawn on every check (see get_final_state).

    Returns:
        The module-level q_table after training.
    """
    for _ in range(max_iterations):
        state = np.random.randint(state_size)
        while True:
            # Epsilon-greedy: explore with probability epsilon, else exploit.
            if np.random.uniform() < epsilon:
                action = np.random.randint(action_size)
            else:
                action = np.argmax(q_table[state, :])
            nxt = get_next_state(state, action)
            update(state, action, nxt, get_reward(nxt))
            state = nxt
            # NOTE(review): the terminal state is re-randomized on every check,
            # so episode termination is stochastic — confirm this is intended.
            if state == get_final_state():
                break
    return q_table
def get_next_state(state, action):
    """Deterministic toy transition.

    Action 0 maps state 0 to itself and every other state to 1;
    any other action maps state 1 to itself and every other state to 0.
    """
    if action == 0:
        return 0 if state == 0 else 1
    return 1 if state == 1 else 0
def get_reward(next_state):
    """Return 1 when next_state equals the (randomly drawn) terminal state, else 0."""
    return 1 if next_state == get_final_state() else 0
def get_final_state():
    """Draw the terminal state uniformly from {0, 1} (re-randomized every call)."""
    coin = np.random.randint(2)
    return 1 if coin != 0 else 0
def update(state, action, next_state, reward):
    """Apply one TD(0) Q-learning update to the global q_table entry (state, action)."""
    current = q_table[state, action]
    best_next = np.max(q_table[next_state, :])
    # Move current estimate toward the bootstrapped target by learning_rate.
    td_error = reward + discount_factor * best_next - current
    q_table[state, action] = current + learning_rate * td_error
# Train and rebind the module-level Q-table to the trained result.
q_table = q_learning()
# Report the extremes of the learned values (labels are Chinese for
# "maximum/minimum of the Q-table").
print("Q-table的最大值:", np.max(q_table))
print("Q-table的最小值:", np.min(q_table))