强化学习经典算法笔记(十):使用粒子群算法训练Policy智能体
本文使用粒子群算法训练了一个小型Actor网络,共226个参数,完美解决了CartPole游戏。
粒子群算法实现
群体智能算法采用最简单的粒子群优化算法(PSO)。Python实现如下:
class PSO(object):
def __init__(self, population_size, max_steps, dim=2, x_bound=(-10, 10)):
    """Create a swarm with random initial positions and velocities.

    Args:
        population_size: Number of particles in the swarm.
        max_steps: Number of iterations to run.
        dim: Dimensionality of the search space.
        x_bound: ``(low, high)`` pair that initial positions are sampled
            from. The default is an immutable tuple so it cannot be
            shared and mutated across instances.
    """
    self.w = 0.6  # inertia weight
    # Coefficients of the personal-best and global-best attraction terms.
    self.c1 = self.c2 = 2
    self.population_size = population_size  # number of particles
    self.dim = dim  # dimensionality of the search space
    self.max_steps = max_steps  # number of iterations
    self.x_bound = x_bound  # range of the solution space

    # Initial positions, shape (population_size, dim). np.random.uniform
    # also accepts per-dimension bounds, e.g.
    # np.random.uniform([-1, 10, 20], [1, 15, 25], (3, 3)).
    self.x = np.random.uniform(
        self.x_bound[0],
        self.x_bound[1],
        (self.population_size, self.dim),
    )
    # Initial velocities, sampled from [0, 1).
    self.v = np.random.rand(self.population_size, self.dim)

    fitness = self.calculate_fitness(self.x)
    best_index = np.argmin(fitness)

    # Copy instead of aliasing: `self.p = self.x` would make the
    # personal-best table the very same array as the live positions, and
    # `self.x[i]` is a view into that buffer, so an in-place write to one
    # would silently change the others.
    self.p = self.x.copy()  # best position found by each particle
    self.pg = self.x[best_index].copy()  # best position found by the swarm
    self.individual_best_fitness = fitness  # best fitness of each particle
    self.global_best_fitness = np.min(fitness)  # best fitness of the swarm
def calculate_fitness(self, x, value_net=None, device=None):
    """Return, per row of ``x``, the squared distance to the all-ones point.

    Args:
        x: 2-D array with one candidate solution per row.
        value_net: Unused here. NOTE(review): apparently reserved for a
            learned fitness function evaluated through torch -- confirm.
        device: Unused here; see ``value_net``.

    Returns:
        1-D array holding ``sum((row - 1) ** 2)`` for every row of ``x``.
    """
    offset = x - 1
    return np.sum(offset ** 2, axis=1)
def evolve(self):
    """Run ``self.max_steps`` swarm iterations, printing progress each step.

    Every iteration draws fresh random factors, updates velocities and
    positions, re-evaluates fitness, and refreshes the per-particle and
    swarm-wide bests (lower fitness is better). Nothing is returned; the
    results are left in ``self.p``, ``self.pg``,
    ``self.individual_best_fitness`` and ``self.global_best_fitness``.

    Per-step scatter plotting of the swarm was disabled in the original
    because it is slow.
    """
    for _ in range(self.max_steps):
        rand_personal = np.random.rand(self.population_size, self.dim)
        rand_global = np.random.rand(self.population_size, self.dim)

        # Velocity update: inertia + pull toward each particle's own best
        # + pull toward the swarm's best. Then move the particles.
        inertia = self.w * self.v
        pull_personal = self.c1 * rand_personal * (self.p - self.x)
        pull_global = self.c2 * rand_global * (self.pg - self.x)
        self.v = inertia + pull_personal + pull_global
        # Rebind rather than add in place so arrays that still reference
        # the previous positions are left untouched.
        self.x = self.x + self.v

        fitness = self.calculate_fitness(self.x)

        # Particles that beat their own previous best.
        improved = fitness < self.individual_best_fitness
        self.p[improved] = self.x[improved]
        self.individual_best_fitness[improved] = fitness[improved]

        # A new swarm-wide minimum appeared: record its fitness and position.
        step_best = np.min(fitness)
        if step_best < self.global_best_fitness:
            self.pg = self.x[np.argmin(fitness)]
            self.global_best_fitness = step_best

        print('best fitness: %.5f, mean fitness: %.5f' % (self.global_best_fitness, np.mean(fitness)))
def evolve_step(self):
r1 = np.random.rand(self.population_size, self.dim)
r2 = np.random.rand(self.population_size, self.dim)
# 更新速度和权重
self.v = self.w*self.v+self.c1*r1*(self.p-self.x)+self.c2*r2*(self.pg-self.x)
self.x = self.v + self.x
fitness = self.calculate_fitness(self.x)
# 需要更新的个体
update_id = np.greater(self.individual_best_fitness, fitness)
self.p[update_id] = self.x[update_id]
self.individual_best_fitness[update_id] = fitness[update_id]
# 新一代出现了更小的fitness,所以更新全局最优fitness和位置
if np.min(fitness) < self.global_best_fitness:
self.pg

最低0.47元/天 解锁文章
164

被折叠的 条评论
为什么被折叠?



