1. Info

Target devices will be reformatted and wiped of any data; make sure you have backups.

jdbc:mysql://ip:port/dbName?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull
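
The parameters in this connection URL turn on Unicode with UTF-8 character encoding and make the driver return NULL for MySQL "zero" datetime values instead of throwing an error. Below is a minimal sketch of how such a URL might be passed to the MySQL Connector/J driver; the host, port, database name, and credentials are illustrative placeholders, not values from the original post.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class MysqlConnectionDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder host, port, and database name; substitute your own.
        String url = "jdbc:mysql://127.0.0.1:3306/testDb"
                + "?useUnicode=true&characterEncoding=utf-8"
                + "&zeroDateTimeBehavior=convertToNull";
        // Recent Connector/J versions register the driver automatically.
        try (Connection conn = DriverManager.getConnection(url, "user", "password");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT 1")) {
            while (rs.next()) {
                System.out.println(rs.getInt(1));
            }
        }
    }
}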

Posted on 2014-12-13 20:15 by 秦瑞It行程实录

Reposted from: https://www.cnblogs.com/ruiy/p/4161789.html

def run_marl(self):
    self.init_saved_model()
    # PPO collects several episodes per update; other algorithms collect one episode at a time.
    run_episode = self.train_config.run_episode_before_train if "ppo" in self.env_config.learn_policy else 1
    for epoch in range(self.current_epoch, self.train_config.epochs + 1):
        # Take some actions before the actual training step and store the transitions in the memory unit.
        # Unlike regular MARL algorithms, the grid_wise_control family uses grid cells as the observation space.
        # PPO is an on-policy algorithm, so the training data must come from the current policy.
        total_reward = 0
        if "grid_wise_control" in self.env_config.learn_policy and isinstance(self.batch_episode_memory, GridBatchEpisodeMemory):
            # Grid-based collection: observations are grid tensors plus approximate agent positions.
            for i in range(run_episode):
                self.env.reset()
                finish_game = False
                cycle = 0
                while not finish_game and cycle < self.env_config.max_cycles:
                    grid_input = self.env.get_grid_input()
                    unit_pos = self.env.get_agents_approximate_pos()
                    actions_with_name, actions, log_probs = self.agents.choose_actions_in_grid(unit_pos=unit_pos, grid_input=grid_input)
                    observations, rewards, finish_game, infos = self.env.step(actions_with_name)
                    grid_input_next = self.env.get_grid_input()
                    self.batch_episode_memory.store_one_episode(grid_input, grid_input_next, unit_pos, actions, rewards, log_probs)
                    total_reward += rewards
                    cycle += 1
                self.batch_episode_memory.set_per_episode_len(cycle)
        elif isinstance(self.batch_episode_memory, CommBatchEpisodeMemory):
            # Standard per-agent observation collection.
            for i in range(run_episode):
                obs = self.env.reset()[0]
                finish_game = False
                cycle = 0
                while not finish_game and cycle < self.env_config.max_cycles:
                    state = self.env.state()
                    actions_with_name, actions, log_probs = self.agents.choose_actions(obs)
                    obs_next, rewards, finish_game, infos = self.env.step(actions_with_name)
                    state_next = self.env.state()
                    if "ppo" in self.env_config.learn_policy:
                        self.batch_episode_memory.store_one_episode(one_obs=obs, one_state=state, action=actions,
                                                                    reward=rewards, log_probs=log_probs)
                    else:
                        self.batch_episode_memory.store_one_episode(one_obs=obs, one_state=state, action=actions,
                                                                    reward=rewards, one_obs_next=obs_next,
                                                                    one_state_next=state_next)
                    total_reward += rewards
                    obs = obs_next
                    cycle += 1
                self.batch_episode_memory.set_per_episode_len(cycle)
        if "ppo" in self.env_config.learn_policy:
            # On-policy update: one policy could collect a whole batch of data; due to performance issues
            # we assume batch=1 for now and optimize later.
            batch_data = self.batch_episode_memory.get_batch_data()
            self.agents.learn(batch_data)
            self.batch_episode_memory.clear_memories()
        else:
            # Off-policy update: push the episode into the replay memory and sample minibatches once it is large enough.
            self.memory.store_episode(self.batch_episode_memory)
            self.batch_episode_memory.clear_memories()
            if self.memory.get_memory_real_size() >= 10:
                for i in range(self.train_config.learn_num):
                    batch = self.memory.sample(self.train_config.memory_batch)
                    self.agents.learn(batch, epoch)
        # avg_reward = self.evaluate()
        avg_reward = total_reward / run_episode
        one_result_buffer = [avg_reward]
        self.result_buffer.append(one_result_buffer)
        if epoch % self.train_config.save_epoch == 0 and epoch != 0:
            self.save_model_and_result(epoch)
        print("episode_{} over,avg_reward {}".format(epoch, avg_reward))

What does this code mean?