# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Remote Evolution Strategies (ES) actor.

Each actor owns its own gym environment and evaluates perturbed copies of
the policy weights sent by the learner. It reports the sampled noise
indices together with the episode rewards and lengths, plus the results of
occasional noise-free evaluation episodes.
"""

import gym
import parl
import time
import numpy as np

from es import ES
from obs_filter import MeanStdFilter
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
from noise import SharedNoiseTable


@parl.remote_class
class Actor(object):
    def __init__(self, config):
        self.config = config

        self.env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = self.env.observation_space.shape[0]
        self.config['act_dim'] = self.env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

    def _play_one_episode(self, add_noise=False):
        episode_reward = 0
        episode_step = 0

        obs = self.env.reset()
        while True:
            # Normalize the observation; with probability `filter_update_prob`
            # also update the filter's running mean/std statistics.
            if np.random.uniform() < self.config['filter_update_prob']:
                obs = self.obs_filter(obs[None], update=True)
            else:
                obs = self.obs_filter(obs[None], update=False)

            action = self.agent.predict(obs)
            if add_noise:
                action += np.random.randn(
                    *action.shape) * self.config['action_noise_std']

            obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            episode_step += 1
            if done:
                break
        return episode_reward, episode_step

    def sample(self, flat_weights):
        noise_indices, rewards, lengths = [], [], []
        eval_rewards, eval_lengths = [], []

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_indices) == 0 or
               time.time() - task_tstart < self.config['min_task_runtime']):
            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.agent.set_flat_weights(flat_weights)
                episode_reward, episode_step = self._play_one_episode(
                    add_noise=False)
                eval_rewards.append(episode_reward)
                eval_lengths.append(episode_step)
            else:
                # Do a regular run with parameter perturbations.
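                # The perturbation is drawn from a shared noise table, so only
                # its index (not the full perturbation vector) needs to be
                # reported back; the learner can look up the same noise from
                # its own copy of the table.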
                noise_index = self.noise.sample_index(
                    self.agent.weights_total_size)

                perturbation = self.config["noise_stdev"] * self.noise.get(
                    noise_index, self.agent.weights_total_size)

                # Mirrored sampling: evaluate the perturbation pair
                # (+epsilon, -epsilon) to reduce the variance of the ES
                # gradient estimate.
                self.agent.set_flat_weights(flat_weights + perturbation)
                episode_reward_pos, episode_step_pos = self._play_one_episode(
                    add_noise=True)

                self.agent.set_flat_weights(flat_weights - perturbation)
                episode_reward_neg, episode_step_neg = self._play_one_episode(
                    add_noise=True)

                noise_indices.append(noise_index)
                rewards.append([episode_reward_pos, episode_reward_neg])
                lengths.append([episode_step_pos, episode_step_neg])

        return {
            'noise_indices': noise_indices,
            'noisy_rewards': rewards,
            'noisy_lengths': lengths,
            'eval_rewards': eval_rewards,
            'eval_lengths': eval_lengths
        }

    def get_filter(self, flush_after=False):
        return_filter = self.obs_filter.as_serializable()
        if flush_after:
            self.obs_filter.clear_buffer()
        return return_filter

    def set_filter(self, new_filter):
        self.obs_filter.sync(new_filter)


if __name__ == '__main__':
    from es_config import config

    actor = Actor(config)
    actor.as_remote(config['server_ip'], config['server_port'])
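# Usage note: run this script on each worker machine; `as_remote` connects the
# actor to the learner at config['server_ip']:config['server_port'] (see
# es_config.py) so that sample(), get_filter() and set_filter() can be
# invoked remotely.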