diff --git a/README.md b/README.md index 35190dfbbd658e805962fa2defb009f6142d63b6..3d7b02797892c1713cb579869093331817e169e3 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ three steps to build an agent model = AtariModel(img_shape=(32, 32), action_dim=4) algorithm = DQN(model) -agent = AtariAgent(aglrotihm) +agent = AtariAgent(algorithm) ``` # Install: ### Dependencies - Python 2.7 or 3.5+. -- PaddlePaddle >=1.0 (We try to make our repository always compatible with newest version PaddlePaddle) +- PaddlePaddle >=1.2.1 (We try to make our repository always compatible with newest version PaddlePaddle) ``` diff --git a/examples/DDPG/replay_memory.py b/examples/DDPG/replay_memory.py deleted file mode 100644 index 1a50d9f7adbe40590d6db54dea2893c2ffeddff7..0000000000000000000000000000000000000000 --- a/examples/DDPG/replay_memory.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -class ReplayMemory(object): - def __init__(self, max_size, obs_dim, act_dim): - self.max_size = max_size - self.obs_memory = np.zeros((max_size, obs_dim), dtype='float32') - self.act_memory = np.zeros((max_size, act_dim), dtype='float32') - self.reward_memory = np.zeros((max_size, ), dtype='float32') - self.next_obs_memory = np.zeros((max_size, obs_dim), dtype='float32') - self.terminal_memory = np.zeros((max_size, ), dtype='bool') - self._curr_size = 0 - self._curr_pos = 0 - - def sample_batch(self, batch_size): - batch_idx = np.random.choice(self._curr_size, size=batch_size) - obs = self.obs_memory[batch_idx, :] - act = self.act_memory[batch_idx, :] - reward = self.reward_memory[batch_idx] - next_obs = self.next_obs_memory[batch_idx, :] - terminal = self.terminal_memory[batch_idx] - return obs, act, reward, next_obs, terminal - - def append(self, obs, act, reward, next_obs, terminal): - if self._curr_size < self.max_size: - self._curr_size += 1 - self.obs_memory[self._curr_pos] = obs - self.act_memory[self._curr_pos] = act - self.reward_memory[self._curr_pos] = reward - self.next_obs_memory[self._curr_pos] = next_obs - self.terminal_memory[self._curr_pos] = terminal - self._curr_pos = (self._curr_pos + 1) % self.max_size - - def size(self): - return self._curr_size diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py index ba1a826e505441a489d6827b7c7ed9a3bdb200dd..c8f7ce3f92b47c639942d3099cfec32535c6043f 100644 --- a/examples/DDPG/train.py +++ b/examples/DDPG/train.py @@ -19,12 +19,10 @@ import time from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from parl.algorithms import DDPG -from parl.utils import logger, action_mapping -from replay_memory import ReplayMemory +from parl.utils import logger, action_mapping, ReplayMemory MAX_EPISODES = 5000 -TEST_EVERY_EPISODES = 50 -MAX_STEPS_EACH_EPISODE = 1000 +TEST_EVERY_EPISODES = 20 ACTOR_LR = 1e-4 CRITIC_LR = 1e-3 GAMMA = 0.99 @@ -39,7 +37,7 @@ ENV_SEED = 1 def run_train_episode(env, agent, rpm): obs = env.reset() 
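The hunk above replaces the example-local buffer with `parl.utils.ReplayMemory`. As a rough sketch (not the patch's code), this is how a buffer exposing the same `append` / `sample_batch` / `size` interface as the deleted class is typically driven from the episode loop; the warm-up threshold and batch size are illustrative values, not taken from the patch.

```python
import numpy as np

BATCH_SIZE = 128        # illustrative values only, not from the patch
WARMUP_SIZE = 10000

def step_and_learn(env, agent, rpm, obs):
    """One environment step plus an optional learning step."""
    batch_obs = np.expand_dims(obs, axis=0)
    action = np.squeeze(agent.predict(batch_obs.astype('float32')))
    next_obs, reward, done, _ = env.step(action)
    # store the transition; same interface as the deleted
    # examples/DDPG/replay_memory.py (append / sample_batch / size)
    rpm.append(obs, action, reward, next_obs, done)
    if rpm.size() > WARMUP_SIZE:           # warm up before learning
        batch = rpm.sample_batch(BATCH_SIZE)
        agent.learn(*batch)                # (obs, act, reward, next_obs, terminal)
    return next_obs, reward, done
```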
total_reward = 0 - for j in range(MAX_STEPS_EACH_EPISODE): + while True: batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) action = np.squeeze(action) @@ -70,7 +68,7 @@ def run_train_episode(env, agent, rpm): def run_evaluate_episode(env, agent): obs = env.reset() total_reward = 0 - for j in range(MAX_STEPS_EACH_EPISODE): + while True: batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) action = np.squeeze(action) @@ -112,7 +110,8 @@ def main(): logger.info('Episode: {} Reward: {}'.format(i, train_reward)) if (i + 1) % TEST_EVERY_EPISODES == 0: evaluate_reward = run_evaluate_episode(env, agent) - logger.info('Evaluate Reward: {}'.format(evaluate_reward)) + logger.info('Episode {}, Evaluate reward: {}'.format( + i, evaluate_reward)) if __name__ == '__main__': diff --git a/examples/DQN/train.py b/examples/DQN/train.py index fb91dc48d96b3dd498977dbe2b473bb014049302..06cf8eb7ca76003dfb2fe9190551cc428956f6eb 100644 --- a/examples/DQN/train.py +++ b/examples/DQN/train.py @@ -64,8 +64,9 @@ def run_train_episode(env, agent, rpm): state = next_state if isOver: break - logger.info('[Train]total_reward: {}, mean_cost: {}'.format( - total_reward, np.mean(all_cost))) + if all_cost: + logger.info('[Train]total_reward: {}, mean_cost: {}'.format( + total_reward, np.mean(all_cost))) return total_reward, step @@ -122,7 +123,7 @@ def main(): if total_step // args.test_every_steps == test_flag: pbar.write("testing") eval_rewards = [] - for _ in tqdm(range(30), desc='eval agent'): + for _ in tqdm(range(3), desc='eval agent'): eval_reward = run_evaluate_episode(test_env, agent) eval_rewards.append(eval_reward) test_flag += 1 diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md index d7b1c9dcee0b6faad44d037dc6e7816b33f9f93b..7fcbcc5f341fb0e1bb50e8d120eb92cff504d3c0 100644 --- a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md @@ -1,36 +1,101 @@ -## The Winning Solution for the NeurIPS 2018: AI for Prosthetics Challenge +# The Winning Solution for the NeurIPS 2018: AI for Prosthetics Challenge -This folder will contains the code used to train the winning models for the [NeurIPS 2018: AI for Prosthetics Challenge](https://www.crowdai.org/challenges/neurips-2018-ai-for-prosthetics-challenge) along with the resulting models. (Codes of training part is organizing, but the resulting models is available now.) - -### Dependencies +This folder contains the code used to train the winning models for the [NeurIPS 2018: AI for Prosthetics Challenge](https://www.crowdai.org/challenges/neurips-2018-ai-for-prosthetics-challenge) along with the resulting models. 
+## Dependencies - python3.6 -- [paddlepaddle>=1.2.0](https://github.com/PaddlePaddle/Paddle) +- [paddlepaddle>=1.2.1](https://github.com/PaddlePaddle/Paddle) - [osim-rl](https://github.com/stanfordnmbl/osim-rl) +- [grpcio==1.12.1](https://grpc.io/docs/quickstart/python.html) +- tqdm +- tensorflow (To use tensorboard) + +## Result -### Start Testing best models +| Avg reward of all episodes | Avg reward of complete episodes | Falldown % | Evaluate episodes | +|----------------------------|---------------------------------|------------|-------------------| +| 9968.5404 | 9980.3952 | 0.0026 | 500 CPUs * 10 episodes | + +## Start test our submit models - How to Run + ```bash # cd current directory -# install best models file (saved_model.tar.gz) +# cd final_submit/ +# download submit models file (saved_model.tar.gz) tar zxvf saved_model.tar.gz python test.py ``` -> You can install models file from [Baidu Pan](https://pan.baidu.com/s/1NN1auY2eDblGzUiqR8Bfqw) or [Google Drive](https://drive.google.com/open?id=1DQHrwtXzgFbl9dE7jGOe9ZbY0G9-qfq3) +> You can download models file from [Baidu Pan](https://pan.baidu.com/s/1NN1auY2eDblGzUiqR8Bfqw) or [Google Drive](https://drive.google.com/open?id=1DQHrwtXzgFbl9dE7jGOe9ZbY0G9-qfq3) + + +## Start train + +### Stage I: Curriculum learning + +#### 1. Run Fastest + +```bash +# server +python simulator_server.py --port [PORT] --ensemble_num 1 + +# client (Suggest: 200+ clients) +python simulator_client.py --port [PORT] --ip [IP] --reward_type RunFastest +``` + +#### 2. target speed 3.0 m/s -- More arguments ```bash -# Run with GPU -python test.py --use_cuda +# server +python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \ + --restore_model_path [RunFastest model] + +# client (Suggest: 200+ clients) +python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 3.0 \ + --act_penalty_lowerbound 1.5 +``` -# Visulize the game -python test.py --vis +#### 3. target speed 2.0 m/s -# Set the random seed -python test.py --seed 1024 +```bash +# server +python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \ + --restore_model_path [FixedTargetSpeed 3.0m/s model] -# Set the episode number to run -python test.py --episode_num 2 +# client (Suggest: 200+ clients) +python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 2.0 \ + --act_penalty_lowerbound 0.75 ``` -### Start Training -- [ ] To be Done +#### 4. 
target speed 1.25 m/s + +```bash +# server +python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \ + --restore_model_path [FixedTargetSpeed 2.0m/s model] + +# client (Suggest: 200+ clients) +python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 1.25 \ + --act_penalty_lowerbound 0.6 +``` + +### Stage II: Round2 + +> You can download resulting 1.25m/s model in Stage I from [Baidu Pan](https://pan.baidu.com/s/1PVDgIe3NuLB-4qI5iSxtKA) or [Google Drive](https://drive.google.com/open?id=1jWzs3wvq7_ierIwGZXc-M92bv1X5eqs7) + +```bash +# server +python simulator_server.py --port [PORT] --ensemble_num 12 --warm_start_batchs 1000 \ + --restore_model_path [FixedTargetSpeed 1.25m/s] --restore_from_one_head + +# client (Suggest: 100+ clients) +python simulator_client.py --port [PORT] --ip [IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \ + --act_penalty_coeff 7.0 --vel_penalty_coeff 20.0 --discrete_data --stage 3 +``` + +> To get a higher score, you need train a seperate model for every stage (target_v change times), and fix trained model of previous stage. It's omitted here. + +### Test trained model + +```bash +python test.py --restore_model_path [MODEL_PATH] --ensemble_num [ENSEMBLE_NUM] +``` diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/args.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/args.py new file mode 100644 index 0000000000000000000000000000000000000000..17522919a8872b4d273ddf2d4875dce073e2aaf7 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/args.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + + +def get_server_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--port', required=True, type=int, help='server port') + parser.add_argument( + '--logdir', type=str, help='directory to save model/tensorboard data') + parser.add_argument( + '--restore_model_path', + type=str, + help='restore model path for warm start') + parser.add_argument( + '--restore_from_one_head', + action="store_true", + help= + 'If set, will restore model from one head model. If ensemble_num > 1, will assign parameters of model0 to other models.' + ) + parser.add_argument( + '--restore_rpm_path', type=str, help='restore rpm path for warm start') + parser.add_argument( + '--ensemble_num', + type=int, + required=True, + help='model number to ensemble') + parser.add_argument( + '--warm_start_batchs', + type=int, + default=100, + help='collect how many batch data to warm start') + args = parser.parse_args() + return args + + +def get_client_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--stage', + default=0, + type=int, + help=''' + stage number, which decides change times of target velocity. 
+ Eg: stage=0 will keep target_v 1.25m/s; + stage=3 will change target velocity 3 times, just like Round2 env.''' + ) + parser.add_argument('--ident', type=int, required=False, help='worker id') + parser.add_argument('--ip', type=str, required=True, help='server ip') + parser.add_argument('--port', type=int, required=True, help='server port') + parser.add_argument( + '--target_v', type=float, help='target velocity for training') + parser.add_argument( + '--act_penalty_lowerbound', + type=float, + help='lower bound of action l2 norm penalty') + parser.add_argument( + '--act_penalty_coeff', + type=float, + default=5.0, + help='coefficient of action l2 norm penalty') + parser.add_argument( + '--vel_penalty_coeff', + type=float, + default=1.0, + help='coefficient of velocity gap penalty') + parser.add_argument( + '--discrete_data', + action="store_true", + help= + 'if set, discrete target velocity in last stage (args.stage), make target velocity more uniform.' + ) + parser.add_argument( + '--discrete_bin', + type=int, + default=10, + help='discrete target velocity in last stage to how many intervals') + parser.add_argument( + '--reward_type', + type=str, + help= + "Choose reward type, 'RunFastest' or 'FixedTargetSpeed' or 'Round2'") + parser.add_argument( + '--debug', + action="store_true", + help='if set, will print debug information') + args = parser.parse_args() + + assert args.reward_type in ['RunFastest', 'FixedTargetSpeed', 'Round2'] + + return args diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/env_wrapper.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/env_wrapper.py index c08e97f6b89cad9abbb6608707d5dc19f470d7b7..88b4944cc27035b21317fcc707817ed5bac370e4 100644 --- a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/env_wrapper.py +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/env_wrapper.py @@ -13,41 +13,218 @@ # limitations under the License. import abc -import copy import gym import math import numpy as np +import random from collections import OrderedDict from osim.env import ProstheticsEnv from parl.utils import logger +from tqdm import tqdm MAXTIME_LIMIT = 1000 ProstheticsEnv.time_limit = MAXTIME_LIMIT FRAME_SKIP = None -FALL_PENALTY = 0 -class RemoteEnv(gym.Wrapper): - def __init__(self, env): - env.metadata = {} - env.action_space = None - env.observation_space = None - env.reward_range = None +class CustomR2Env(gym.Wrapper): + """Customized target trajectory here, it support 3 ways currently + 1.fixed_speed, e.g. reset(.., fixed_speed=1.25) + 2.stage , e.g. reset(.., stage=1) + 3.boundary, e.g. 
reset(.., boundary=True) + """ + + def __init__(self, + env, + time_limit=MAXTIME_LIMIT, + discrete_data=False, + discrete_bin=10): + logger.info("[CustomR2Env]type:{}, time_limit:{}".format( + type(env), time_limit)) + assert isinstance(env, ProstheticsEnv), type(env) gym.Wrapper.__init__(self, env) - self.remote_env = env - self.first_time = True - - def step(self, act): - return self.remote_env.env_step(act.tolist()) - - def reset(self): - if self.first_time: - self.first_time = False - return self.remote_env.env_create() - obs = self.remote_env.env_reset() - if not obs: - return None - return obs + + self.env.time_limit = time_limit + self.env.spec.timestep_limit = time_limit + self.time_limit = time_limit + + # boundary flag + self._generate_boundary_target_flag = True + + self.discrete_data = discrete_data + self.discrete_bin = discrete_bin + + def rect(self, row): + r = row[0] + theta = row[1] + x = r * math.cos(theta) + y = 0 + z = r * math.sin(theta) + return np.array([x, y, z]) + + def _generate_boundary_table(self, ): + possible_combine = [(math.pi / 8, 0.5), (math.pi / 8, -0.5), + (-math.pi / 8, 0.5), (-math.pi / 8, -0.5)] + self._boundary_table = [] + for a in possible_combine: + for b in possible_combine: + for c in possible_combine: + self._boundary_table.append([a, b, c]) + assert len(self._boundary_table) == 64 + + def generate_boundary_target(self, poisson_lambda=300): + if self._generate_boundary_target_flag == True: + self._generate_boundary_target_flag = False + self._generate_boundary_table() + self._boundary_index = 0 + nsteps = self.time_limit + 1 + velocity = np.zeros(nsteps) + heading = np.zeros(nsteps) + + velocity[0] = 1.25 + heading[0] = 0 + trajectory = self._boundary_table[self._boundary_index] + + change = np.cumsum(np.random.poisson(poisson_lambda, 10)) + target_change_times = 0 + for i in range(1, nsteps): + velocity[i] = velocity[i - 1] + heading[i] = heading[i - 1] + if i in change: + velocity[i] += trajectory[target_change_times][1] + heading[i] += trajectory[target_change_times][0] + # trajectory has length 3, the target_change_times should not be large than 2 + target_change_times = min(2, target_change_times + 1) + + self._boundary_index = (self._boundary_index + 1) % 64 + + def _generate_target_vel(self, stage, change_num): + target_vels = None + if stage == 0: + target_vels = [1.25] + elif stage == 1: + assert change_num >= 1 + interval = 1.0 / self.discrete_bin + discrete_id = np.random.randint(self.discrete_bin) + + min_vel = 0.75 + discrete_id * interval + max_vel = 0.75 + (discrete_id + 1) * interval + + target_vels = [1.25] + for i in range(change_num): + if i == 0: + target_vels.append(random.uniform(min_vel, max_vel)) + else: + target_vels.append(target_vels[-1] + + random.uniform(-0.5, 0.5)) + elif stage == 2: + assert change_num >= 2 + interval = 2.0 / self.discrete_bin + discrete_id = np.random.randint(self.discrete_bin) + min_vel = 0.25 + discrete_id * interval + max_vel = 0.25 + (discrete_id + 1) * interval + while True: + target_vels = [1.25] + for i in range(change_num): + target_vels.append(target_vels[-1] + + random.uniform(-0.5, 0.5)) + if target_vels[2] >= min_vel and target_vels[2] <= max_vel: + break + elif stage == 3: + assert change_num >= 3 + interval = 3.0 / self.discrete_bin + discrete_id = np.random.randint(self.discrete_bin) + min_vel = -0.25 + discrete_id * interval + max_vel = -0.25 + (discrete_id + 1) * interval + while True: + target_vels = [1.25] + for i in range(change_num): + target_vels.append(target_vels[-1] + + 
random.uniform(-0.5, 0.5)) + if target_vels[3] >= min_vel and target_vels[3] <= max_vel: + break + else: + raise NotImplemented + logger.info('[CustomR2Env] stage: {}, target_vels: {}'.format( + stage, target_vels)) + return target_vels + + def generate_stage_targets(self, poisson_lambda=300, stage=None): + nsteps = self.time_limit + 1 + velocity = np.zeros(nsteps) + heading = np.zeros(nsteps) + + velocity[0] = 1.25 + heading[0] = 0 + + change = np.cumsum(np.random.poisson(poisson_lambda, 10)) + if stage == 0: + change = [] + elif stage == 1: + change = change[:1] + elif stage == 2: + change = change[:2] + elif stage == 3: + if change[3] <= 1000: + change = change[:4] + else: + change = change[:3] + else: + raise NotImplemented + + if self.discrete_data: + target_vels = self._generate_target_vel( + stage=stage, change_num=len(change)) + + change_cnt = 0 + for i in range(1, nsteps): + velocity[i] = velocity[i - 1] + heading[i] = heading[i - 1] + + if i in change: + change_cnt += 1 + if self.discrete_data: + velocity[i] = target_vels[change_cnt] + else: + velocity[i] += random.choice([-1, 1]) * random.uniform( + -0.5, 0.5) + heading[i] += random.choice([-1, 1]) * random.uniform( + -math.pi / 8, math.pi / 8) + + trajectory_polar = np.vstack((velocity, heading)).transpose() + targets = np.apply_along_axis(self.rect, 1, trajectory_polar) + return targets + + def reset(self, **kwargs): + fixed_speed = None + if 'fixed_speed' in kwargs: + fixed_speed = kwargs.pop('fixed_speed', None) + stage = None + if 'stage' in kwargs: + stage = kwargs.pop('stage', None) + boundary = None + if 'boundary' in kwargs: + boundary = kwargs.pop('boundary', None) + _ = self.env.reset(**kwargs) + if fixed_speed is not None: + targets = np.zeros([self.time_limit + 1, 3], dtype=np.float32) + targets[:, 0] = fixed_speed + self.env.targets = targets + elif stage is not None: + self.env.targets = self.generate_stage_targets(stage=stage) + elif boundary is not None: + self.generate_boundary_target() + else: + # generate new target + self.env.generate_new_targets( + poisson_lambda=int(self.time_limit * (300 / 1000))) + if 'project' in kwargs: + if kwargs.get('project') == True: + return self.env.get_observation() + return self.env.get_state_desc() + + def step(self, action, **kwargs): + return self.env.step(action, **kwargs) def calc_vel_diff(state_desc): @@ -85,6 +262,7 @@ class ActionScale(gym.Wrapper): class FrameSkip(gym.Wrapper): def __init__(self, env, k): + logger.info("[FrameSkip]type:{}".format(type(env))) gym.Wrapper.__init__(self, env) self.frame_skip = k global FRAME_SKIP @@ -103,13 +281,13 @@ class FrameSkip(gym.Wrapper): if 'reward' in key: # to assure that we don't igonre other reward # if new reward was added, consider its logic here - assert (key == 'shaping_reward') or (key == 'r2_reward') + assert (key == 'shaping_reward') or ( + key == 'r2_reward') or (key == 'x_offset_reward') merge_info[key] = merge_info.get(key, 0.0) + info[key] else: merge_info[key] = info[key] if info['target_changed']: - #merge_info['shaping_reward'] += info['shaping_reward'] * (self.frame_skip - k - 1) logger.warn("[FrameSkip] early break since target was changed") break @@ -128,6 +306,8 @@ class RewardShaping(gym.Wrapper): def __init__(self, env): logger.info("[RewardShaping]type:{}".format(type(env))) + assert (isinstance(env, ProstheticsEnv) + or isinstance(env, CustomR2Env)), type(env) self.step_count = 0 self.pre_state_desc = None @@ -150,25 +330,7 @@ class RewardShaping(gym.Wrapper): self.step_count += 1 obs, r, done, info = 
self.env.step(action, **kwargs) info = self.reward_shaping(obs, r, done, action) - if info['target_vel'] > 2.75: - rate = math.sqrt((2.75**2) / (info['target_vel']**2)) - logger.warn('Changing targets, origin targets: {}'.format( - obs['target_vel'])) - obs['target_vel'][0] = obs['target_vel'][0] * rate - obs['target_vel'][2] = obs['target_vel'][2] * rate - logger.warn('Changing targets, new targets: {}'.format( - obs['target_vel'])) - info['target_vel'] = 2.75 - if info['target_vel'] < -0.25: - rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2)) - logger.warn('Changing targets, origin targets: {}'.format( - obs['target_vel'])) - obs['target_vel'][0] = obs['target_vel'][0] * rate - obs['target_vel'][2] = obs['target_vel'][2] * rate - logger.warn('Changing targets, new targets: {}'.format( - obs['target_vel'])) - info['target_vel'] = -0.25 - + #logger.info('Step {}: target_vel: {}'.format(self.step_count, obs['target_vel'])) delta = 0 if self.last_target_vel is not None: delta = np.absolute( @@ -188,9 +350,7 @@ class RewardShaping(gym.Wrapper): timeout = False if self.step_count >= MAXTIME_LIMIT: timeout = True - if done and not timeout: - # penalty for falling down - info['shaping_reward'] += FALL_PENALTY + info['timeout'] = timeout self.pre_state_desc = obs return obs, r, done, info @@ -205,31 +365,182 @@ class RewardShaping(gym.Wrapper): return obs -class ForwardReward(RewardShaping): - """ A reward shaping wraper""" +class TestReward(RewardShaping): + """ Reward shaping wrapper for test""" + + def __init__(self, env): + RewardShaping.__init__(self, env) + + def reward_shaping(self, state_desc, r2_reward, done, action): + return {'shaping_reward': 0} + + +class RunFastestReward(RewardShaping): + """ Reward shaping wrapper for fixed target speed""" def __init__(self, env): RewardShaping.__init__(self, env) def reward_shaping(self, state_desc, r2_reward, done, action): - target_vel = math.sqrt(state_desc["target_vel"][0]**2 + - state_desc["target_vel"][2]**2) - if state_desc["target_vel"][0] < 0: - target_vel = -target_vel + if self.pre_state_desc is None: + x_offset = 0 + else: + x_offset = state_desc["body_pos"]["pelvis"][ + 0] - self.pre_state_desc["body_pos"]["pelvis"][0] + + ret_r = 0 + if self.pre_state_desc is not None: + l_foot_reward = state_desc["body_pos"]["tibia_l"][ + 0] - self.pre_state_desc["body_pos"]["tibia_l"][0] + r_foot_reward = state_desc["body_pos"]["pros_tibia_r"][ + 0] - self.pre_state_desc["body_pos"]["pros_tibia_r"][0] + ret_r = max(l_foot_reward, r_foot_reward) + + # penalty + headx = state_desc['body_pos']['head'][0] + px = state_desc['body_pos']['pelvis'][0] + headz = state_desc['body_pos']['head'][2] + pz = state_desc['body_pos']['pelvis'][2] + kneer = state_desc['joint_pos']['knee_r'][-1] + kneel = state_desc['joint_pos']['knee_l'][-1] + lean_x = min(0.3, max(0, px - headx - 0.15)) * 0.05 + lean_z = min(0.3, max(0, pz - headz - 0.15)) * 0.05 + joint = sum([max(0, k - 0.1) for k in [kneer, kneel]]) * 0.03 + penalty = lean_x + lean_z + joint + + ret_r -= penalty * 0.15 + + cur_vel_x = state_desc['body_vel']['pelvis'][0] + cur_vel_z = state_desc['body_vel']['pelvis'][2] + scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2) + + info = { + 'shaping_reward': ret_r, + 'r2_reward': r2_reward, + 'x_offset_reward': x_offset, + 'scalar_vel': scalar_vel, + 'mean_action_l2_penalty': 0, + } + return info + + +class FixedTargetSpeedReward(RewardShaping): + """ Reward shaping wrapper for fixed target speed""" + + def __init__(self, env, target_v, act_penalty_lowerbound, 
+ act_penalty_coeff, vel_penalty_coeff): + RewardShaping.__init__(self, env) + + assert target_v is not None + assert act_penalty_lowerbound is not None + assert act_penalty_coeff is not None + assert vel_penalty_coeff is not None + + self.target_v = target_v + self.act_penalty_lowerbound = act_penalty_lowerbound + self.act_penalty_coeff = act_penalty_coeff + self.vel_penalty_coeff = vel_penalty_coeff + + def reward_shaping(self, state_desc, r2_reward, done, action): + if self.pre_state_desc is None: + x_offset = 0 + else: + x_offset = state_desc["body_pos"]["pelvis"][ + 0] - self.pre_state_desc["body_pos"]["pelvis"][0] + + # Reward for not falling + ret_r = 36 + + vel_penalty = ((state_desc["body_vel"]["pelvis"][0] - self.target_v)**2 + + (state_desc["body_vel"]["pelvis"][2] - 0)**2) + + origin_action_l2_penalty = np.linalg.norm(action) + action_l2_penalty = max(self.act_penalty_lowerbound, + origin_action_l2_penalty) + + ret_r = ret_r - vel_penalty * self.vel_penalty_coeff - action_l2_penalty * self.act_penalty_coeff + + cur_vel_x = state_desc['body_vel']['pelvis'][0] + cur_vel_z = state_desc['body_vel']['pelvis'][2] + scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2) + + info = { + 'shaping_reward': ret_r, + 'r2_reward': r2_reward, + 'x_offset_reward': x_offset, + 'scalar_vel': scalar_vel, + 'mean_action_l2_penalty': origin_action_l2_penalty, + } + return info + + +class Round2Reward(RewardShaping): + """ Reward shaping wrapper for fixed target speed""" + + def __init__(self, env, act_penalty_lowerbound, act_penalty_coeff, + vel_penalty_coeff): + RewardShaping.__init__(self, env) + + assert act_penalty_lowerbound is not None + assert act_penalty_coeff is not None + assert vel_penalty_coeff is not None + + self.act_penalty_lowerbound = act_penalty_lowerbound + self.act_penalty_coeff = act_penalty_coeff + self.vel_penalty_coeff = vel_penalty_coeff + + def reward_shaping(self, state_desc, r2_reward, done, action): + if self.pre_state_desc is None: + x_offset = 0 + else: + x_offset = state_desc["body_pos"]["pelvis"][ + 0] - self.pre_state_desc["body_pos"]["pelvis"][0] + + # Reward for not falling + ret_r = 10 + + # Small penalty for too much activation (cost of transport) + muscle_activations = [] + for muscle in sorted(state_desc["muscles"].keys()): + muscle_activations += [state_desc["muscles"][muscle]["activation"]] + muscle_penalty = np.sum(np.array(muscle_activations)**2) * 0.001 + + vel_penalty = ( + (state_desc["target_vel"][0] - state_desc["body_vel"]["pelvis"][0]) + **2 + (state_desc["target_vel"][2] - + state_desc["body_vel"]["pelvis"][2])**2) + + origin_action_l2_penalty = np.linalg.norm(action) + action_l2_penalty = max(self.act_penalty_lowerbound, + origin_action_l2_penalty) + + if self.step_count < 60 or ( + self.step_count - self.last_target_change_step < 60): + ret_r = ret_r - vel_penalty * self.vel_penalty_coeff + else: + ret_r = ret_r - vel_penalty * self.vel_penalty_coeff - action_l2_penalty * self.act_penalty_coeff + + ret_r -= muscle_penalty + + cur_vel_x = state_desc['body_vel']['pelvis'][0] + cur_vel_z = state_desc['body_vel']['pelvis'][2] + scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2) info = { - 'shaping_reward': r2_reward, - 'target_vel': target_vel, + 'shaping_reward': ret_r, 'r2_reward': r2_reward, + 'x_offset_reward': x_offset, + 'scalar_vel': scalar_vel, + 'mean_action_l2_penalty': origin_action_l2_penalty, } return info class ObsTranformerBase(gym.Wrapper): def __init__(self, env): + logger.info("[ObsTranformerBase]type:{}".format(type(env))) 
gym.Wrapper.__init__(self, env) self.step_fea = MAXTIME_LIMIT - self.raw_obs = None global FRAME_SKIP self.frame_skip = int(FRAME_SKIP) @@ -268,19 +579,13 @@ class ObsTranformerBase(gym.Wrapper): self.step_fea -= FRAME_SKIP - self.raw_obs = copy.deepcopy(obs) obs = self.get_observation(obs) - self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea return obs, r, done, info def reset(self, **kwargs): obs = self.env.reset(**kwargs) - if obs is None: - return None self.step_fea = MAXTIME_LIMIT - self.raw_obs = copy.deepcopy(obs) obs = self.get_observation(obs) - self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea return obs @@ -421,7 +726,6 @@ class PelvisBasedObs(ObsTranformerBase): res = np.append(res, feet_dis) remaining_time = (self.step_fea - (MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0 - #logger.info('remaining_time fea: {}'.format(remaining_time)) res = np.append(res, remaining_time) # target driven @@ -450,9 +754,10 @@ if __name__ == '__main__': env = ProstheticsEnv(visualize=False) env.change_model(model='3D', difficulty=1, prosthetic=True) - env = ForwardReward(env) + env = CustomR2Env(env) + env = RunFastestReward(env) env = FrameSkip(env, 4) env = ActionScale(env) env = PelvisBasedObs(env) for i in range(64): - observation = env.reset(project=False) + observation = env.reset(project=False, stage=0) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/env_wrapper.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/env_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c08e97f6b89cad9abbb6608707d5dc19f470d7b7 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/env_wrapper.py @@ -0,0 +1,458 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
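Condensing the `Round2Reward.reward_shaping` arithmetic above into plain NumPy makes the shaping term easier to follow: an alive bonus of 10, a velocity-gap penalty, an action L2 penalty clamped from below by `act_penalty_lowerbound` (skipped for 60 steps after every target change), and a small muscle-activation cost. The sketch below is a paraphrase under those assumptions, not the wrapper itself.

```python
import numpy as np

def round2_shaping_reward(vel, target_vel, action, activations,
                          act_lowerbound, act_coeff, vel_coeff,
                          steps_since_target_change):
    """Paraphrase of Round2Reward.reward_shaping, not the wrapper code."""
    alive_bonus = 10.0
    # squared gap between pelvis velocity and target velocity (x and z axes)
    vel_penalty = (target_vel[0] - vel[0]) ** 2 + (target_vel[2] - vel[2]) ** 2
    # action L2 norm, clamped from below so tiny activations are not rewarded
    act_penalty = max(act_lowerbound, np.linalg.norm(action))
    # small cost-of-transport term on muscle activations
    muscle_penalty = np.sum(np.square(activations)) * 0.001

    r = alive_bonus - vel_penalty * vel_coeff
    # the wrapper also skips the action penalty during the first 60 steps
    # of an episode; here only the post-target-change grace period is shown
    if steps_since_target_change >= 60:
        r -= act_penalty * act_coeff
    return r - muscle_penalty
```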
+ +import abc +import copy +import gym +import math +import numpy as np +from collections import OrderedDict +from osim.env import ProstheticsEnv +from parl.utils import logger + +MAXTIME_LIMIT = 1000 +ProstheticsEnv.time_limit = MAXTIME_LIMIT +FRAME_SKIP = None +FALL_PENALTY = 0 + + +class RemoteEnv(gym.Wrapper): + def __init__(self, env): + env.metadata = {} + env.action_space = None + env.observation_space = None + env.reward_range = None + gym.Wrapper.__init__(self, env) + self.remote_env = env + self.first_time = True + + def step(self, act): + return self.remote_env.env_step(act.tolist()) + + def reset(self): + if self.first_time: + self.first_time = False + return self.remote_env.env_create() + obs = self.remote_env.env_reset() + if not obs: + return None + return obs + + +def calc_vel_diff(state_desc): + cur_vel_x = state_desc['body_vel']['pelvis'][0] + cur_vel_z = state_desc['body_vel']['pelvis'][2] + target_vel_x = state_desc['target_vel'][0] + target_vel_z = state_desc['target_vel'][2] + diff_vel_x = cur_vel_x - target_vel_x + diff_vel_z = cur_vel_z - target_vel_z + + cur_vel = (cur_vel_x**2 + cur_vel_z**2)**0.5 + target_vel = (target_vel_x**2 + target_vel_z**2)**0.5 + diff_vel = cur_vel - target_vel + + target_theta = math.atan(-1.0 * target_vel_z / target_vel_x) + # alone y axis + cur_theta = state_desc['body_pos_rot']['pelvis'][1] + diff_theta = cur_theta - target_theta + + return cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel, diff_theta + + +class ActionScale(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + + def step(self, action, **kwargs): + action = (np.copy(action) + 1.0) * 0.5 + action = np.clip(action, 0.0, 1.0) + return self.env.step(action, **kwargs) + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class FrameSkip(gym.Wrapper): + def __init__(self, env, k): + gym.Wrapper.__init__(self, env) + self.frame_skip = k + global FRAME_SKIP + FRAME_SKIP = k + self.frame_count = 0 + + def step(self, action, **kwargs): + r = 0.0 + merge_info = {} + for k in range(self.frame_skip): + self.frame_count += 1 + obs, reward, done, info = self.env.step(action, **kwargs) + r += reward + + for key in info.keys(): + if 'reward' in key: + # to assure that we don't igonre other reward + # if new reward was added, consider its logic here + assert (key == 'shaping_reward') or (key == 'r2_reward') + merge_info[key] = merge_info.get(key, 0.0) + info[key] + else: + merge_info[key] = info[key] + + if info['target_changed']: + #merge_info['shaping_reward'] += info['shaping_reward'] * (self.frame_skip - k - 1) + logger.warn("[FrameSkip] early break since target was changed") + break + + if done: + break + merge_info['frame_count'] = self.frame_count + return obs, r, done, merge_info + + def reset(self, **kwargs): + self.frame_count = 0 + return self.env.reset(**kwargs) + + +class RewardShaping(gym.Wrapper): + """ A wrapper for reward shaping, note this wrapper must be the first wrapper """ + + def __init__(self, env): + logger.info("[RewardShaping]type:{}".format(type(env))) + + self.step_count = 0 + self.pre_state_desc = None + self.last_target_vel = None + self.last_target_change_step = 0 + self.target_change_times = 0 + gym.Wrapper.__init__(self, env) + + @abc.abstractmethod + def reward_shaping(self, state_desc, reward, done, action): + """define your own reward computation function + Args: + state_desc(dict): state description for current model + reward(scalar): generic reward generated by env + done(bool): generic done flag generated 
by env + """ + pass + + def step(self, action, **kwargs): + self.step_count += 1 + obs, r, done, info = self.env.step(action, **kwargs) + info = self.reward_shaping(obs, r, done, action) + if info['target_vel'] > 2.75: + rate = math.sqrt((2.75**2) / (info['target_vel']**2)) + logger.warn('Changing targets, origin targets: {}'.format( + obs['target_vel'])) + obs['target_vel'][0] = obs['target_vel'][0] * rate + obs['target_vel'][2] = obs['target_vel'][2] * rate + logger.warn('Changing targets, new targets: {}'.format( + obs['target_vel'])) + info['target_vel'] = 2.75 + if info['target_vel'] < -0.25: + rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2)) + logger.warn('Changing targets, origin targets: {}'.format( + obs['target_vel'])) + obs['target_vel'][0] = obs['target_vel'][0] * rate + obs['target_vel'][2] = obs['target_vel'][2] * rate + logger.warn('Changing targets, new targets: {}'.format( + obs['target_vel'])) + info['target_vel'] = -0.25 + + delta = 0 + if self.last_target_vel is not None: + delta = np.absolute( + np.array(self.last_target_vel) - np.array(obs['target_vel'])) + if (self.last_target_vel is None) or np.all(delta < 1e-5): + info['target_changed'] = False + else: + info['target_changed'] = True + logger.info("[env_wrapper] target_changed, vx:{} vz:{}".format( + obs['target_vel'][0], obs['target_vel'][2])) + self.last_target_change_step = self.step_count + self.target_change_times += 1 + info['target_change_times'] = self.target_change_times + self.last_target_vel = obs['target_vel'] + + assert 'shaping_reward' in info + timeout = False + if self.step_count >= MAXTIME_LIMIT: + timeout = True + if done and not timeout: + # penalty for falling down + info['shaping_reward'] += FALL_PENALTY + info['timeout'] = timeout + self.pre_state_desc = obs + return obs, r, done, info + + def reset(self, **kwargs): + self.step_count = 0 + self.last_target_vel = None + self.last_target_change_step = 0 + self.target_change_times = 0 + obs = self.env.reset(**kwargs) + self.pre_state_desc = obs + return obs + + +class ForwardReward(RewardShaping): + """ A reward shaping wraper""" + + def __init__(self, env): + RewardShaping.__init__(self, env) + + def reward_shaping(self, state_desc, r2_reward, done, action): + target_vel = math.sqrt(state_desc["target_vel"][0]**2 + + state_desc["target_vel"][2]**2) + if state_desc["target_vel"][0] < 0: + target_vel = -target_vel + + info = { + 'shaping_reward': r2_reward, + 'target_vel': target_vel, + 'r2_reward': r2_reward, + } + return info + + +class ObsTranformerBase(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.step_fea = MAXTIME_LIMIT + self.raw_obs = None + global FRAME_SKIP + self.frame_skip = int(FRAME_SKIP) + + def get_observation(self, state_desc): + obs = self._get_observation(state_desc) + if not isinstance(self, PelvisBasedObs): + cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel, diff_theta = calc_vel_diff( + state_desc) + obs = np.append(obs, [ + cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel, + diff_theta + ]) + else: + pass + return obs + + @abc.abstractmethod + def _get_observation(self, state_desc): + pass + + def feature_normalize(self, obs, mean, std, duplicate_id): + scaler_len = mean.shape[0] + assert obs.shape[0] >= scaler_len + obs[:scaler_len] = (obs[:scaler_len] - mean) / std + final_obs = [] + for i in range(obs.shape[0]): + if i not in duplicate_id: + final_obs.append(obs[i]) + return np.array(final_obs) + + def step(self, action, **kwargs): + obs, r, done, info = 
self.env.step(action, **kwargs) + if info['target_changed']: + # reset step_fea when change target + self.step_fea = MAXTIME_LIMIT + + self.step_fea -= FRAME_SKIP + + self.raw_obs = copy.deepcopy(obs) + obs = self.get_observation(obs) + self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea + return obs, r, done, info + + def reset(self, **kwargs): + obs = self.env.reset(**kwargs) + if obs is None: + return None + self.step_fea = MAXTIME_LIMIT + self.raw_obs = copy.deepcopy(obs) + obs = self.get_observation(obs) + self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea + return obs + + +class PelvisBasedObs(ObsTranformerBase): + def __init__(self, env): + ObsTranformerBase.__init__(self, env) + data = np.load('./pelvisBasedObs_scaler.npz') + self.mean, self.std, self.duplicate_id = data['mean'], data[ + 'std'], data['duplicate_id'] + self.duplicate_id = self.duplicate_id.astype(np.int32).tolist() + + def get_core_matrix(self, yaw): + core_matrix = np.zeros(shape=(3, 3)) + core_matrix[0][0] = math.cos(yaw) + core_matrix[0][2] = -1.0 * math.sin(yaw) + core_matrix[1][1] = 1 + core_matrix[2][0] = math.sin(yaw) + core_matrix[2][2] = math.cos(yaw) + return core_matrix + + def _get_observation(self, state_desc): + o = OrderedDict() + for body_part in [ + 'pelvis', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l', + 'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head' + ]: + # position + o[body_part + '_x'] = state_desc['body_pos'][body_part][0] + o[body_part + '_y'] = state_desc['body_pos'][body_part][1] + o[body_part + '_z'] = state_desc['body_pos'][body_part][2] + # velocity + o[body_part + '_v_x'] = state_desc["body_vel"][body_part][0] + o[body_part + '_v_y'] = state_desc["body_vel"][body_part][1] + o[body_part + '_v_z'] = state_desc["body_vel"][body_part][2] + + o[body_part + '_x_r'] = state_desc["body_pos_rot"][body_part][0] + o[body_part + '_y_r'] = state_desc["body_pos_rot"][body_part][1] + o[body_part + '_z_r'] = state_desc["body_pos_rot"][body_part][2] + + o[body_part + '_v_x_r'] = state_desc["body_vel_rot"][body_part][0] + o[body_part + '_v_y_r'] = state_desc["body_vel_rot"][body_part][1] + o[body_part + '_v_z_r'] = state_desc["body_vel_rot"][body_part][2] + + for joint in [ + 'hip_r', 'knee_r', 'ankle_r', 'hip_l', 'knee_l', 'ankle_l', + 'back' + ]: + if 'hip' not in joint: + o[joint + '_joint_pos'] = state_desc['joint_pos'][joint][0] + o[joint + '_joint_vel'] = state_desc['joint_vel'][joint][0] + else: + for i in range(3): + o[joint + '_joint_pos_' + + str(i)] = state_desc['joint_pos'][joint][i] + o[joint + '_joint_vel_' + + str(i)] = state_desc['joint_vel'][joint][i] + + # In NIPS2017, only use activation + for muscle in sorted(state_desc["muscles"].keys()): + activation = state_desc["muscles"][muscle]["activation"] + if isinstance(activation, float): + activation = [activation] + for i, val in enumerate(activation): + o[muscle + '_activation_' + str(i)] = activation[i] + + fiber_length = state_desc["muscles"][muscle]["fiber_length"] + if isinstance(fiber_length, float): + fiber_length = [fiber_length] + for i, val in enumerate(fiber_length): + o[muscle + '_fiber_length_' + str(i)] = fiber_length[i] + + fiber_velocity = state_desc["muscles"][muscle]["fiber_velocity"] + if isinstance(fiber_velocity, float): + fiber_velocity = [fiber_velocity] + for i, val in enumerate(fiber_velocity): + o[muscle + '_fiber_velocity_' + str(i)] = fiber_velocity[i] + + # z axis of mass have some problem now, delete it later + o['mass_x'] = state_desc["misc"]["mass_center_pos"][0] + 
o['mass_y'] = state_desc["misc"]["mass_center_pos"][1] + o['mass_z'] = state_desc["misc"]["mass_center_pos"][2] + + o['mass_v_x'] = state_desc["misc"]["mass_center_vel"][0] + o['mass_v_y'] = state_desc["misc"]["mass_center_vel"][1] + o['mass_v_z'] = state_desc["misc"]["mass_center_vel"][2] + for key in ['talus_l_y', 'toes_l_y']: + o['touch_indicator_' + key] = np.clip(0.05 - o[key] * 10 + 0.5, 0., + 1.) + o['touch_indicator_2_' + key] = np.clip(0.1 - o[key] * 10 + 0.5, + 0., 1.) + + # Tranformer + core_matrix = self.get_core_matrix(o['pelvis_y_r']) + pelvis_pos = np.array([o['pelvis_x'], o['pelvis_y'], + o['pelvis_z']]).reshape((3, 1)) + pelvis_vel = np.array( + [o['pelvis_v_x'], o['pelvis_v_y'], o['pelvis_v_z']]).reshape((3, + 1)) + for body_part in [ + 'mass', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l', + 'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head' + ]: + # rotation + if body_part != 'mass': + o[body_part + '_y_r'] -= o['pelvis_y_r'] + o[body_part + '_v_y_r'] -= o['pelvis_v_y_r'] + # position/velocity + global_pos = [] + global_vel = [] + for each in ['_x', '_y', '_z']: + global_pos.append(o[body_part + each]) + global_vel.append(o[body_part + '_v' + each]) + global_pos = np.array(global_pos).reshape((3, 1)) + global_vel = np.array(global_vel).reshape((3, 1)) + pelvis_rel_pos = core_matrix.dot(global_pos - pelvis_pos) + w = o['pelvis_v_y_r'] + offset = np.array( + [-w * pelvis_rel_pos[2], 0, w * pelvis_rel_pos[0]]) + pelvis_rel_vel = core_matrix.dot(global_vel - pelvis_vel) + offset + for i, each in enumerate(['_x', '_y', '_z']): + o[body_part + each] = pelvis_rel_pos[i][0] + o[body_part + '_v' + each] = pelvis_rel_vel[i][0] + + for key in ['pelvis_x', 'pelvis_z', 'pelvis_y_r']: + del o[key] + + current_v = np.array(state_desc['body_vel']['pelvis']).reshape((3, 1)) + pelvis_current_v = core_matrix.dot(current_v) + o['pelvis_v_x'] = pelvis_current_v[0] + o['pelvis_v_z'] = pelvis_current_v[2] + + res = np.array(list(o.values())) + res = self.feature_normalize( + res, mean=self.mean, std=self.std, duplicate_id=self.duplicate_id) + + feet_dis = ((o['tibia_l_x'] - o['pros_tibia_r_x'])**2 + + (o['tibia_l_z'] - o['pros_tibia_r_z'])**2)**0.5 + res = np.append(res, feet_dis) + remaining_time = (self.step_fea - + (MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0 + #logger.info('remaining_time fea: {}'.format(remaining_time)) + res = np.append(res, remaining_time) + + # target driven + target_v = np.array(state_desc['target_vel']).reshape((3, 1)) + pelvis_target_v = core_matrix.dot(target_v) + diff_vel_x = pelvis_target_v[0] - pelvis_current_v[0] + diff_vel_z = pelvis_target_v[2] - pelvis_current_v[2] + diff_vel = np.sqrt(pelvis_target_v[0] ** 2 + pelvis_target_v[2] ** 2) - \ + np.sqrt(pelvis_current_v[0] ** 2 + pelvis_current_v[2] ** 2) + + target_vel_x = target_v[0] + target_vel_z = target_v[2] + target_theta = math.atan(-1.0 * target_vel_z / target_vel_x) + current_theta = state_desc['body_pos_rot']['pelvis'][1] + diff_theta = target_theta - current_theta + res = np.append(res, [ + diff_vel_x[0] / 3.0, diff_vel_z[0] / 3.0, diff_vel[0] / 3.0, + diff_theta / (np.pi * 3 / 8) + ]) + + return res + + +if __name__ == '__main__': + from osim.env import ProstheticsEnv + + env = ProstheticsEnv(visualize=False) + env.change_model(model='3D', difficulty=1, prosthetic=True) + env = ForwardReward(env) + env = FrameSkip(env, 4) + env = ActionScale(env) + env = PelvisBasedObs(env) + for i in range(64): + observation = env.reset(project=False) diff --git 
a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/mlp_model.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/mlp_model.py similarity index 100% rename from examples/NeurIPS2018-AI-for-Prosthetics-Challenge/mlp_model.py rename to examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/mlp_model.py diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/pelvisBasedObs_scaler.npz b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/pelvisBasedObs_scaler.npz new file mode 100644 index 0000000000000000000000000000000000000000..ba6c91436b96a3d5f806016ff1e8c9d82a4981d4 Binary files /dev/null and b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/pelvisBasedObs_scaler.npz differ diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/submit_model.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/submit_model.py similarity index 100% rename from examples/NeurIPS2018-AI-for-Prosthetics-Challenge/submit_model.py rename to examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/submit_model.py diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/test.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/test.py new file mode 100644 index 0000000000000000000000000000000000000000..06613be69fd18ad38b49a74ad89d7de02973cfbe --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/final_submit/test.py @@ -0,0 +1,88 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
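The heart of `PelvisBasedObs` above is a change of reference frame: each body part's world-frame position and velocity is re-expressed relative to the pelvis, rotated by the pelvis yaw, with an omega-cross-r style correction from the pelvis yaw rate added to the velocity. A compact paraphrase of that transform:

```python
import math
import numpy as np

def pelvis_frame(pos, vel, pelvis_pos, pelvis_vel, yaw, yaw_rate):
    """Re-express a body part's position/velocity in the pelvis frame.

    pos, vel, pelvis_pos, pelvis_vel: length-3 arrays in world coordinates.
    yaw: pelvis rotation about the y axis; yaw_rate: its angular velocity.
    """
    # rotation about the y axis by the pelvis yaw (same as get_core_matrix)
    c, s = math.cos(yaw), math.sin(yaw)
    core = np.array([[c, 0.0, -s],
                     [0.0, 1.0, 0.0],
                     [s, 0.0, c]])
    rel_pos = core.dot(np.asarray(pos) - np.asarray(pelvis_pos))
    # velocity seen from the rotating pelvis frame: subtract the pelvis
    # velocity, rotate, then add the yaw-rate cross-product offset used above
    offset = np.array([-yaw_rate * rel_pos[2], 0.0, yaw_rate * rel_pos[0]])
    rel_vel = core.dot(np.asarray(vel) - np.asarray(pelvis_vel)) + offset
    return rel_pos, rel_vel
```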
+ +import argparse +import numpy as np +import time +from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, ForwardReward +from osim.env import ProstheticsEnv +from parl.utils import logger +from submit_model import SubmitModel + + +def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0): + np.random.seed(seed) + env = ProstheticsEnv(visualize=vis) + env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed) + env = ForwardReward(env) + env = FrameSkip(env, 4) + env = ActionScale(env) + env = PelvisBasedObs(env) + all_reward = [] + all_shaping_reward = 0 + last_frames_count = 0 + + for e in range(episode_num): + t = time.time() + episode_reward = 0.0 + episode_shaping_reward = 0.0 + observation = env.reset(project=False) + target_change_times = 0 + step = 0 + loss = [] + while True: + step += 1 + action = submit_model.pred_batch(observation, target_change_times) + observation, reward, done, info = env.step(action, project=False) + step_frames = info['frame_count'] - last_frames_count + last_frames_count = info['frame_count'] + episode_reward += reward + # we pacle it here to drop the first step after changing + if target_change_times >= 1: + loss.append(10 * step_frames - reward) + if info['target_changed']: + target_change_times = min(target_change_times + 1, 3) + logger.info("[step/{}]reward:{} info:{}".format( + step, reward, info)) + episode_shaping_reward += info['shaping_reward'] + if done: + break + all_reward.append(episode_reward) + all_shaping_reward += episode_shaping_reward + t = time.time() - t + logger.info( + "[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}" + .format(e, t, episode_reward, np.sum(loss[:15]), np.sum(loss[15:]), + np.mean(all_reward))) + logger.info("Mean reward:{}".format(np.mean(all_reward))) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--use_cuda', action="store_true", help='If set, will run in gpu 0') + parser.add_argument( + '--vis', action="store_true", help='If set, will visualize.') + parser.add_argument('--seed', type=int, default=0, help='Random seed.') + parser.add_argument( + '--episode_num', type=int, default=1, help='Episode number to run.') + args = parser.parse_args() + + submit_model = SubmitModel(use_cuda=args.use_cuda) + + play_multi_episode( + submit_model, + episode_num=args.episode_num, + vis=args.vis, + seed=args.seed) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/multi_head_ddpg.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/multi_head_ddpg.py new file mode 100644 index 0000000000000000000000000000000000000000..79ad56be2bc87aada7fad504efe1b039ae36a870 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/multi_head_ddpg.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
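`MultiHeadDDPG.define_ensemble_predict`, defined in the file that follows, lets every critic score every actor's proposed action, normalizes each critic's scores by their sum so that critics with different Q-value scales contribute equally, averages across critics, and picks the best-scoring action. The same selection rule in plain NumPy, for illustration only (the real implementation builds a fluid program):

```python
import numpy as np

def ensemble_select(actions, score_matrix):
    """Pick one action out of the ensemble's proposals.

    actions:      (ensemble_num, act_dim), one proposal per actor head.
    score_matrix: (ensemble_num, ensemble_num), score_matrix[i, j] is
                  critic j's Q-value for actor i's action.
    """
    # normalize each critic's column by its sum over actions
    norm_scores = score_matrix / score_matrix.sum(axis=0, keepdims=True)
    # average the normalized scores each action received from all critics
    mean_score = norm_scores.mean(axis=1)
    return actions[int(np.argmax(mean_score))]
```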
+ +import parl.layers as layers +from copy import deepcopy +from paddle import fluid +from parl.framework.algorithm_base import Algorithm + +__all__ = ['MultiHeadDDPG'] + + +class MultiHeadDDPG(Algorithm): + def __init__(self, models, hyperparas): + """ model: should implement the function get_actor_params() + """ + self.models = models + self.target_models = [] + for model in models: + target_model = deepcopy(model) + self.target_models.append(target_model) + + # fetch hyper parameters + self.gamma = hyperparas['gamma'] + self.tau = hyperparas['tau'] + self.ensemble_num = hyperparas['ensemble_num'] + + def define_predict(self, obs, model_id): + """ use actor model of self.models[model_id] to predict the action + """ + return self.models[model_id].policy(obs) + + def define_ensemble_predict(self, obs): + """ ensemble predict: + 1. For actions of all actors, each critic will score them + and normalize its scores; + 2. For each actor, will calculate its score by + average scores given by all critics + 3. choose action of the actor whose score is best + """ + actor_outputs = [] + for i in range(self.ensemble_num): + actor_outputs.append(self.models[i].policy(obs)) + batch_actions = layers.concat(actor_outputs, axis=0) + batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1]) + + critic_outputs = [] + for i in range(self.ensemble_num): + critic_output = self.models[i].value(batch_obs, batch_actions) + critic_output = layers.unsqueeze(critic_output, axes=[1]) + critic_outputs.append(critic_output) + score_matrix = layers.concat(critic_outputs, axis=1) + + # Normalize scores given by each critic + sum_critic_score = layers.reduce_sum( + score_matrix, dim=0, keep_dim=True) + sum_critic_score = layers.expand( + sum_critic_score, expand_times=[self.ensemble_num, 1]) + norm_score_matrix = score_matrix / sum_critic_score + + actions_mean_score = layers.reduce_mean( + norm_score_matrix, dim=1, keep_dim=True) + best_score_id = layers.argmax(actions_mean_score, axis=0) + best_score_id = layers.cast(best_score_id, dtype='int32') + ensemble_predict_action = layers.gather(batch_actions, best_score_id) + return ensemble_predict_action + + def define_learn(self, obs, action, reward, next_obs, terminal, actor_lr, + critic_lr, model_id): + """ update actor and critic model of self.models[model_id] with DDPG algorithm + """ + actor_cost = self._actor_learn(obs, actor_lr, model_id) + critic_cost = self._critic_learn(obs, action, reward, next_obs, + terminal, critic_lr, model_id) + return actor_cost, critic_cost + + def _actor_learn(self, obs, actor_lr, model_id): + action = self.models[model_id].policy(obs) + Q = self.models[model_id].value(obs, action) + cost = layers.reduce_mean(-1.0 * Q) + optimizer = fluid.optimizer.AdamOptimizer(actor_lr) + optimizer.minimize( + cost, parameter_list=self.models[model_id].get_actor_params()) + return cost + + def _critic_learn(self, obs, action, reward, next_obs, terminal, critic_lr, + model_id): + next_action = self.target_models[model_id].policy(next_obs) + next_Q = self.target_models[model_id].value(next_obs, next_action) + + terminal = layers.cast(terminal, dtype='float32') + target_Q = reward + (1.0 - terminal) * self.gamma * next_Q + target_Q.stop_gradient = True + + Q = self.models[model_id].value(obs, action) + cost = layers.square_error_cost(Q, target_Q) + cost = layers.reduce_mean(cost) + optimizer = fluid.optimizer.AdamOptimizer(critic_lr) + optimizer.minimize(cost) + return cost + + def sync_target(self, + gpu_id, + model_id, + decay=None, + 
share_vars_parallel_executor=None): + if decay is None: + decay = 1.0 - self.tau + self.models[model_id].sync_params_to( + self.target_models[model_id], + gpu_id=gpu_id, + decay=decay, + share_vars_parallel_executor=share_vars_parallel_executor) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_agent.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..175e83db5d97d34d1dc5c5504d7731f521a36320 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_agent.py @@ -0,0 +1,218 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl.layers as layers +import re +from paddle import fluid +from paddle.fluid.executor import _fetch_var +from parl.framework.agent_base import Agent +from parl.utils import logger + + +class OpenSimAgent(Agent): + def __init__(self, algorithm, obs_dim, act_dim, ensemble_num): + self.obs_dim = obs_dim + self.act_dim = act_dim + self.ensemble_num = ensemble_num + super(OpenSimAgent, self).__init__(algorithm) + + # Use ParallelExecutor to make program running faster + use_cuda = True if self.gpu_id >= 0 else False + self.learn_pe = [] + self.pred_pe = [] + + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = 4 + build_strategy = fluid.BuildStrategy() + build_strategy.remove_unnecessary_lock = True + + for i in range(self.ensemble_num): + with fluid.scope_guard(fluid.global_scope().new_scope()): + pe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=self.learn_programs[i], + exec_strategy=exec_strategy, + build_strategy=build_strategy) + self.learn_pe.append(pe) + + with fluid.scope_guard(fluid.global_scope().new_scope()): + pe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=self.predict_programs[i], + exec_strategy=exec_strategy, + build_strategy=build_strategy) + self.pred_pe.append(pe) + + # Attention: In the beginning, sync target model totally. 
+ self.alg.sync_target( + gpu_id=self.gpu_id, + model_id=i, + decay=1.0, + share_vars_parallel_executor=self.learn_pe[i]) + # Do cache, will create ParallelExecutor of sync params in advance + # If not, there are some issues when ensemble_num > 1 + self.alg.sync_target( + gpu_id=self.gpu_id, + model_id=i, + share_vars_parallel_executor=self.learn_pe[i]) + + with fluid.scope_guard(fluid.global_scope().new_scope()): + self.ensemble_predict_pe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=self.ensemble_predict_program, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + def build_program(self): + self.predict_programs = [] + self.predict_outputs = [] + self.learn_programs = [] + self.learn_programs_output = [] + for i in range(self.ensemble_num): + predict_program = fluid.Program() + with fluid.program_guard(predict_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = self.alg.define_predict(obs, model_id=i) + self.predict_programs.append(predict_program) + self.predict_outputs.append([act.name]) + + learn_program = fluid.Program() + with fluid.program_guard(learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = layers.data( + name='act', shape=[self.act_dim], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + actor_lr = layers.data( + name='actor_lr', + shape=[1], + dtype='float32', + append_batch_size=False) + critic_lr = layers.data( + name='critic_lr', + shape=[1], + dtype='float32', + append_batch_size=False) + actor_loss, critic_loss = self.alg.define_learn( + obs, + act, + reward, + next_obs, + terminal, + actor_lr, + critic_lr, + model_id=i) + self.learn_programs.append(learn_program) + self.learn_programs_output.append([critic_loss.name]) + + self.ensemble_predict_program = fluid.Program() + with fluid.program_guard(self.ensemble_predict_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = self.alg.define_ensemble_predict(obs) + self.ensemble_predict_output = [act.name] + + def predict(self, obs, model_id): + feed = {'obs': obs} + feed = [feed] + act = self.pred_pe[model_id].run( + feed=feed, fetch_list=self.predict_outputs[model_id])[0] + return act + + def ensemble_predict(self, obs): + feed = {'obs': obs} + feed = [feed] + act = self.ensemble_predict_pe.run( + feed=feed, fetch_list=self.ensemble_predict_output)[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal, actor_lr, critic_lr, + model_id): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal, + 'actor_lr': actor_lr, + 'critic_lr': critic_lr + } + + feed = [feed] + critic_loss = self.learn_pe[model_id].run( + feed=feed, fetch_list=self.learn_programs_output[model_id])[0] + self.alg.sync_target( + gpu_id=self.gpu_id, + model_id=model_id, + share_vars_parallel_executor=self.learn_pe[model_id]) + return critic_loss + + def save_params(self, dirname): + for i in range(self.ensemble_num): + fluid.io.save_params( + executor=self.fluid_executor, + dirname=dirname, + main_program=self.learn_programs[i]) + + def load_params(self, dirname, from_one_head): + if from_one_head: + logger.info('[From one head, extend to multi head:]') + # load model 0 + fluid.io.load_params( + executor=self.fluid_executor, + dirname=dirname, + 
main_program=self.learn_programs[0])
+
+            # sync identity params of model/target_model 0 to other models/target_models
+            for i in range(1, self.ensemble_num):
+                params = list(
+                    filter(
+                        lambda x: 'identity' in x.name and '@GRAD' not in x.name,
+                        self.learn_programs[i].list_vars()))
+                for param in params:
+                    param_var = _fetch_var(param.name, return_numpy=False)
+
+                    model0_name = re.sub(r"identity_\d+", "identity_0",
+                                         param.name)
+                    model0_value = _fetch_var(model0_name, return_numpy=True)
+                    logger.info('{} -> {}'.format(model0_name, param.name))
+                    param_var.set(model0_value, self.place)
+
+            # sync shared params of target_model 0 to the other target models
+            # (after deepcopy, the shared params of the target models are no longer tied)
+            for i in range(1, self.ensemble_num):
+                params = list(
+                    filter(
+                        lambda x: 'shared' in x.name and 'PARL_target' in x.name and '@GRAD' not in x.name,
+                        self.learn_programs[i].list_vars()))
+                for param in params:
+                    param_var = _fetch_var(param.name, return_numpy=False)
+
+                    model0_name = re.sub(r"_\d+$", "_0", param.name)
+                    model0_value = _fetch_var(model0_name, return_numpy=True)
+                    logger.info('{} -> {}'.format(model0_name, param.name))
+                    param_var.set(model0_value, self.place)
+
+        else:
+            for i in range(self.ensemble_num):
+                fluid.io.load_params(
+                    executor=self.fluid_executor,
+                    dirname=dirname,
+                    main_program=self.learn_programs[i])
diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_model.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..29e3412651b3090ed235475686e5165218d387b5
--- /dev/null
+++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/opensim_model.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
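As a reference for readers, the action selection built by `MultiHeadDDPG.define_ensemble_predict` above can be summarized in plain NumPy. This is a simplified sketch, not the fluid graph: the function and variable names (`ensemble_select`, `actor_actions`, `critic_fns`) are made up, and the observation is folded into the critic callables for brevity.

```
import numpy as np


def ensemble_select(actor_actions, critic_fns):
    """Plain-NumPy sketch of MultiHeadDDPG.define_ensemble_predict.

    actor_actions: list of K action vectors (one per actor head).
    critic_fns: list of K callables; critic_fns[j](action) returns a scalar Q.
    """
    K = len(actor_actions)
    # score_matrix[i, j]: Q-value of actor i's action under critic j
    score_matrix = np.array(
        [[critic_fns[j](a) for j in range(K)] for a in actor_actions])
    # normalize each critic's scores by their sum over all actors (columns)
    norm_scores = score_matrix / score_matrix.sum(axis=0, keepdims=True)
    # each action's final score is its mean normalized score over all critics
    mean_scores = norm_scores.mean(axis=1)
    return actor_actions[int(np.argmax(mean_scores))]
```

Normalizing each critic's scores before averaging presumably keeps a single critic with a larger Q-value scale from dominating the vote across heads.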
+ +import parl.layers as layers +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from parl.framework.model_base import Model + + +class OpenSimModel(Model): + def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id=0, shared=True): + self.actor_model = ActorModel(obs_dim, vel_obs_dim, act_dim, model_id, + shared) + self.critic_model = CriticModel(obs_dim, vel_obs_dim, act_dim, + model_id, shared) + + def policy(self, obs): + return self.actor_model.policy(obs) + + def value(self, obs, action): + return self.critic_model.value(obs, action) + + def get_actor_params(self): + return self.actor_model.parameter_names + + +class ActorModel(Model): + def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id, shared): + hid0_size = 800 + hid1_size = 400 + hid2_size = 200 + vel_hid0_size = 200 + vel_hid1_size = 400 + + self.obs_dim = obs_dim + self.vel_obs_dim = vel_obs_dim + + # bottom layers + if shared: + scope_name = 'policy_shared' + else: + scope_name = 'policy_identity_{}'.format(model_id) + + self.fc0 = layers.fc( + size=hid0_size, + act='tanh', + param_attr=ParamAttr(name='{}/h0/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h0/b'.format(scope_name))) + self.fc1 = layers.fc( + size=hid1_size, + act='tanh', + param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name))) + self.vel_fc0 = layers.fc( + size=vel_hid0_size, + act='tanh', + param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name))) + self.vel_fc1 = layers.fc( + size=vel_hid1_size, + act='tanh', + param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name))) + + # top layers + scope_name = 'policy_identity_{}'.format(model_id) + + self.fc2 = layers.fc( + size=hid2_size, + act='tanh', + param_attr=ParamAttr(name='{}/h2/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h2/b'.format(scope_name))) + self.fc3 = layers.fc( + size=act_dim, + act='tanh', + param_attr=ParamAttr(name='{}/means/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/means/b'.format(scope_name))) + + def policy(self, obs): + real_obs = layers.slice( + obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim]) + # target related fetures + vel_obs = layers.slice( + obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim]) + + hid0 = self.fc0(real_obs) + hid1 = self.fc1(hid0) + vel_hid0 = self.vel_fc0(vel_obs) + vel_hid1 = self.vel_fc1(vel_hid0) + concat = layers.concat([hid1, vel_hid1], axis=1) + hid2 = self.fc2(concat) + means = self.fc3(hid2) + return means + + +class CriticModel(Model): + def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id, shared): + super(CriticModel, self).__init__() + hid0_size = 800 + hid1_size = 400 + vel_hid0_size = 200 + vel_hid1_size = 400 + + self.obs_dim = obs_dim + self.vel_obs_dim = vel_obs_dim + + # buttom layers + if shared: + scope_name = 'critic_shared' + else: + scope_name = 'critic_identity_{}'.format(model_id) + + self.fc0 = layers.fc( + size=hid0_size, + act='selu', + param_attr=ParamAttr(name='{}/w1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/w1/b'.format(scope_name))) + self.fc1 = layers.fc( + size=hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name))) + self.vel_fc0 = layers.fc( + size=vel_hid0_size, + act='selu', + param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)), + 
bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name))) + self.vel_fc1 = layers.fc( + size=vel_hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name))) + self.act_fc0 = layers.fc( + size=hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/a1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/a1/b'.format(scope_name))) + + # top layers + scope_name = 'critic_identity_{}'.format(model_id) + + self.fc2 = layers.fc( + size=hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/h3/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h3/b'.format(scope_name))) + self.fc3 = layers.fc( + size=1, + act='selu', + param_attr=ParamAttr(name='{}/value/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/value/b'.format(scope_name))) + + def value(self, obs, action): + real_obs = layers.slice( + obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim]) + # target related fetures + vel_obs = layers.slice( + obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim]) + + hid0 = self.fc0(real_obs) + hid1 = self.fc1(hid0) + vel_hid0 = self.vel_fc0(vel_obs) + vel_hid1 = self.vel_fc1(vel_hid0) + a1 = self.act_fc0(action) + concat = layers.concat([hid1, a1, vel_hid1], axis=1) + hid2 = self.fc2(concat) + Q = self.fc3(hid2) + Q = layers.squeeze(Q, axes=[1]) + return Q diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_client.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_client.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b0f7f7c42f4897dd0e3b0c92465e76359dde12 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_client.py @@ -0,0 +1,115 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
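Both the actor and the critic above rely on the same observation layout: the last `vel_obs_dim` entries are the target-velocity-related features, which go through the separate `vel_fc*` branch, while everything before them is the regular pelvis-based observation. A quick NumPy illustration of that slicing, using the dimensions this example uses elsewhere (OBS_DIM = 185 + 4, VEL_DIM = 4):

```
import numpy as np

OBS_DIM, VEL_OBS_DIM = 185 + 4, 4   # sizes used by this example
obs = np.zeros((1, OBS_DIM), dtype='float32')

real_obs = obs[:, :-VEL_OBS_DIM]    # pelvis-based features -> fc0 -> fc1 branch
vel_obs = obs[:, -VEL_OBS_DIM:]     # target-velocity features -> vel_fc0 -> vel_fc1 branch
assert real_obs.shape[1] == 185 and vel_obs.shape[1] == VEL_OBS_DIM
```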
+ +import grpc +import json +import numpy as np +import simulator_pb2 +import simulator_pb2_grpc +from args import get_client_args +from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, MAXTIME_LIMIT, CustomR2Env, RunFastestReward, FixedTargetSpeedReward, Round2Reward +from osim.env import ProstheticsEnv +from parl.utils import logger + +ProstheticsEnv.time_limit = MAXTIME_LIMIT + + +class Worker(object): + def __init__(self, server_ip='localhost', server_port=5007): + if args.ident is not None: + self.worker_id = args.ident + else: + self.worker_id = np.random.randint(int(1e18)) + + self.address = '{}:{}'.format(server_ip, server_port) + + random_seed = int(self.worker_id % int(1e9)) + np.random.seed(random_seed) + + env = ProstheticsEnv(visualize=False, seed=random_seed) + env.change_model( + model='3D', difficulty=1, prosthetic=True, seed=random_seed) + env.spec.timestep_limit = MAXTIME_LIMIT + env = CustomR2Env(env) + + if args.reward_type == 'RunFastest': + env = RunFastestReward(env) + elif args.reward_type == 'FixedTargetSpeed': + env = FixedTargetSpeedReward( + env, args.target_v, args.act_penalty_lowerbound, + args.act_penalty_coeff, args.vel_penalty_coeff) + elif args.reward_type == 'Round2': + env = Round2Reward(env, args.act_penalty_lowerbound, + args.act_penalty_coeff, args.vel_penalty_coeff) + else: + assert False, 'Not supported reward type!' + + env = FrameSkip(env, 4) + env = ActionScale(env) + self.env = PelvisBasedObs(env) + + def run(self): + observation = self.env.reset(project=False, stage=args.stage) + reward = 0 + done = False + info = {'shaping_reward': 0.0} + info['first'] = True + with grpc.insecure_channel(self.address) as channel: + stub = simulator_pb2_grpc.SimulatorStub(channel) + while True: + response = stub.Send( + simulator_pb2.Request( + observation=observation, + reward=reward, + done=done, + info=json.dumps(info), + id=self.worker_id)) + + extra = json.loads(response.extra) + + if 'reset' in extra and extra['reset']: + logger.info('Server require to reset!') + observation = self.env.reset( + project=False, stage=args.stage) + reward = 0 + done = False + info = {'shaping_reward': 0.0} + continue + + if 'shutdown' in extra and extra['shutdown']: + break + + action = np.array(response.action) + next_observation, reward, done, info = self.env.step( + action, project=False) + + # debug info + if args.debug: + logger.info("dim:{} obs:{} act:{} reward:{} done:{} info:{}".format(\ + len(observation), np.sum(observation), np.sum(action), reward, done, info)) + observation = next_observation + if done: + observation = self.env.reset( + project=False, stage=args.stage) + + # the last observation should be used to compute append_value in simulator_server + info['last_obs'] = next_observation.tolist() + + +if __name__ == '__main__': + args = get_client_args() + + worker = Worker(server_ip=args.ip, server_port=args.port) + + worker.run() diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..a1cce8c7a30eb5e400db0550a39087c0223f3191 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2.py @@ -0,0 +1,244 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: simulator.proto + +import sys +_b = sys.version_info[0] < 3 and (lambda x: x) or ( + lambda x: x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor.FileDescriptor( + name='simulator.proto', + package='simulator', + syntax='proto3', + serialized_pb=_b( + '\n\x0fsimulator.proto\x12\tsimulator\"V\n\x07Request\x12\x13\n\x0bobservation\x18\x01 \x03(\x01\x12\x0e\n\x06reward\x18\x02 \x01(\x01\x12\x0c\n\x04\x64one\x18\x03 \x01(\x08\x12\x0c\n\x04info\x18\x04 \x01(\t\x12\n\n\x02id\x18\x05 \x01(\x03\"&\n\x05Reply\x12\x0e\n\x06\x61\x63tion\x18\x01 \x03(\x01\x12\r\n\x05\x65xtra\x18\x02 \x01(\t2;\n\tSimulator\x12.\n\x04Send\x12\x12.simulator.Request\x1a\x10.simulator.Reply\"\x00\x62\x06proto3' + )) + +_REQUEST = _descriptor.Descriptor( + name='Request', + full_name='simulator.Request', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='observation', + full_name='simulator.Request.observation', + index=0, + number=1, + type=1, + cpp_type=5, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='reward', + full_name='simulator.Request.reward', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='done', + full_name='simulator.Request.done', + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='info', + full_name='simulator.Request.info', + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='id', + full_name='simulator.Request.id', + index=4, + number=5, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[], + 
serialized_start=30, + serialized_end=116, +) + +_REPLY = _descriptor.Descriptor( + name='Reply', + full_name='simulator.Reply', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='action', + full_name='simulator.Reply.action', + index=0, + number=1, + type=1, + cpp_type=5, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='extra', + full_name='simulator.Reply.extra', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None, + file=DESCRIPTOR), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[], + serialized_start=118, + serialized_end=156, +) + +DESCRIPTOR.message_types_by_name['Request'] = _REQUEST +DESCRIPTOR.message_types_by_name['Reply'] = _REPLY +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +Request = _reflection.GeneratedProtocolMessageType( + 'Request', + (_message.Message, ), + dict( + DESCRIPTOR=_REQUEST, + __module__='simulator_pb2' + # @@protoc_insertion_point(class_scope:simulator.Request) + )) +_sym_db.RegisterMessage(Request) + +Reply = _reflection.GeneratedProtocolMessageType( + 'Reply', + (_message.Message, ), + dict( + DESCRIPTOR=_REPLY, + __module__='simulator_pb2' + # @@protoc_insertion_point(class_scope:simulator.Reply) + )) +_sym_db.RegisterMessage(Reply) + +_SIMULATOR = _descriptor.ServiceDescriptor( + name='Simulator', + full_name='simulator.Simulator', + file=DESCRIPTOR, + index=0, + options=None, + serialized_start=158, + serialized_end=217, + methods=[ + _descriptor.MethodDescriptor( + name='Send', + full_name='simulator.Simulator.Send', + index=0, + containing_service=None, + input_type=_REQUEST, + output_type=_REPLY, + options=None, + ), + ]) +_sym_db.RegisterServiceDescriptor(_SIMULATOR) + +DESCRIPTOR.services_by_name['Simulator'] = _SIMULATOR + +# @@protoc_insertion_point(module_scope) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2_grpc.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2_grpc.py new file mode 100644 index 0000000000000000000000000000000000000000..6ff22c807769f8397e3a8744bc9c993d05fce719 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_pb2_grpc.py @@ -0,0 +1,61 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +import grpc + +import simulator_pb2 as simulator__pb2 + + +class SimulatorStub(object): + """The greeting service definition. + """ + + def __init__(self, channel): + """Constructor. 
+ + Args: + channel: A grpc.Channel. + """ + self.Send = channel.unary_unary( + '/simulator.Simulator/Send', + request_serializer=simulator__pb2.Request.SerializeToString, + response_deserializer=simulator__pb2.Reply.FromString, + ) + + +class SimulatorServicer(object): + """The greeting service definition. + """ + + def Send(self, request, context): + """Request Action + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_SimulatorServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Send': + grpc.unary_unary_rpc_method_handler( + servicer.Send, + request_deserializer=simulator__pb2.Request.FromString, + response_serializer=simulator__pb2.Reply.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'simulator.Simulator', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_server.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_server.py new file mode 100755 index 0000000000000000000000000000000000000000..f3b775e5d85ded5da637960b5f2d142e8c6659a6 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/simulator_server.py @@ -0,0 +1,336 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
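Taken together, the generated `simulator_pb2`/`simulator_pb2_grpc` modules above expose a `Request` message (`observation`, `reward`, `done`, `info`, `id`), a `Reply` message (`action`, `extra`), and a single `Simulator.Send` RPC. A minimal client-side round trip looks roughly like the sketch below; it mirrors how `simulator_client.py` uses the stub, and `localhost:5007` is just that client's default address.

```
import grpc
import json
import simulator_pb2
import simulator_pb2_grpc

with grpc.insecure_channel('localhost:5007') as channel:
    stub = simulator_pb2_grpc.SimulatorStub(channel)
    reply = stub.Send(
        simulator_pb2.Request(
            observation=[0.0] * 189,           # flattened observation vector
            reward=0.0,
            done=False,
            info=json.dumps({'first': True}),  # JSON-encoded metadata
            id=0))
    action = list(reply.action)                # action vector chosen by the server
    extra = json.loads(reply.extra)            # e.g. {'reset': True} or {'shutdown': True}
```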
+ +import grpc +import json +import numpy as np +import os +import queue +import simulator_pb2 +import simulator_pb2_grpc +import six +import time +import threading +from args import get_server_args +from collections import defaultdict +from concurrent import futures +from multi_head_ddpg import MultiHeadDDPG +from opensim_agent import OpenSimAgent +from opensim_model import OpenSimModel +from parl.utils import logger, ReplayMemory +from utils import calc_indicators, ScalarsManager, TransitionExperience + +ACT_DIM = 19 +VEL_DIM = 4 +OBS_DIM = 185 + VEL_DIM +GAMMA = 0.96 +TAU = 0.001 +ACTOR_LR = 3e-5 +CRITIC_LR = 3e-5 +TRAIN_TIMES = 100 +BATCH_SIZE = 128 +NOISE_DECAY = 0.999998 +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + + +class SimulatorServer(simulator_pb2_grpc.SimulatorServicer): + class ClientState(object): + def __init__(self): + self.memory = [] # list of Experience + self.ident = None + self.model_idx = np.random.randint(args.ensemble_num) + self.last_target_changed = 0 + self.target_change_times = 0 + + def reset(self): + self.last_target_changed = 0 + self.memory = [] + self.model_idx = np.random.randint(args.ensemble_num) + self.target_change_times = 0 + + def update_last_target_changed(self): + self.last_target_changed = len(self.memory) + + def __init__(self): + self.rpm = ReplayMemory(int(2e6), OBS_DIM, ACT_DIM) + + # Need acquire lock when model learning or predicting + self.locks = [] + for i in range(args.ensemble_num): + self.locks.append(threading.Lock()) + + models = [] + for i in range(args.ensemble_num): + models.append(OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i)) + + hyperparas = { + 'gamma': GAMMA, + 'tau': TAU, + 'ensemble_num': args.ensemble_num + } + alg = MultiHeadDDPG(models, hyperparas) + + self.agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, args.ensemble_num) + + self.scalars_manager = ScalarsManager(logger.get_dir()) + + # add lock when appending data to rpm or writing scalars to tensorboard + self.MEMORY_LOCK = threading.Lock() + + self.clients = defaultdict(self.ClientState) + + self.ready_client_queue = queue.Queue() + + self.noiselevel = 0.5 + self.global_step = 0 + + # thread to keep training + t = threading.Thread(target=self.keep_training) + t.start() + + def _new_ready_client(self): + """ The client is ready to start new episode, + but blocking until training thread call client_ready_event.set() + """ + client_ready_event = threading.Event() + self.ready_client_queue.put(client_ready_event) + logger.info( + "[new_ready_client] approximate size of ready clients:{}".format( + self.ready_client_queue.qsize())) + client_ready_event.wait() + + def Send(self, request, context): + """ Implement Send function in SimulatorServicer + Everytime a request comming, will create a new thread to handle + """ + ident, obs, reward, done, info = request.id, request.observation, request.reward, request.done, request.info + client = self.clients[ident] + info = json.loads(info) + + if 'first' in info: + # Waiting training thread to allow start new episode + self._new_ready_client() + + obs = np.array(obs, dtype=np.float32) + self._process_msg(ident, obs, reward, done, info) + + if done: + # Waiting training thread to allow start new episode + self._new_ready_client() + + action = self.pred_batch(obs, client.model_idx) + step = len(client.memory) - client.last_target_changed + + # whether to add noise depends on the ensemble_num + if args.ensemble_num == 1: + current_noise = self.noiselevel * (0.98**(step - 1)) + noise = np.zeros((ACT_DIM, ), dtype=np.float32) + if ident % 
3 == 0: + if step % 5 == 0: + noise = np.random.randn(ACT_DIM) * current_noise + elif ident % 3 == 1: + if step % 5 == 0: + noise = np.random.randn(ACT_DIM) * current_noise * 2 + action += noise + action = np.clip(action, -1, 1) + client.memory[-1].action = action + extra_info = {} + return simulator_pb2.Reply(action=action, extra=json.dumps(extra_info)) + + def _process_msg(self, ident, obs, reward, done, info): + client = self.clients[ident] + reward_scale = (1 - GAMMA) + info['shaping_reward'] *= reward_scale + if len(client.memory) > 0: + client.memory[-1].reward = reward + info['target_change_times'] = client.target_change_times + client.memory[-1].info = info + if info['target_changed']: + client.target_change_times = min( + client.target_change_times + 1, 3) + # re-sample model_idx after target was changed + client.model_idx = np.random.randint(args.ensemble_num) + if done: + assert 'last_obs' in info + self._parse_memory(client, ident, info['last_obs']) + client.memory.append( + TransitionExperience(obs=obs, action=None, reward=None, info=None)) + if 'target_changed' in info and info['target_changed']: + client.update_last_target_changed() + return False + + def _parse_memory(self, client, ident, last_obs): + mem = client.memory + n = len(mem) + + # debug info + if ident == 1: + for i, exp in enumerate(mem): + logger.info( + "[step:{}] obs:{} action:{} reward:{} shaping_reward:{}". + format(i, np.sum(mem[i].obs), np.sum(mem[i].action), + mem[i].reward, mem[i].info['shaping_reward'])) + + episode_rpm = [] + for i in range(n - 1): + if not mem[i].info['target_changed']: + episode_rpm.append([ + mem[i].obs, mem[i].action, mem[i].info['shaping_reward'], + mem[i + 1].obs, False, mem[i].info['target_change_times'] + ]) + if not mem[-1].info['target_changed']: + episode_rpm.append([ + mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'], + last_obs, not mem[-1].info['timeout'], + mem[i].info['target_change_times'] + ]) + + indicators_dict = calc_indicators(mem) + indicators_dict['free_client_num'] = self.ready_client_queue.qsize() + indicators_dict['noiselevel'] = self.noiselevel + + with self.MEMORY_LOCK: + self.add_episode_rpm(episode_rpm) + self.scalars_manager.record(indicators_dict, self.global_step) + self.global_step += 1 + if self.global_step >= 50: + self.noiselevel = self.noiselevel * NOISE_DECAY + + client.reset() + + def learn(self): + result_q = queue.Queue() + th_list = [] + for j in range(args.ensemble_num): + t = threading.Thread( + target=self.train_single_model, args=(j, result_q)) + th_list.append(t) + start_time = time.time() + for t in th_list: + t.start() + for t in th_list: + t.join() + + logger.info("[learn] {} heads, time consuming:{}".format( + args.ensemble_num, + time.time() - start_time)) + for t in th_list: + result = result_q.get() + for critic_loss in result: + self.scalars_manager.feed_critic_loss(critic_loss) + + def train_single_model(self, model_idx, result_q): + logger.info("[train_single_model] model_idx:{}".format(model_idx)) + critic_loss_list = [] + lock = self.locks[model_idx] + memory = self.rpm + + actor_lr = ACTOR_LR * (1.0 - 0.05 * model_idx) + critic_lr = CRITIC_LR * (1.0 + 0.1 * model_idx) + + for T in range(TRAIN_TIMES): + [states, actions, rewards, new_states, + dones] = memory.sample_batch(BATCH_SIZE) + lock.acquire() + critic_loss = self.agent.learn(states, actions, rewards, + new_states, dones, actor_lr, + critic_lr, model_idx) + lock.release() + critic_loss_list.append(critic_loss) + result_q.put(critic_loss_list) + + def 
keep_training(self): + episode_count = 1000000 + for T in range(episode_count): + if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs: + self.learn() + logger.info( + "[keep_training/{}] trying to acq a new env".format(T)) + + # Keep training and predicting balance + # After training, waiting for a ready client, and set the client start new episode + ready_client_event = self.ready_client_queue.get() + ready_client_event.set() + + if np.mod(T, 100) == 0: + logger.info("saving models") + self.save(T) + if np.mod(T, 10000) == 0: + logger.info("saving rpm") + self.save_rpm() + + def save_rpm(self): + save_path = os.path.join(logger.get_dir(), "rpm.npz") + self.rpm.save(save_path) + + def restore_rpm(self, rpm_dir): + self.rpm.load(rpm_dir) + + def save(self, T): + save_path = os.path.join(logger.get_dir(), + 'model_every_100_episodes/step-{}'.format(T)) + self.agent.save_params(save_path) + + def restore(self, model_path, restore_from_one_head): + logger.info('restore model from {}'.format(model_path)) + self.agent.load_params(model_path, restore_from_one_head) + + def add_episode_rpm(self, episode_rpm): + for x in episode_rpm: + self.rpm.append( + obs=x[0], act=x[1], reward=x[2], next_obs=x[3], terminal=x[4]) + + def pred_batch(self, obs, model_idx=None): + assert model_idx is not None + batch_obs = np.expand_dims(obs, axis=0) + self.locks[model_idx].acquire() + action = self.agent.predict(batch_obs, model_idx) + self.locks[model_idx].release() + action = np.squeeze(action, axis=0) + return action + + +class SimulatorHandler(threading.Thread): + def __init__(self, simulator_server): + threading.Thread.__init__(self) + self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=400)) + simulator_pb2_grpc.add_SimulatorServicer_to_server( + simulator_server, self.server) + self.server.add_insecure_port('[::]:{}'.format(args.port)) + + def run(self): + self.server.start() + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + self.server.stop(0) + + +if __name__ == '__main__': + args = get_server_args() + + if args.logdir is not None: + logger.set_dir(args.logdir) + + simulator_server = SimulatorServer() + + if args.restore_rpm_path is not None: + simulator_server.restore_rpm(args.restore_rpm_path) + if args.restore_model_path is not None: + simulator_server.restore(args.restore_model_path, + args.restore_from_one_head) + + simulator_hanlder = SimulatorHandler(simulator_server=simulator_server) + simulator_hanlder.run() diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/test.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/test.py index 06613be69fd18ad38b49a74ad89d7de02973cfbe..58174f4ecc024e4831206eef8f426dc44ac09a8c 100644 --- a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/test.py +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/test.py @@ -15,74 +15,82 @@ import argparse import numpy as np import time -from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, ForwardReward +from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, TestReward +from multi_head_ddpg import MultiHeadDDPG +from opensim_agent import OpenSimAgent +from opensim_model import OpenSimModel from osim.env import ProstheticsEnv from parl.utils import logger -from submit_model import SubmitModel +""" +Test model with ensemble predict +""" -def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0): +def play_multi_episode(agent, episode_num=2, vis=False, seed=0): np.random.seed(seed) env = ProstheticsEnv(visualize=vis) 
env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed) - env = ForwardReward(env) + env = TestReward(env) env = FrameSkip(env, 4) env = ActionScale(env) env = PelvisBasedObs(env) + all_reward = [] - all_shaping_reward = 0 - last_frames_count = 0 for e in range(episode_num): t = time.time() episode_reward = 0.0 - episode_shaping_reward = 0.0 - observation = env.reset(project=False) - target_change_times = 0 + obs = env.reset(project=False) step = 0 - loss = [] while True: step += 1 - action = submit_model.pred_batch(observation, target_change_times) - observation, reward, done, info = env.step(action, project=False) - step_frames = info['frame_count'] - last_frames_count - last_frames_count = info['frame_count'] + + batch_obs = np.expand_dims(obs, axis=0) + + action = agent.ensemble_predict(batch_obs) + action = np.squeeze(action, axis=0) + obs, reward, done, info = env.step(action, project=False) episode_reward += reward - # we pacle it here to drop the first step after changing - if target_change_times >= 1: - loss.append(10 * step_frames - reward) - if info['target_changed']: - target_change_times = min(target_change_times + 1, 3) - logger.info("[step/{}]reward:{} info:{}".format( - step, reward, info)) - episode_shaping_reward += info['shaping_reward'] + logger.info("[step/{}]reward:{}".format(step, reward)) if done: break all_reward.append(episode_reward) - all_shaping_reward += episode_shaping_reward t = time.time() - t logger.info( - "[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}" - .format(e, t, episode_reward, np.sum(loss[:15]), np.sum(loss[15:]), - np.mean(all_reward))) + "[episode/{}] time: {} episode_reward:{} mean_reward:{}".format( + e, t, episode_reward, np.mean(all_reward))) logger.info("Mean reward:{}".format(np.mean(all_reward))) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( - '--use_cuda', action="store_true", help='If set, will run in gpu 0') + '--restore_model_path', type=str, help='restore model path for test') parser.add_argument( '--vis', action="store_true", help='If set, will visualize.') parser.add_argument('--seed', type=int, default=0, help='Random seed.') parser.add_argument( '--episode_num', type=int, default=1, help='Episode number to run.') + parser.add_argument('--ensemble_num', type=int, help='ensemble_num') args = parser.parse_args() - submit_model = SubmitModel(use_cuda=args.use_cuda) + ACT_DIM = 19 + VEL_DIM = 4 + OBS_DIM = 185 + VEL_DIM + GAMMA = 0.96 + TAU = 0.001 + models = [] + for i in range(args.ensemble_num): + models.append(OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i)) + hyperparas = { + 'gamma': GAMMA, + 'tau': TAU, + 'ensemble_num': args.ensemble_num + } + alg = MultiHeadDDPG(models, hyperparas) + agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, args.ensemble_num) + + agent.load_params(args.restore_model_path) play_multi_episode( - submit_model, - episode_num=args.episode_num, - vis=args.vis, - seed=args.seed) + agent, episode_num=args.episode_num, vis=args.vis, seed=args.seed) diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/utils.py b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7182ea1731d984614a37d9904f8da09ac331c4 --- /dev/null +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/utils.py @@ -0,0 +1,363 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import numpy as np +import tensorflow as tf +import os +import six + + +class Summary(object): + """Logging in tensorboard without tensorflow ops. + + Simple example on how to log scalars and images to tensorboard without tensor ops. + License: Copyleft + + __author__ = "Michael Gygli" + """ + + def __init__(self, logdir): + """Creates a summary writer logging to logdir.""" + self.writer = tf.summary.FileWriter(logdir) + + def log_scalar(self, tag, value, step): + """Log a scalar variable. + Parameter + ---------- + tag : basestring + Name of the scalar + value + step : int + training iteration + """ + summary = tf.Summary( + value=[tf.Summary.Value(tag=tag, simple_value=value)]) + self.writer.add_summary(summary, step) + self.writer.flush() + + +class StatCounter(object): + """ A simple counter""" + + def __init__(self, max_size=50): + self.reset() + self.max_size = max_size + + def feed(self, v): + """ + Args: + v(float or np.ndarray): has to be the same shape between calls. + """ + self._values.append(v) + if len(self._values) > self.max_size: + self._values = self._values[-self.max_size:] + + def reset(self): + self._values = [] + + @property + def count(self): + return len(self._values) + + @property + def mean(self): + assert len(self._values) + return np.mean(self._values) + + @property + def sum(self): + assert len(self._values) + return np.sum(self._values) + + @property + def max(self): + assert len(self._values) + return max(self._values) + + @property + def min(self): + assert len(self._values) + return min(self._values) + + @property + def success_rate(self): + count = 0 + for v in self._values: + if v > 35.0: + count += 1 + return float(count) / len(self._values) + + +def calc_indicators(mem): + START_STEPS = 15 + n = len(mem) + episode_shaping_reward = np.sum( + [exp.info['shaping_reward'] for exp in mem]) + episode_r2_reward = np.sum([ + exp.info['r2_reward'] for exp in mem if exp.info['frame_count'] <= 1000 + ]) + x_offset_reward = np.sum(exp.info['x_offset_reward'] for exp in mem) + + episode_length = mem[-1].info['frame_count'] + + scalar_vel = np.mean([exp.info['scalar_vel'] for exp in mem]) + action_l2_penalty = np.mean( + [exp.info['mean_action_l2_penalty'] for exp in mem]) + + start_loss = 10 * START_STEPS * 4 - np.sum([exp.reward + for exp in mem][:START_STEPS]) + all_start_loss = 0 + for i in range(n): + if not mem[i].info['target_changed']: + frame_count = 4 + if i - 1 >= 0: + frame_count = mem[i].info['frame_count'] - mem[ + i - 1].info['frame_count'] + all_start_loss += 10.0 * frame_count - mem[i].reward + else: + break + start_other_loss = all_start_loss - start_loss + first_change_loss = 0 + second_change_loss = 0 + third_change_loss = 0 + first_change_other_loss = 0 + second_change_other_loss = 0 + third_change_other_loss = 0 + first_stage_vel = 0.0 + second_stage_vel = 0.0 + third_stage_vel = 0.0 + other_loss = 0 + + change_loss = [] + all_change_loss = [] + change_vel = [] + + for i in range(n - 1): 
+ if mem[i].info['target_changed']: + change_loss.append(0.0) + all_change_loss.append(0.0) + change_vel.append([]) + for j in range(START_STEPS): + idx = i + 1 + j + if idx >= n or mem[idx].info['target_changed']: + break + frame_count = 4 + if idx - 1 >= 0: + frame_count = mem[idx].info['frame_count'] - mem[ + idx - 1].info['frame_count'] + change_loss[-1] += 10.0 * frame_count - mem[idx].reward + for j in range(n - i - 1): + idx = i + 1 + j + if idx >= n or mem[idx].info['target_changed']: + break + if idx - 1 >= 0: + frame_count = mem[idx].info['frame_count'] - mem[ + idx - 1].info['frame_count'] + all_change_loss[-1] += 10.0 * frame_count - mem[idx].reward + change_vel[-1].append(mem[idx].info['scalar_vel']) + + if len(change_loss) >= 1: + first_change_loss = change_loss[0] + first_change_other_loss = all_change_loss[0] - change_loss[0] + if len(change_loss) >= 2: + second_change_loss = change_loss[1] + second_change_other_loss = all_change_loss[1] - change_loss[1] + if len(change_loss) >= 3: + third_change_loss = change_loss[2] + third_change_other_loss = all_change_loss[2] - change_loss[2] + other_loss = 10 * mem[-1].info[ + 'frame_count'] - start_loss - first_change_loss - second_change_loss - third_change_loss - episode_r2_reward + if len(change_vel) >= 1: + first_stage_vel = np.mean(change_vel[0]) + if len(change_vel) >= 2: + second_stage_vel = np.mean(change_vel[1]) + if len(change_vel) >= 3: + third_stage_vel = np.mean(change_vel[2]) + + indicators_dict = { + 'episode_shaping_reward': episode_shaping_reward, + 'episode_r2_reward': episode_r2_reward, + 'x_offset_reward': x_offset_reward, + 'episode_length': episode_length, + 'scalar_vel': scalar_vel, + 'mean_action_l2_penalty': action_l2_penalty, + 'start_loss': start_loss, + 'first_change_loss': first_change_loss, + 'second_change_loss': second_change_loss, + 'third_change_loss': third_change_loss, + 'start_other_loss': start_other_loss, + 'first_change_other_loss': first_change_other_loss, + 'second_change_other_loss': second_change_other_loss, + 'third_change_other_loss': third_change_other_loss, + 'other_loss': other_loss, + 'first_stage_vel': first_stage_vel, + 'second_stage_vel': second_stage_vel, + 'third_stage_vel': third_stage_vel + } + return indicators_dict + + +class ScalarsManager(object): + def __init__(self, logdir): + self.summary = Summary(logdir=logdir) + + self.max_shaping_reward = 0 + self.max_x_offset_reward = 0 + self.max_r2_reward = 0 + + self.critic_loss_counter = StatCounter(max_size=500) + + self.r2_reward_counter = StatCounter(max_size=500) + self.nofall_r2_reward_counter = StatCounter(max_size=500) + self.falldown_counter100 = StatCounter(max_size=100) + + self.vel_keys = [ + 'scalar_vel', 'first_stage_vel', 'second_stage_vel', + 'third_stage_vel' + ] + self.vel_counter = {} + for key in self.vel_keys: + self.vel_counter[key] = StatCounter(max_size=500) + self.reward_loss_keys = [ + 'start_loss', 'first_change_loss', 'second_change_loss', + 'third_change_loss', 'start_other_loss', 'first_change_other_loss', + 'second_change_other_loss', 'third_change_other_loss', 'other_loss' + ] + self.reward_loss_counter = {} + for key in self.reward_loss_keys: + self.reward_loss_counter[key] = StatCounter(max_size=500) + + def feed_critic_loss(self, critic_loss): + self.critic_loss_counter.feed(critic_loss) + + def record(self, record_dict, global_step): + self.max_shaping_reward = max(self.max_shaping_reward, + record_dict['episode_shaping_reward']) + self.max_x_offset_reward = max(self.max_x_offset_reward, + 
record_dict['x_offset_reward']) + self.max_r2_reward = max(self.max_r2_reward, + record_dict['episode_r2_reward']) + + self.r2_reward_counter.feed(record_dict['episode_r2_reward']) + if record_dict['episode_length'] >= 1000: # no falldown + self.nofall_r2_reward_counter.feed( + record_dict['episode_r2_reward']) + self.falldown_counter100.feed(0.0) + else: + self.falldown_counter100.feed(1.0) + + for key in self.reward_loss_keys: + self.reward_loss_counter[key].feed(record_dict[key]) + for key in self.vel_keys: + self.vel_counter[key].feed(record_dict[key]) + + self.summary.log_scalar('performance/falldown_rate', + self.falldown_counter100.sum / 100.0, + global_step) + self.summary.log_scalar('performance/max_r2_reward', + self.max_r2_reward, global_step) + self.summary.log_scalar('performance/max_shaping_reward', + self.max_shaping_reward, global_step) + self.summary.log_scalar('performance/max_x_offset_reward', + self.max_x_offset_reward, global_step) + self.summary.log_scalar('performance/episode_r2_reward', + record_dict['episode_r2_reward'], global_step) + self.summary.log_scalar('performance/episode_shaping_reward', + record_dict['episode_shaping_reward'], + global_step) + self.summary.log_scalar('performance/x_offset_reward', + record_dict['x_offset_reward'], global_step) + self.summary.log_scalar('performance/episode_length', + record_dict['episode_length'], global_step) + self.summary.log_scalar('performance/mean_action_l2_penalty', + record_dict['mean_action_l2_penalty'], + global_step) + + self.summary.log_scalar('server/free_client_num', + record_dict['free_client_num'], global_step) + + self.summary.log_scalar('model/noiselevel', record_dict['noiselevel'], + global_step) + if self.critic_loss_counter.count > 0: + mean_critic_loss = self.critic_loss_counter.mean + self.summary.log_scalar('model/critic_loss', mean_critic_loss, + global_step) + + if self.r2_reward_counter.count > 400: + mean_r2_reward = self.r2_reward_counter.mean + self.summary.log_scalar('performance/recent_r2_reward', + mean_r2_reward, global_step) + mean_nofall_r2_reward = self.nofall_r2_reward_counter.mean + self.summary.log_scalar('performance/recent_nofall_r2_reward', + mean_nofall_r2_reward, global_step) + + for key in self.vel_keys: + self.summary.log_scalar('scalar_vel/{}'.format(key), + self.vel_counter[key].mean, + global_step) + + for key in self.reward_loss_keys: + if 'first' in key: + self.summary.log_scalar('1_stage_loss_reward/' + key, + self.reward_loss_counter[key].mean, + global_step) + elif 'second' in key: + self.summary.log_scalar('2_stage_loss_reward/' + key, + self.reward_loss_counter[key].mean, + global_step) + elif 'third' in key: + self.summary.log_scalar('3_stage_loss_reward/' + key, + self.reward_loss_counter[key].mean, + global_step) + elif 'start' in key: + self.summary.log_scalar('0_stage_loss_reward/' + key, + self.reward_loss_counter[key].mean, + global_step) + else: + self.summary.log_scalar('loss_reward/' + key, + self.reward_loss_counter[key].mean, + global_step) + self.summary.log_scalar( + '0_stage_loss_reward/stage_loss', + self.reward_loss_counter['start_loss'].mean + + self.reward_loss_counter['start_other_loss'].mean, global_step) + self.summary.log_scalar( + '1_stage_loss_reward/stage_loss', + self.reward_loss_counter['first_change_loss'].mean + + self.reward_loss_counter['first_change_other_loss'].mean, + global_step) + self.summary.log_scalar( + '2_stage_loss_reward/stage_loss', + self.reward_loss_counter['second_change_loss'].mean + + 
self.reward_loss_counter['second_change_other_loss'].mean, + global_step) + self.summary.log_scalar( + '3_stage_loss_reward/stage_loss', + self.reward_loss_counter['third_change_loss'].mean + + self.reward_loss_counter['third_change_other_loss'].mean, + global_step) + + +class TransitionExperience(object): + """ A transition of state, or experience""" + + def __init__(self, obs, action, reward, info, **kwargs): + """ kwargs: whatever other attribute you want to save""" + self.obs = obs + self.action = action + self.reward = reward + self.info = info + for k, v in six.iteritems(kwargs): + setattr(self, k, v) diff --git a/examples/PPO/train.py b/examples/PPO/train.py index 35d9aa7346daaf40ea179236ed66aceda39865a4..4b9ead54097d8bbc524fabdd79ac945d6d6136d2 100755 --- a/examples/PPO/train.py +++ b/examples/PPO/train.py @@ -25,12 +25,11 @@ from utils import * def run_train_episode(env, agent, scaler): obs = env.reset() observes, actions, rewards, unscaled_obs = [], [], [], [] - done = False step = 0.0 scale, offset = scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature - while not done: + while True: obs = obs.reshape((1, -1)) obs = np.append(obs, [[step]], axis=1) # add time step feature unscaled_obs.append(obs) @@ -50,6 +49,9 @@ def run_train_episode(env, agent, scaler): rewards.append(reward) step += 1e-3 # increment time step feature + if done: + break + return (np.concatenate(observes), np.concatenate(actions), np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs)) @@ -75,6 +77,7 @@ def run_evaluate_episode(env, agent, scaler): rewards.append(reward) step += 1e-3 # increment time step feature + if done: break return np.sum(rewards) @@ -153,7 +156,6 @@ def main(): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument( '--env', @@ -186,6 +188,5 @@ if __name__ == "__main__": default='CLIP') args = parser.parse_args() - import time - logger.set_dir('./log_dir/{}_{}'.format(args.loss_type, time.time())) + main() diff --git a/parl/framework/agent_base.py b/parl/framework/agent_base.py index 6fbe81d6c744ae2cfd00b98f8d5372bac8e4e6de..02229ac355c199a4d130ffd6134e6f4b3e245f6a 100644 --- a/parl/framework/agent_base.py +++ b/parl/framework/agent_base.py @@ -48,8 +48,9 @@ class Agent(object): if gpu_id is None: gpu_id = 0 if get_gpu_count() > 0 else -1 self.gpu_id = gpu_id - place = fluid.CUDAPlace(gpu_id) if gpu_id >= 0 else fluid.CPUPlace() - self.fluid_executor = fluid.Executor(place) + self.place = fluid.CUDAPlace( + gpu_id) if gpu_id >= 0 else fluid.CPUPlace() + self.fluid_executor = fluid.Executor(self.place) self.fluid_executor.run(fluid.default_startup_program()) def build_program(self): diff --git a/parl/framework/model_base.py b/parl/framework/model_base.py index 37ba5915081cf1613143822188feaf61d2c7bdac..3adee9a7f2bc6717122b4c4d614ae319edaccc45 100644 --- a/parl/framework/model_base.py +++ b/parl/framework/model_base.py @@ -27,13 +27,19 @@ class Network(object): A Network is an unordered set of LayerFuncs or Networks. """ - def sync_params_to(self, target_net, gpu_id=0, decay=0.0): + def sync_params_to(self, + target_net, + gpu_id=0, + decay=0.0, + share_vars_parallel_executor=None): """ Args: target_net: Network object deepcopy from source network gpu_id: gpu id of target_net decay: Float. The decay to use. 
target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights + share_vars_parallel_executor: if not None, will use fluid.ParallelExecutor + to run program instead of fluid.Executor """ args_hash_id = hashlib.md5('{}_{}_{}'.format( id(target_net), gpu_id, decay).encode('utf-8')).hexdigest() @@ -59,9 +65,6 @@ class Network(object): param_pairs = get_parameter_pairs(self, target_net) - place = fluid.CPUPlace() if gpu_id < 0 \ - else fluid.CUDAPlace(gpu_id) - self._cached_fluid_executor = fluid.Executor(place) self._cached_sync_params_program = fluid.Program() with fluid.program_guard(self._cached_sync_params_program): @@ -71,7 +74,34 @@ class Network(object): fluid.layers.assign( decay * target_var + (1 - decay) * src_var, target_var) - self._cached_fluid_executor.run(self._cached_sync_params_program) + if share_vars_parallel_executor is None: + # use fluid.Executor + place = fluid.CPUPlace() if gpu_id < 0 \ + else fluid.CUDAPlace(gpu_id) + self._cached_fluid_executor = fluid.Executor(place) + else: + # use fluid.ParallelExecutor + use_cuda = True if gpu_id >= 0 else False + + # specify strategy to make ParallelExecutor run faster + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = 4 + build_strategy = fluid.BuildStrategy() + build_strategy.remove_unnecessary_lock = True + + with fluid.scope_guard(fluid.global_scope().new_scope()): + self._cached_fluid_executor = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=self._cached_sync_params_program, + share_vars_from=share_vars_parallel_executor, + exec_strategy=exec_strategy, + build_strategy=build_strategy, + ) + if share_vars_parallel_executor is None: + self._cached_fluid_executor.run(self._cached_sync_params_program) + else: + self._cached_fluid_executor.run(fetch_list=[]) @property def parameter_names(self): diff --git a/parl/utils/__init__.py b/parl/utils/__init__.py index 51b6b5c064daca1853da60d5779d9c806be9315c..c434b9f8d14a7f09622803394ba8a041930c5b4a 100644 --- a/parl/utils/__init__.py +++ b/parl/utils/__init__.py @@ -14,3 +14,4 @@ from parl.utils.utils import * from parl.utils.gputils import * +from parl.utils.replay_memory import * diff --git a/parl/utils/logger.py b/parl/utils/logger.py index c9036339f6d2f16c2685c87086b5685938257789..b53dba5a045859a0986dedafe20887cae73c3cea 100644 --- a/parl/utils/logger.py +++ b/parl/utils/logger.py @@ -101,7 +101,10 @@ for level in _LOGGING_LEVEL: def _set_file(path): global _FILE_HANDLER if os.path.isfile(path): - os.remove(path) + try: + os.remove(path) + except OSError: + pass hdl = logging.FileHandler(filename=path, encoding='utf-8', mode='w') hdl.setFormatter(_Formatter(datefmt='%m-%d %H:%M:%S')) diff --git a/parl/utils/replay_memory.py b/parl/utils/replay_memory.py new file mode 100755 index 0000000000000000000000000000000000000000..051c6fb30f94b933bdafcfad16d284a1a1610432 --- /dev/null +++ b/parl/utils/replay_memory.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from parl.utils import logger
+
+__all__ = ['ReplayMemory']
+
+
+class ReplayMemory(object):
+    def __init__(self, max_size, obs_dim, act_dim):
+        self.max_size = int(max_size)
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+
+        self.obs = np.zeros((max_size, obs_dim), dtype='float32')
+        self.action = np.zeros((max_size, act_dim), dtype='float32')
+        self.reward = np.zeros((max_size, ), dtype='float32')
+        self.terminal = np.zeros((max_size, ), dtype='bool')
+        self.next_obs = np.zeros((max_size, obs_dim), dtype='float32')
+
+        self._curr_size = 0
+        self._curr_pos = 0
+
+    def sample_batch(self, batch_size):
+        # shift indices so that we never sample the slots right after the
+        # write position; those may be overwritten by concurrent appends
+        batch_idx = np.random.randint(
+            self._curr_size - 300 - 1, size=batch_size)
+        batch_idx = (self._curr_pos + 300 + batch_idx) % self._curr_size
+
+        obs = self.obs[batch_idx]
+        reward = self.reward[batch_idx]
+        action = self.action[batch_idx]
+        next_obs = self.next_obs[batch_idx]
+        terminal = self.terminal[batch_idx]
+        return obs, action, reward, next_obs, terminal
+
+    def append(self, obs, act, reward, next_obs, terminal):
+        if self._curr_size < self.max_size:
+            self._curr_size += 1
+        self.obs[self._curr_pos] = obs
+        self.action[self._curr_pos] = act
+        self.reward[self._curr_pos] = reward
+        self.next_obs[self._curr_pos] = next_obs
+        self.terminal[self._curr_pos] = terminal
+        self._curr_pos = (self._curr_pos + 1) % self.max_size
+
+    def size(self):
+        return self._curr_size
+
+    def save(self, pathname):
+        other = np.array([self._curr_size, self._curr_pos], dtype=np.int32)
+        np.savez(
+            pathname,
+            obs=self.obs,
+            action=self.action,
+            reward=self.reward,
+            terminal=self.terminal,
+            next_obs=self.next_obs,
+            other=other)
+
+    def load(self, pathname):
+        data = np.load(pathname)
+        other = data['other']
+        if int(other[0]) > self.max_size:
+            logger.warn('loading from a bigger size rpm!')
+        self._curr_size = min(int(other[0]), self.max_size)
+        self._curr_pos = min(int(other[1]), self.max_size - 1)
+
+        self.obs[:self._curr_size] = data['obs'][:self._curr_size]
+        self.action[:self._curr_size] = data['action'][:self._curr_size]
+        self.reward[:self._curr_size] = data['reward'][:self._curr_size]
+        self.terminal[:self._curr_size] = data['terminal'][:self._curr_size]
+        self.next_obs[:self._curr_size] = data['next_obs'][:self._curr_size]
+        logger.info("[load rpm] memory loaded from {}".format(pathname))
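For reference, a minimal usage sketch of this `ReplayMemory`; the dimensions are the ones used by the prosthetics example, and the random transitions are placeholders.

```
import numpy as np
from parl.utils import ReplayMemory

obs_dim, act_dim = 189, 19                       # dimensions used in the prosthetics example
rpm = ReplayMemory(max_size=int(2e6), obs_dim=obs_dim, act_dim=act_dim)

# fill with (obs, action, reward, next_obs, terminal) transitions
for _ in range(1000):
    rpm.append(
        obs=np.random.randn(obs_dim).astype('float32'),
        act=np.random.randn(act_dim).astype('float32'),
        reward=0.0,
        next_obs=np.random.randn(obs_dim).astype('float32'),
        terminal=False)

# sample_batch skips the ~300 slots next to the write position, so only
# sample once the memory holds comfortably more than 300 + batch_size items
if rpm.size() > 300 + 128:
    obs, act, reward, next_obs, terminal = rpm.sample_batch(128)

rpm.save('rpm.npz')                              # checkpoint to disk
rpm.load('rpm.npz')                              # restore (e.g. via --restore_rpm_path)
```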