From 7c4063860385f5d7e2584b88ca7b326730610615 Mon Sep 17 00:00:00 2001
From: Hongsheng Zeng
Date: Mon, 4 Nov 2019 14:03:25 +0800
Subject: [PATCH] Final submitted models of NeurIPS2019 challenge (#168)

* final submit models of NeurIPS2019 challenge

* update readme

* fix yapf

* refine comment
---
 .../README.md                            |  18 ++
 .../final_submit/env_wrapper.py          | 292 ++++++++++++++++++
 .../final_submit/mlp_model.py            | 181 +++++++++++
 .../final_submit/official_obs_scaler.npz | Bin 0 -> 2248 bytes
 .../final_submit/submit_model.py         | 153 +++++++++
 .../final_submit/test.py                 |  72 +++++
 6 files changed, 716 insertions(+)
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/README.md
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/env_wrapper.py
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/mlp_model.py
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/official_obs_scaler.npz
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/submit_model.py
 create mode 100644 examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/test.py

diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/README.md b/examples/NeurIPS2019-Learn-to-Move-Challenge/README.md
new file mode 100644
index 0000000..5fdd7dc
--- /dev/null
+++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/README.md
+
+## Dependencies
+- python3.6
+- [paddlepaddle>=1.5.2](https://github.com/PaddlePaddle/Paddle)
+- [parl>=1.2](https://github.com/PaddlePaddle/PARL)
+- [osim-rl==3.0.11](https://github.com/stanfordnmbl/osim-rl)
+
+
+## Part 1: Final submitted model
+### Test
+- How to Run
+
+  1. Enter the sub-folder `final_submit`
+  2. Download the model file from an online storage service: [Baidu Pan](https://pan.baidu.com/s/12LIPspckCT8-Q5U1QX69Fg) (password: `b5ck`) or [Google Drive](https://drive.google.com/file/d/1jJtOcOVJ6auz3s-TyWgUJvofPXI94yxy/view?usp=sharing)
+  3. Unpack the file:
+     `tar zxvf saved_models.tar.gz`
+  4. Launch the test script:
+     `python test.py`
diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/env_wrapper.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/env_wrapper.py
new file mode 100644
index 0000000..463af4f
--- /dev/null
+++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/env_wrapper.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
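+
+# Wrapper stack used by the final submission (see the __main__ block below):
+#   L2M2019Env -> ForwardReward -> FrameSkip(4) -> ActionScale -> OfficialObs
+# RewardShaping subclasses (e.g. ForwardReward) must wrap the raw env first.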
+
+import abc
+import copy
+import gym
+import math
+import numpy as np
+from collections import OrderedDict
+from osim.env import L2M2019Env
+from parl.utils import logger
+
+MAXTIME_LIMIT = 2500
+L2M2019Env.time_limit = MAXTIME_LIMIT
+FRAME_SKIP = None
+FALL_PENALTY = 0
+
+
+class ActionScale(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+
+    def step(self, action, **kwargs):
+        action = (np.copy(action) + 1.0) * 0.5
+        action = np.clip(action, 0.0, 1.0)
+        return self.env.step(action, **kwargs)
+
+    def reset(self, **kwargs):
+        return self.env.reset(**kwargs)
+
+
+class FrameSkip(gym.Wrapper):
+    def __init__(self, env, k):
+        gym.Wrapper.__init__(self, env)
+        self.frame_skip = k
+        global FRAME_SKIP
+        FRAME_SKIP = k
+        self.frame_count = 0
+
+    def step(self, action, **kwargs):
+        r = 0.0
+        merge_info = {}
+        for k in range(self.frame_skip):
+            self.frame_count += 1
+            obs, reward, done, info = self.env.step(action, **kwargs)
+            r += reward
+
+            for key in info.keys():
+                if 'reward' in key:
+                    # ensure that we do not silently ignore other rewards;
+                    # if a new reward key is added, handle its logic here
+                    assert (key == 'shaping_reward') or (
+                        key == 'env_reward') or (key == 'x_offset_reward')
+                    merge_info[key] = merge_info.get(key, 0.0) + info[key]
+                else:
+                    merge_info[key] = info[key]
+
+            if info['target_changed']:
+                logger.warn("[FrameSkip] early break since target was changed")
+                break
+
+            if done:
+                break
+        merge_info['frame_count'] = self.frame_count
+        return obs, r, done, merge_info
+
+    def reset(self, **kwargs):
+        self.frame_count = 0
+        return self.env.reset(**kwargs)
+
+
+class RewardShaping(gym.Wrapper):
+    """A wrapper for reward shaping. Note: this wrapper must be the first
+    one applied to the raw env."""
+
+    def __init__(self, env):
+        logger.info("[RewardShaping]type:{}".format(type(env)))
+
+        self.step_count = 0
+        self.pre_state_desc = None
+        self.last_target_vel = None
+        self.last_target_change_step = 0
+        gym.Wrapper.__init__(self, env)
+
+    @abc.abstractmethod
+    def reward_shaping(self, state_desc, reward, done, action):
+        """Define your own reward computation function.
+
+        Args:
+            state_desc(dict): state description for current model
+            reward(scalar): generic reward generated by env
+            done(bool): generic done flag generated by env
+            action(np.array): action taken in this step
+        """
+        pass
+
+    def step(self, action, **kwargs):
+        self.step_count += 1
+        obs, r, done, info = self.env.step(action, **kwargs)
+        info = self.reward_shaping(obs, r, done, action)
+
+        target_vel = np.linalg.norm(
+            [obs['v_tgt_field'][0][5][5], obs['v_tgt_field'][1][5][5]])
+        info['target_changed'] = False
+        if self.last_target_vel is not None:
+            if np.abs(target_vel - self.last_target_vel) > 0.2:
+                self.last_target_change_step = self.step_count
+                info['target_changed'] = True
+        info['last_target_change_step'] = self.last_target_change_step
+        self.last_target_vel = target_vel
+
+        assert 'shaping_reward' in info
+        timeout = False
+        if self.step_count >= MAXTIME_LIMIT:
+            timeout = True
+        if done and not timeout:
+            # penalty for falling down
+            info['shaping_reward'] += FALL_PENALTY
+        info['timeout'] = timeout
+        self.pre_state_desc = obs
+        return obs, r, done, info
+
+    def reset(self, **kwargs):
+        self.step_count = 0
+        self.last_target_vel = None
+        self.last_target_change_step = 0
+        obs = self.env.reset(**kwargs)
+        self.pre_state_desc = obs
+        return obs
+
+
+class ForwardReward(RewardShaping):
+    """A reward shaping wrapper that passes the env reward through unchanged
+    as `shaping_reward`."""
+
+    def __init__(self, env):
+        RewardShaping.__init__(self, env)
+
+    def reward_shaping(self, state_desc, r2_reward, done, action):
+        info = {'shaping_reward': r2_reward}
+        return info
+
+
+class ObsTranformerBase(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.step_fea = MAXTIME_LIMIT
+        self.raw_obs = None
+        global FRAME_SKIP
+        self.frame_skip = int(FRAME_SKIP)
+
+    def get_observation(self, state_desc):
+        obs = self._get_observation(state_desc)
+        return obs
+
+    @abc.abstractmethod
+    def _get_observation(self, state_desc):
+        pass
+
+    def feature_normalize(self, obs, mean, std, duplicate_id):
+        scaler_len = mean.shape[0]
+        assert obs.shape[0] >= scaler_len
+        obs[:scaler_len] = (obs[:scaler_len] - mean) / std
+        final_obs = []
+        for i in range(obs.shape[0]):
+            if i not in duplicate_id:
+                final_obs.append(obs[i])
+        return np.array(final_obs)
+
+    def step(self, action, **kwargs):
+        obs, r, done, info = self.env.step(action, **kwargs)
+        if info['target_changed']:
+            # reset step_fea when the target changes
+            self.step_fea = MAXTIME_LIMIT
+
+        self.step_fea -= FRAME_SKIP
+
+        self.raw_obs = copy.deepcopy(obs)
+        obs = self.get_observation(obs)
+        self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
+        return obs, r, done, info
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+        if obs is None:
+            return None
+        self.step_fea = MAXTIME_LIMIT
+        self.raw_obs = copy.deepcopy(obs)
+        obs = self.get_observation(obs)
+        self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
+        return obs
+
+
+class OfficialObs(ObsTranformerBase):
+    def __init__(self, env):
+        ObsTranformerBase.__init__(self, env)
+        data = np.load('./official_obs_scaler.npz')
+        self.mean, self.std, self.duplicate_id = data['mean'], data[
+            'std'], data['duplicate_id']
+        self.duplicate_id = self.duplicate_id.astype(np.int32).tolist()
+
+    def _get_observation(self, obs_dict):
+        res = []
+
+        res.append(obs_dict['pelvis']['height'])
+        res.append(obs_dict['pelvis']['pitch'])
+        res.append(obs_dict['pelvis']['roll'])
+        res.append(obs_dict['pelvis']['vel'][0])
+        res.append(obs_dict['pelvis']['vel'][1])
+        res.append(obs_dict['pelvis']['vel'][2])
+        res.append(obs_dict['pelvis']['vel'][3])
+        res.append(obs_dict['pelvis']['vel'][4])
+        res.append(obs_dict['pelvis']['vel'][5])
+
+        for leg in ['r_leg', 'l_leg']:
+            res += obs_dict[leg]['ground_reaction_forces']
+            res.append(obs_dict[leg]['joint']['hip_abd'])
+            res.append(obs_dict[leg]['joint']['hip'])
+            res.append(obs_dict[leg]['joint']['knee'])
+            res.append(obs_dict[leg]['joint']['ankle'])
+            res.append(obs_dict[leg]['d_joint']['hip_abd'])
+            res.append(obs_dict[leg]['d_joint']['hip'])
+            res.append(obs_dict[leg]['d_joint']['knee'])
+            res.append(obs_dict[leg]['d_joint']['ankle'])
+            for MUS in [
+                    'HAB', 'HAD', 'HFL', 'GLU', 'HAM', 'RF', 'VAS', 'BFSH',
+                    'GAS', 'SOL', 'TA'
+            ]:
+                res.append(obs_dict[leg][MUS]['f'])
+                res.append(obs_dict[leg][MUS]['l'])
+                res.append(obs_dict[leg][MUS]['v'])
+
+        res = np.array(res)
+
+        res = self.feature_normalize(
+            res, mean=self.mean, std=self.std, duplicate_id=self.duplicate_id)
+
+        remaining_time = (self.step_fea -
+                          (MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0
+        res = np.append(res, remaining_time)
+
+        # target driven
+        current_v_x = obs_dict['pelvis']['vel'][0]  # (+) forward
+        current_v_z = obs_dict['pelvis']['vel'][1]  # (+) leftward
+
+        # future vels (0m, 1m, ..., 5m)
+        for index in range(5, 11):
+            target_v_x, target_v_z = obs_dict['v_tgt_field'][0][index][
+                5], obs_dict['v_tgt_field'][1][index][5]
+
+            diff_vel_x = target_v_x - current_v_x
+            diff_vel_z = target_v_z - current_v_z
+            diff_vel = np.sqrt(target_v_x ** 2 + target_v_z ** 2) - \
np.sqrt(current_v_x ** 2 + current_v_z ** 2) + res = np.append( + res, [diff_vel_x / 5.0, diff_vel_z / 5.0, diff_vel / 5.0]) + + # current relative target theta + target_v_x, target_v_z = obs_dict['v_tgt_field'][0][5][5], obs_dict[ + 'v_tgt_field'][1][5][5] + + target_theta = math.atan2(target_v_z, target_v_x) + diff_theta = target_theta + res = np.append(res, [diff_theta / np.pi]) + + return res + + +if __name__ == '__main__': + from osim.env import L2M2019Env + + env = L2M2019Env(difficulty=3, visualize=False) + env.change_model(model='3D', difficulty=3) + env = ForwardReward(env) + env = FrameSkip(env, 4) + env = ActionScale(env) + env = OfficialObs(env) + observation = env.reset(project=True, obs_as_dict=True) + print(observation.shape) + while True: + _, _, done, _ = env.step( + env.action_space.sample(), project=True, obs_as_dict=True) + if done: + break diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/mlp_model.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/mlp_model.py new file mode 100644 index 0000000..2faadf6 --- /dev/null +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/mlp_model.py @@ -0,0 +1,181 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
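+
+# Actor and critic MLPs used by the ensemble. Each network slices the
+# observation into body features and target-velocity features, encodes them
+# in separate branches, and merges the branches before the output layers.
+# Parameter names are scoped so that bottom layers can be shared across
+# ensemble members (shared=True) while top layers remain per-member.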
+
+import numpy as np
+import parl
+from parl import layers
+from paddle import fluid
+from paddle.fluid.param_attr import ParamAttr
+
+
+class ActorModel(parl.Model):
+    def __init__(self,
+                 obs_dim,
+                 vel_obs_dim,
+                 act_dim,
+                 stage_name=None,
+                 model_id=0,
+                 shared=True):
+        super(ActorModel, self).__init__()
+        hid0_size = 800
+        hid1_size = 400
+        hid2_size = 200
+        vel_hid0_size = 200
+        vel_hid1_size = 400
+
+        self.obs_dim = obs_dim
+        self.vel_obs_dim = vel_obs_dim
+
+        # bottom layers
+        if shared:
+            scope_name = 'policy_shared'
+        else:
+            scope_name = 'policy_identity_{}'.format(model_id)
+        if stage_name is not None:
+            scope_name = '{}_{}'.format(stage_name, scope_name)
+
+        self.fc0 = layers.fc(
+            size=hid0_size,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/h0/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/h0/b'.format(scope_name)))
+        self.fc1 = layers.fc(
+            size=hid1_size,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
+        self.vel_fc0 = layers.fc(
+            size=vel_hid0_size,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
+        self.vel_fc1 = layers.fc(
+            size=vel_hid1_size,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name)))
+
+        # top layers
+        scope_name = 'policy_identity_{}'.format(model_id)
+        if stage_name is not None:
+            scope_name = '{}_{}'.format(stage_name, scope_name)
+
+        self.fc2 = layers.fc(
+            size=hid2_size,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/h2/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/h2/b'.format(scope_name)))
+        self.fc3 = layers.fc(
+            size=act_dim,
+            act='tanh',
+            param_attr=ParamAttr(name='{}/means/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/means/b'.format(scope_name)))
+
+    def predict(self, obs):
+        real_obs = layers.slice(
+            obs, axes=[1], starts=[0], ends=[self.obs_dim - self.vel_obs_dim])
+        vel_obs = layers.slice(
+            obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
+        hid0 = self.fc0(real_obs)
+        hid1 = self.fc1(hid0)
+        vel_hid0 = self.vel_fc0(vel_obs)
+        vel_hid1 = self.vel_fc1(vel_hid0)
+        concat = layers.concat([hid1, vel_hid1], axis=1)
+        hid2 = self.fc2(concat)
+        means = self.fc3(hid2)
+        return means
+
+
+class CriticModel(parl.Model):
+    def __init__(self,
+                 obs_dim,
+                 vel_obs_dim,
+                 act_dim,
+                 stage_name=None,
+                 model_id=0,
+                 shared=True):
+        super(CriticModel, self).__init__()
+        hid0_size = 800
+        hid1_size = 400
+        vel_hid0_size = 200
+        vel_hid1_size = 400
+
+        self.obs_dim = obs_dim
+        self.vel_obs_dim = vel_obs_dim
+
+        # bottom layers
+        if shared:
+            scope_name = 'critic_shared'
+        else:
+            scope_name = 'critic_identity_{}'.format(model_id)
+        if stage_name is not None:
+            scope_name = '{}_{}'.format(stage_name, scope_name)
+
+        self.fc0 = layers.fc(
+            size=hid0_size,
+            act='selu',
+            param_attr=ParamAttr(name='{}/w1/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/w1/b'.format(scope_name)))
+        self.fc1 = layers.fc(
+            size=hid1_size,
+            act='selu',
+            param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
+        self.vel_fc0 = layers.fc(
+            size=vel_hid0_size,
+            act='selu',
+            param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
+            bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
+        self.vel_fc1 = layers.fc(
+            size=vel_hid1_size,
+            act='selu',
param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name))) + self.act_fc0 = layers.fc( + size=hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/a1/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/a1/b'.format(scope_name))) + + # top layers + scope_name = 'critic_identity_{}'.format(model_id) + if stage_name is not None: + scope_name = '{}_{}'.format(stage_name, scope_name) + + self.fc2 = layers.fc( + size=hid1_size, + act='selu', + param_attr=ParamAttr(name='{}/h3/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/h3/b'.format(scope_name))) + self.fc3 = layers.fc( + size=1, + act='selu', + param_attr=ParamAttr(name='{}/value/W'.format(scope_name)), + bias_attr=ParamAttr(name='{}/value/b'.format(scope_name))) + + def predict(self, obs, action): + real_obs = layers.slice( + obs, axes=[1], starts=[0], ends=[self.obs_dim - self.vel_obs_dim]) + vel_obs = layers.slice( + obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim]) + hid0 = self.fc0(real_obs) + hid1 = self.fc1(hid0) + vel_hid0 = self.vel_fc0(vel_obs) + vel_hid1 = self.vel_fc1(vel_hid0) + a1 = self.act_fc0(action) + concat = layers.concat([hid1, a1, vel_hid1], axis=1) + hid2 = self.fc2(concat) + V = self.fc3(hid2) + V = layers.squeeze(V, axes=[1]) + return V diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/official_obs_scaler.npz b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/official_obs_scaler.npz new file mode 100644 index 0000000000000000000000000000000000000000..1099b5f2b036833d9149987b7c10e5a1ec78dbef GIT binary patch literal 2248 zcmd5;c~H}L7EeeBh>8em#ey8JgDAyPQK7JW;sJ^va;yrjN-!LPA^}2C7Q>;EdH_~O zP%bYJEQbV)CL;W}ERe{87C{aHfo$XoAXpFtH?-T?sWZFN>Hf3tJ8#~6-+Vta@B3yx z?|mHYDAYxyUrdigN?&{FTMCIZ>&o(^zn8}`<72E7mmPNRbaW$=z9Ie1z|$+38D#La zu7RbunSqh6fpj=Gji6uUn=t8peBJ2t#wKZi}+R$h59O4fmgJK2IiIk5p99+l z#n0UQB!umO-j>%>0;D{YDC=Z1;krX>oA0^=BA|QqN7+x0h^ozVXy;V#A~l7_6XzG8 z$ij6!*?a&~q;yxIcPz9wO@s)V?%`H^sW|TKKS1n_F>ugIg3?(Z7meLz=;w~bUvC~k zbyM4WaYhpKYk14~D8T&jcE8o*Lols=n!k79eH7KL*KP z%ae15cJ1L|tT?u1VC*gWbtp}w3qL^1y}rEe;$t`uw++?%j3B1fI!EMN09E$z3tf%- zC~>OkW%dk%P2IGVaet-xN*^82cH=~G zlw~10evw4L*VcTEFxAuO|$as5X=8Ev^Y_x0Nqkjgo{T% zGS_l?O?C>Qy1Fz-^Y(qLT2`4y63bAnbNXKL(LxwY*&Q`50z8;HzD74(2Kz_jRciBOUt8iM1Q zuQK)~wWG{wYgbZX3v6uetp=tI5Dbw&kNr`Go`V)-M$7NZynCF+{}md zR}DP%GZJ*A=Yz+s!P7!=dW2dB7OVK}|D4$l^Xulin(i$KHhdXsN1lKyRb_uzsv6yB z$ykPdxjNl)yv{>4bs?RWR(P${v=mF5zS!3|UWss7M1TF~V{rCU`r)1UKA4PGaYmU> z&@#+QDgSl?FOtU{N2nE8_l&xL>sb4c5&o^Q*1MA=xR4o^RCX<<^tZwA50>?bn*NcB@2> z_Ht=+-CM+P=C%|$^RVTB$xZ#*YP2sgVN+eE@cqWDX--%f7Cty{&nYj**FD|&&Hj_{ zG2?Hd6_p_RjS_v$%S!OR6z<+9(xH=HwuS^+36N!Ww(9MoDioN9Zb*4GgQbe9ugus& zSXw!;C_FLPIaiekh5~(zG8JM|TZYS02PMC>YFKE`Qyk2n!MyO>*$Jiv$hzdUEz+eJ zOSNn(_Kc3gZ5LyOaD{~xtMWS2Vi$?R+h3MKuJ94109yiwyKUCaE zrL*=v_A|CBLVxY{BWuNC7)}3u{_P|R{kLSbpzYokDJSWytI6Fngmu)H>B07L_x&%d z^)>xnEB4F(b;IhaINFgF7E%7Q6p)wvvqh3;!>_{NQ`Dy;=@*Rr*W^68r1;l>`V{nO eYyJ)Tt9-0|+9*dmMcV8Oiu~M_$2qe?h4fE7y@P)M literal 0 HcmV?d00001 diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/submit_model.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/submit_model.py new file mode 100644 index 0000000..c6b386d --- /dev/null +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/submit_model.py @@ -0,0 +1,153 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from parl import layers
+from mlp_model import ActorModel, CriticModel
+from paddle import fluid
+from parl.utils import logger
+
+VEL_OBS_DIM = 4 + 15
+OBS_DIM = 98 + VEL_OBS_DIM
+ACT_DIM = 22
+
+
+class EnsembleBaseModel(object):
+    def __init__(self,
+                 model_dirname=None,
+                 stage_name=None,
+                 ensemble_num=12,
+                 use_cuda=False):
+        self.stage_name = stage_name
+        self.ensemble_num = ensemble_num
+        self.actors = []
+        self.critics1 = []
+        self.critics2 = []
+        for i in range(ensemble_num):
+            self.actors.append(
+                ActorModel(
+                    OBS_DIM,
+                    VEL_OBS_DIM,
+                    ACT_DIM,
+                    stage_name=stage_name,
+                    model_id=i))
+            self.critics1.append(
+                CriticModel(
+                    OBS_DIM,
+                    VEL_OBS_DIM,
+                    ACT_DIM,
+                    stage_name=stage_name,
+                    model_id=i * 2))
+            self.critics2.append(
+                CriticModel(
+                    OBS_DIM,
+                    VEL_OBS_DIM,
+                    ACT_DIM,
+                    stage_name=stage_name,
+                    model_id=i * 2 + 1))
+
+        self._define_program()
+
+        self.place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        self.fluid_executor = fluid.Executor(self.place)
+        self.fluid_executor.run(self.startup_program)
+
+        if model_dirname is not None:
+            self._load_params(model_dirname)
+
+    def _load_params(self, dirname):
+        logger.info('[{}]: Loading model from {}'.format(
+            self.stage_name, dirname))
+        fluid.io.load_params(
+            executor=self.fluid_executor,
+            dirname=dirname,
+            main_program=self.ensemble_predict_program,
+            filename='model.ckpt')
+
+    def _define_program(self):
+        self.ensemble_predict_program = fluid.Program()
+        self.startup_program = fluid.Program()
+        with fluid.program_guard(self.ensemble_predict_program,
+                                 self.startup_program):
+            obs = layers.data(name='obs', shape=[OBS_DIM], dtype='float32')
+            action = self._ensemble_predict(obs)
+            self.ensemble_predict_output = [action]
+
+    def _ensemble_predict(self, obs):
+        # Each actor proposes one action for the same observation; stack the
+        # proposals into a batch of shape [ensemble_num, ACT_DIM].
+        actor_outputs = []
+        for i in range(self.ensemble_num):
+            actor_outputs.append(self.actors[i].predict(obs))
+        batch_actions = layers.concat(actor_outputs, axis=0)
+        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])
+
+        # Score every proposed action with each pair of critics, taking the
+        # elementwise min of the twin critics as a pessimistic estimate.
+        critic_outputs = []
+        for i in range(self.ensemble_num):
+            critic1_output = self.critics1[i].predict(batch_obs, batch_actions)
+            critic1_output = layers.unsqueeze(critic1_output, axes=[1])
+
+            critic2_output = self.critics2[i].predict(batch_obs, batch_actions)
+            critic2_output = layers.unsqueeze(critic2_output, axes=[1])
+
+            critic_output = layers.elementwise_min(critic1_output,
+                                                   critic2_output)
+            critic_outputs.append(critic_output)
+        score_matrix = layers.concat(critic_outputs, axis=1)
+
+        # Normalize the scores given by each critic across all actions
+        sum_critic_score = layers.reduce_sum(
+            score_matrix, dim=0, keep_dim=True)
+        sum_critic_score = layers.expand(
+            sum_critic_score, expand_times=[self.ensemble_num, 1])
+        norm_score_matrix = score_matrix / sum_critic_score
+
+        # Pick the proposed action with the highest mean normalized score
+        actions_mean_score = layers.reduce_mean(
+            norm_score_matrix, dim=1, keep_dim=True)
+        best_score_id = layers.argmax(actions_mean_score, axis=0)
+        best_score_id = layers.cast(best_score_id, dtype='int32')
+        ensemble_predict_action = layers.gather(batch_actions, best_score_id)
+        ensemble_predict_action = layers.squeeze(
ensemble_predict_action, axes=[0]) + return ensemble_predict_action + + def pred_batch(self, obs): + feed = {'obs': obs} + action = self.fluid_executor.run( + self.ensemble_predict_program, + feed=feed, + fetch_list=self.ensemble_predict_output)[0] + return action + + +class SubmitModel(object): + def __init__(self, use_cuda=False): + self.stage0_model = EnsembleBaseModel( + model_dirname='./stage0_saved_models', + stage_name='stage0', + use_cuda=use_cuda) + self.stage1_model = EnsembleBaseModel( + model_dirname='./stage1_saved_models', + stage_name='stage1', + use_cuda=use_cuda) + + def pred_batch(self, obs, target_change_times): + batch_obs = np.expand_dims(obs, axis=0).astype('float32') + if target_change_times == 0: + action = self.stage0_model.pred_batch(batch_obs) + else: + action = self.stage1_model.pred_batch(batch_obs) + return action + + +if __name__ == '__main__': + submit_model = SubmitModel() diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/test.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/test.py new file mode 100644 index 0000000..83f74ad --- /dev/null +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/final_submit/test.py @@ -0,0 +1,72 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
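+
+# Evaluation entry point: rebuilds the wrapped L2M2019Env, loads the
+# two-stage ensemble SubmitModel, and logs per-episode and mean rewards.
+# The policy switches from stage0 to stage1 after the first target change.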
+
+import argparse
+import numpy as np
+import time
+from env_wrapper import FrameSkip, ActionScale, OfficialObs, ForwardReward
+from osim.env import L2M2019Env
+from parl.utils import logger
+from submit_model import SubmitModel
+
+
+def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
+    np.random.seed(seed)
+
+    env = L2M2019Env(difficulty=3, visualize=vis)
+    env.change_model(model='3D', difficulty=3)
+    env = ForwardReward(env)
+    env = FrameSkip(env, 4)
+    env = ActionScale(env)
+    env = OfficialObs(env)
+    all_reward = []
+
+    for e in range(episode_num):
+        episode_reward = 0.0
+        observation = env.reset(project=True, obs_as_dict=True)
+        step = 0
+        target_change_times = 0
+        while True:
+            step += 1
+            action = submit_model.pred_batch(observation, target_change_times)
+            observation, reward, done, info = env.step(
+                action, project=True, obs_as_dict=True)
+            if info['target_changed']:
+                target_change_times += 1
+            episode_reward += reward
+            if done:
+                break
+        all_reward.append(episode_reward)
+        logger.info("[episode/{}] episode_reward:{} mean_reward:{}".format(
+            e, episode_reward, np.mean(all_reward)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--use_cuda', action="store_true", help='If set, run on GPU 0.')
+    parser.add_argument(
+        '--vis', action="store_true", help='If set, visualize the run.')
+    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
+    parser.add_argument(
+        '--episode_num',
+        type=int,
+        default=1,
+        help='Number of episodes to run.')
+    args = parser.parse_args()
+
+    submit_model = SubmitModel(use_cuda=args.use_cuda)
+
+    play_multi_episode(
+        submit_model,
+        episode_num=args.episode_num,
+        vis=args.vis,
+        seed=args.seed)
--
GitLab
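For quick reference, the evaluation loop in `test.py` reduces to the following minimal sketch. It mirrors `play_multi_episode` above and assumes the unpacked `stage0_saved_models`/`stage1_saved_models` directories from the README are present inside `final_submit`:

```python
# Minimal single-episode evaluation sketch, equivalent to play_multi_episode.
from env_wrapper import ActionScale, ForwardReward, FrameSkip, OfficialObs
from osim.env import L2M2019Env
from submit_model import SubmitModel

env = L2M2019Env(difficulty=3, visualize=False)
env.change_model(model='3D', difficulty=3)
# Wrapper order matters: reward shaping wraps the raw env first,
# the observation transform comes last.
env = OfficialObs(ActionScale(FrameSkip(ForwardReward(env), 4)))

model = SubmitModel(use_cuda=False)
obs = env.reset(project=True, obs_as_dict=True)
episode_reward, target_change_times, done = 0.0, 0, False
while not done:
    # stage0 policy before the first target change, stage1 afterwards
    action = model.pred_batch(obs, target_change_times)
    obs, reward, done, info = env.step(action, project=True, obs_as_dict=True)
    target_change_times += int(info['target_changed'])
    episode_reward += reward
print(episode_reward)
```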