Commit 7c406386 authored by Hongsheng Zeng, committed by Bo Zhou

Final submitted models of NeurIPS2019 challenge (#168)

* final submit models of NeurIPS2019 challenge

* update readme

* fix yapf

* refine comment
Parent 51fd6169
## Dependencies
- python3.6
- [paddlepaddle>=1.5.2](https://github.com/PaddlePaddle/Paddle)
- [parl>=1.2](https://github.com/PaddlePaddle/PARL)
- [osim-rl==3.0.11](https://github.com/stanfordnmbl/osim-rl)
## Part 1: Final submitted model
### Test
- How to Run
1. Enter the sub-folder `final_submit`
2. Download the model file from an online storage service: [Baidu Pan](https://pan.baidu.com/s/12LIPspckCT8-Q5U1QX69Fg) (password: `b5ck`) or [Google Drive](https://drive.google.com/file/d/1jJtOcOVJ6auz3s-TyWgUJvofPXI94yxy/view?usp=sharing)
3. Unpack the file:
`tar zxvf saved_models.tar.gz`
4. Launch the test script:
`python test.py`
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import copy
import gym
import math
import numpy as np
from collections import OrderedDict
from osim.env import L2M2019Env
from parl.utils import logger
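# Wrapper-level globals: MAXTIME_LIMIT overrides the default episode length of
# L2M2019Env; FRAME_SKIP is filled in by the FrameSkip wrapper and reused by the
# observation wrapper; FALL_PENALTY is added to the shaping reward when the
# agent falls before the time limit.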
MAXTIME_LIMIT = 2500
L2M2019Env.time_limit = MAXTIME_LIMIT
FRAME_SKIP = None
FALL_PENALTY = 0
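# Rescales policy actions from [-1, 1] to the [0, 1] muscle-excitation range expected by the env.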
class ActionScale(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
def step(self, action, **kwargs):
action = (np.copy(action) + 1.0) * 0.5
action = np.clip(action, 0.0, 1.0)
return self.env.step(action, **kwargs)
def reset(self, **kwargs):
return self.env.reset(**kwargs)
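# Repeats each action for k env steps, accumulating rewards and merging per-step info;
# breaks early when the episode ends or the target velocity changes.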
class FrameSkip(gym.Wrapper):
def __init__(self, env, k):
gym.Wrapper.__init__(self, env)
self.frame_skip = k
global FRAME_SKIP
FRAME_SKIP = k
self.frame_count = 0
def step(self, action, **kwargs):
r = 0.0
merge_info = {}
for k in range(self.frame_skip):
self.frame_count += 1
obs, reward, done, info = self.env.step(action, **kwargs)
r += reward
for key in info.keys():
if 'reward' in key:
# to ensure that we don't ignore any other reward;
# if a new reward is added, handle its logic here
assert (key == 'shaping_reward') or (
key == 'env_reward') or (key == 'x_offset_reward')
merge_info[key] = merge_info.get(key, 0.0) + info[key]
else:
merge_info[key] = info[key]
if info['target_changed']:
logger.warn("[FrameSkip] early break since target was changed")
break
if done:
break
merge_info['frame_count'] = self.frame_count
return obs, r, done, merge_info
def reset(self, **kwargs):
self.frame_count = 0
return self.env.reset(**kwargs)
class RewardShaping(gym.Wrapper):
""" A wrapper for reward shaping, note this wrapper must be the first wrapper """
def __init__(self, env):
logger.info("[RewardShaping]type:{}".format(type(env)))
self.step_count = 0
self.pre_state_desc = None
self.last_target_vel = None
self.last_target_change_step = 0
gym.Wrapper.__init__(self, env)
@abc.abstractmethod
def reward_shaping(self, state_desc, reward, done, action):
"""define your own reward computation function
Args:
state_desc(dict): state description for current model
reward(scalar): generic reward generated by env
done(bool): generic done flag generated by env
action(np.ndarray): action executed in the current step
"""
pass
def step(self, action, **kwargs):
self.step_count += 1
obs, r, done, info = self.env.step(action, **kwargs)
info = self.reward_shaping(obs, r, done, action)
target_vel = np.linalg.norm(
[obs['v_tgt_field'][0][5][5], obs['v_tgt_field'][1][5][5]])
info['target_changed'] = False
if self.last_target_vel is not None:
if np.abs(target_vel - self.last_target_vel) > 0.2:
self.last_target_change_step = self.step_count
info['target_changed'] = True
info['last_target_change_step'] = self.last_target_change_step
self.last_target_vel = target_vel
assert 'shaping_reward' in info
timeout = False
if self.step_count >= MAXTIME_LIMIT:
timeout = True
if done and not timeout:
# penalty for falling down
info['shaping_reward'] += FALL_PENALTY
info['timeout'] = timeout
self.pre_state_desc = obs
return obs, r, done, info
def reset(self, **kwargs):
self.step_count = 0
self.last_target_vel = None
self.last_target_change_step = 0
obs = self.env.reset(**kwargs)
self.pre_state_desc = obs
return obs
class ForwardReward(RewardShaping):
""" A reward shaping wraper"""
def __init__(self, env):
RewardShaping.__init__(self, env)
def reward_shaping(self, state_desc, r2_reward, done, action):
info = {'shaping_reward': r2_reward}
return info
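# Base observation wrapper: converts the raw state dict into a flat, normalized feature
# vector and tracks step_fea, a countdown used for the remaining-time feature.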
class ObsTranformerBase(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.step_fea = MAXTIME_LIMIT
self.raw_obs = None
global FRAME_SKIP
self.frame_skip = int(FRAME_SKIP)
def get_observation(self, state_desc):
obs = self._get_observation(state_desc)
return obs
@abc.abstractmethod
def _get_observation(self, state_desc):
pass
def feature_normalize(self, obs, mean, std, duplicate_id):
scaler_len = mean.shape[0]
assert obs.shape[0] >= scaler_len
obs[:scaler_len] = (obs[:scaler_len] - mean) / std
final_obs = []
for i in range(obs.shape[0]):
if i not in duplicate_id:
final_obs.append(obs[i])
return np.array(final_obs)
def step(self, action, **kwargs):
obs, r, done, info = self.env.step(action, **kwargs)
if info['target_changed']:
# reset step_fea when the target changes
self.step_fea = MAXTIME_LIMIT
self.step_fea -= FRAME_SKIP
self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs, r, done, info
def reset(self, **kwargs):
obs = self.env.reset(**kwargs)
if obs is None:
return None
self.step_fea = MAXTIME_LIMIT
self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs
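# Official observation: pelvis state, per-leg ground reaction forces, joints and muscles,
# a remaining-time feature, and target-velocity differences taken from the v_tgt_field grid.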
class OfficialObs(ObsTranformerBase):
def __init__(self, env):
ObsTranformerBase.__init__(self, env)
data = np.load('./official_obs_scaler.npz')
self.mean, self.std, self.duplicate_id = data['mean'], data[
'std'], data['duplicate_id']
self.duplicate_id = self.duplicate_id.astype(np.int32).tolist()
def _get_observation(self, obs_dict):
res = []
res.append(obs_dict['pelvis']['height'])
res.append(obs_dict['pelvis']['pitch'])
res.append(obs_dict['pelvis']['roll'])
res.append(obs_dict['pelvis']['vel'][0])
res.append(obs_dict['pelvis']['vel'][1])
res.append(obs_dict['pelvis']['vel'][2])
res.append(obs_dict['pelvis']['vel'][3])
res.append(obs_dict['pelvis']['vel'][4])
res.append(obs_dict['pelvis']['vel'][5])
for leg in ['r_leg', 'l_leg']:
res += obs_dict[leg]['ground_reaction_forces']
res.append(obs_dict[leg]['joint']['hip_abd'])
res.append(obs_dict[leg]['joint']['hip'])
res.append(obs_dict[leg]['joint']['knee'])
res.append(obs_dict[leg]['joint']['ankle'])
res.append(obs_dict[leg]['d_joint']['hip_abd'])
res.append(obs_dict[leg]['d_joint']['hip'])
res.append(obs_dict[leg]['d_joint']['knee'])
res.append(obs_dict[leg]['d_joint']['ankle'])
for MUS in [
'HAB', 'HAD', 'HFL', 'GLU', 'HAM', 'RF', 'VAS', 'BFSH',
'GAS', 'SOL', 'TA'
]:
res.append(obs_dict[leg][MUS]['f'])
res.append(obs_dict[leg][MUS]['l'])
res.append(obs_dict[leg][MUS]['v'])
res = np.array(res)
res = self.feature_normalize(
res, mean=self.mean, std=self.std, duplicate_id=self.duplicate_id)
remaining_time = (self.step_fea -
(MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0
res = np.append(res, remaining_time)
# target driven
current_v_x = obs_dict['pelvis']['vel'][0] # (+) forward
current_v_z = obs_dict['pelvis']['vel'][1] # (+) leftward
# future vels (0m, 1m, ..., 5m)
for index in range(5, 11):
target_v_x, target_v_z = obs_dict['v_tgt_field'][0][index][
5], obs_dict['v_tgt_field'][1][index][5]
diff_vel_x = target_v_x - current_v_x
diff_vel_z = target_v_z - current_v_z
diff_vel = np.sqrt(target_v_x ** 2 + target_v_z ** 2) - \
np.sqrt(current_v_x ** 2 + current_v_z ** 2)
res = np.append(
res, [diff_vel_x / 5.0, diff_vel_z / 5.0, diff_vel / 5.0])
# current relative target theta
target_v_x, target_v_z = obs_dict['v_tgt_field'][0][5][5], obs_dict[
'v_tgt_field'][1][5][5]
target_theta = math.atan2(target_v_z, target_v_x)
diff_theta = target_theta
res = np.append(res, [diff_theta / np.pi])
return res
if __name__ == '__main__':
from osim.env import L2M2019Env
env = L2M2019Env(difficulty=3, visualize=False)
env.change_model(model='3D', difficulty=3)
env = ForwardReward(env)
env = FrameSkip(env, 4)
env = ActionScale(env)
env = OfficialObs(env)
observation = env.reset(project=True, obs_as_dict=True)
print(observation.shape)
while True:
_, _, done, _ = env.step(
env.action_space.sample(), project=True, obs_as_dict=True)
if done:
break
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
from parl import layers
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
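# Actor: the observation is split into body features and target-velocity features, each
# processed by its own tanh branch; the branches are concatenated and mapped to per-muscle
# action means in [-1, 1]. shared/model_id/stage_name only control parameter scope names,
# so the bottom layers can share parameters across ensemble members.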
class ActorModel(parl.Model):
def __init__(self,
obs_dim,
vel_obs_dim,
act_dim,
stage_name=None,
model_id=0,
shared=True):
super(ActorModel, self).__init__()
hid0_size = 800
hid1_size = 400
hid2_size = 200
vel_hid0_size = 200
vel_hid1_size = 400
self.obs_dim = obs_dim
self.vel_obs_dim = vel_obs_dim
# bottom layers
if shared:
scope_name = 'policy_shared'
else:
scope_name = 'policy_identity_{}'.format(model_id)
if stage_name is not None:
scope_name = '{}_{}'.format(stage_name, scope_name)
self.fc0 = layers.fc(
size=hid0_size,
act='tanh',
param_attr=ParamAttr(name='{}/h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h0/b'.format(scope_name)))
self.fc1 = layers.fc(
size=hid1_size,
act='tanh',
param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
self.vel_fc0 = layers.fc(
size=vel_hid0_size,
act='tanh',
param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
self.vel_fc1 = layers.fc(
size=vel_hid1_size,
act='tanh',
param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name)))
# top layers
scope_name = 'policy_identity_{}'.format(model_id)
if stage_name is not None:
scope_name = '{}_{}'.format(stage_name, scope_name)
self.fc2 = layers.fc(
size=hid2_size,
act='tanh',
param_attr=ParamAttr(name='{}/h2/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h2/b'.format(scope_name)))
self.fc3 = layers.fc(
size=act_dim,
act='tanh',
param_attr=ParamAttr(name='{}/means/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/means/b'.format(scope_name)))
def predict(self, obs):
real_obs = layers.slice(
obs, axes=[1], starts=[0], ends=[self.obs_dim - self.vel_obs_dim])
vel_obs = layers.slice(
obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
hid0 = self.fc0(real_obs)
hid1 = self.fc1(hid0)
vel_hid0 = self.vel_fc0(vel_obs)
vel_hid1 = self.vel_fc1(vel_hid0)
concat = layers.concat([hid1, vel_hid1], axis=1)
hid2 = self.fc2(concat)
means = self.fc3(hid2)
return means
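# Critic: separate selu branches for body features, target-velocity features, and the action;
# their outputs are concatenated and reduced to a scalar Q value (squeezed to shape [batch]).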
class CriticModel(parl.Model):
def __init__(self,
obs_dim,
vel_obs_dim,
act_dim,
stage_name=None,
model_id=0,
shared=True):
super(CriticModel, self).__init__()
hid0_size = 800
hid1_size = 400
vel_hid0_size = 200
vel_hid1_size = 400
self.obs_dim = obs_dim
self.vel_obs_dim = vel_obs_dim
# bottom layers
if shared:
scope_name = 'critic_shared'
else:
scope_name = 'critic_identity_{}'.format(model_id)
if stage_name is not None:
scope_name = '{}_{}'.format(stage_name, scope_name)
self.fc0 = layers.fc(
size=hid0_size,
act='selu',
param_attr=ParamAttr(name='{}/w1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/w1/b'.format(scope_name)))
self.fc1 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
self.vel_fc0 = layers.fc(
size=vel_hid0_size,
act='selu',
param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
self.vel_fc1 = layers.fc(
size=vel_hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name)))
self.act_fc0 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/a1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/a1/b'.format(scope_name)))
# top layers
scope_name = 'critic_identity_{}'.format(model_id)
if stage_name is not None:
scope_name = '{}_{}'.format(stage_name, scope_name)
self.fc2 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/h3/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h3/b'.format(scope_name)))
self.fc3 = layers.fc(
size=1,
act='selu',
param_attr=ParamAttr(name='{}/value/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/value/b'.format(scope_name)))
def predict(self, obs, action):
real_obs = layers.slice(
obs, axes=[1], starts=[0], ends=[self.obs_dim - self.vel_obs_dim])
vel_obs = layers.slice(
obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
hid0 = self.fc0(real_obs)
hid1 = self.fc1(hid0)
vel_hid0 = self.vel_fc0(vel_obs)
vel_hid1 = self.vel_fc1(vel_hid0)
a1 = self.act_fc0(action)
concat = layers.concat([hid1, a1, vel_hid1], axis=1)
hid2 = self.fc2(concat)
V = self.fc3(hid2)
V = layers.squeeze(V, axes=[1])
return V
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from parl import layers
from mlp_model import ActorModel, CriticModel
from paddle import fluid
from parl.utils import logger
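# Observation layout: the last VEL_OBS_DIM entries are the target-velocity features produced
# by OfficialObs (velocity differences at 0-5 m ahead plus the relative target heading), the
# rest are body features; ACT_DIM is the number of muscle excitations (11 muscles per leg).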
VEL_OBS_DIM = 4 + 15
OBS_DIM = 98 + VEL_OBS_DIM
ACT_DIM = 22
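# Holds ensemble_num actors, each paired with twin critics. At inference every actor proposes
# an action, each action is scored by the minimum of its twin critics, scores are normalized
# per critic, and the action with the best mean normalized score is returned.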
class EnsembleBaseModel(object):
def __init__(self,
model_dirname=None,
stage_name=None,
ensemble_num=12,
use_cuda=False):
self.stage_name = stage_name
self.ensemble_num = ensemble_num
self.actors = []
self.critics1 = []
self.critics2 = []
for i in range(ensemble_num):
self.actors.append(
ActorModel(
OBS_DIM,
VEL_OBS_DIM,
ACT_DIM,
stage_name=stage_name,
model_id=i))
self.critics1.append(
CriticModel(
OBS_DIM,
VEL_OBS_DIM,
ACT_DIM,
stage_name=stage_name,
model_id=i * 2))
self.critics2.append(
CriticModel(
OBS_DIM,
VEL_OBS_DIM,
ACT_DIM,
stage_name=stage_name,
model_id=i * 2 + 1))
self._define_program()
self.place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
self.fluid_executor = fluid.Executor(self.place)
self.fluid_executor.run(self.startup_program)
if model_dirname is not None:
self._load_params(model_dirname)
def _load_params(self, dirname):
logger.info('[{}]: Loading model from {}'.format(
self.stage_name, dirname))
fluid.io.load_params(
executor=self.fluid_executor,
dirname=dirname,
main_program=self.ensemble_predict_program,
filename='model.ckpt')
def _define_program(self):
self.ensemble_predict_program = fluid.Program()
self.startup_program = fluid.Program()
with fluid.program_guard(self.ensemble_predict_program,
self.startup_program):
obs = layers.data(name='obs', shape=[OBS_DIM], dtype='float32')
action = self._ensemble_predict(obs)
self.ensemble_predict_output = [action]
def _ensemble_predict(self, obs):
actor_outputs = []
for i in range(self.ensemble_num):
actor_outputs.append(self.actors[i].predict(obs))
batch_actions = layers.concat(actor_outputs, axis=0)
batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])
critic_outputs = []
for i in range(self.ensemble_num):
critic1_output = self.critics1[i].predict(batch_obs, batch_actions)
critic1_output = layers.unsqueeze(critic1_output, axes=[1])
critic2_output = self.critics2[i].predict(batch_obs, batch_actions)
critic2_output = layers.unsqueeze(critic2_output, axes=[1])
critic_output = layers.elementwise_min(critic1_output,
critic2_output)
critic_outputs.append(critic_output)
score_matrix = layers.concat(critic_outputs, axis=1)
# Normalize scores given by each critic
sum_critic_score = layers.reduce_sum(
score_matrix, dim=0, keep_dim=True)
sum_critic_score = layers.expand(
sum_critic_score, expand_times=[self.ensemble_num, 1])
norm_score_matrix = score_matrix / sum_critic_score
actions_mean_score = layers.reduce_mean(
norm_score_matrix, dim=1, keep_dim=True)
best_score_id = layers.argmax(actions_mean_score, axis=0)
best_score_id = layers.cast(best_score_id, dtype='int32')
ensemble_predict_action = layers.gather(batch_actions, best_score_id)
ensemble_predict_action = layers.squeeze(
ensemble_predict_action, axes=[0])
return ensemble_predict_action
def pred_batch(self, obs):
feed = {'obs': obs}
action = self.fluid_executor.run(
self.ensemble_predict_program,
feed=feed,
fetch_list=self.ensemble_predict_output)[0]
return action
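# Two-stage policy: the stage0 ensemble acts until the first target change, the stage1
# ensemble afterwards.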
class SubmitModel(object):
def __init__(self, use_cuda=False):
self.stage0_model = EnsembleBaseModel(
model_dirname='./stage0_saved_models',
stage_name='stage0',
use_cuda=use_cuda)
self.stage1_model = EnsembleBaseModel(
model_dirname='./stage1_saved_models',
stage_name='stage1',
use_cuda=use_cuda)
def pred_batch(self, obs, target_change_times):
batch_obs = np.expand_dims(obs, axis=0).astype('float32')
if target_change_times == 0:
action = self.stage0_model.pred_batch(batch_obs)
else:
action = self.stage1_model.pred_batch(batch_obs)
return action
if __name__ == '__main__':
submit_model = SubmitModel()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import time
from env_wrapper import FrameSkip, ActionScale, OfficialObs, ForwardReward
from osim.env import L2M2019Env
from parl.utils import logger
from submit_model import SubmitModel
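# Evaluation entry point: builds the wrapped L2M2019Env, rolls out episodes with the
# two-stage SubmitModel, and logs per-episode and mean rewards.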
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
np.random.seed(seed)
env = L2M2019Env(difficulty=3, visualize=vis)
env.change_model(model='3D', difficulty=3)
env = ForwardReward(env)
env = FrameSkip(env, 4)
env = ActionScale(env)
env = OfficialObs(env)
all_reward = []
for e in range(episode_num):
episode_reward = 0.0
observation = env.reset(project=True, obs_as_dict=True)
step = 0
target_change_times = 0
while True:
step += 1
action = submit_model.pred_batch(observation, target_change_times)
observation, reward, done, info = env.step(
action, project=True, obs_as_dict=True)
if info['target_changed']:
target_change_times += 1
episode_reward += reward
if done:
break
all_reward.append(episode_reward)
logger.info("[episode/{}] episode_reward:{} mean_reward:{}".format(\
e, episode_reward, np.mean(all_reward)))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--use_cuda', action="store_true", help='If set, will run in gpu 0')
parser.add_argument(
'--vis', action="store_true", help='If set, will visualize.')
parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument(
'--episode_num', type=int, default=1, help='Episode number to run.')
args = parser.parse_args()
submit_model = SubmitModel(use_cuda=args.use_cuda)
play_multi_episode(
submit_model,
episode_num=args.episode_num,
vis=args.vis,
seed=args.seed)