Commit cdb50056, authored by Hongsheng Zeng, committed by Bo Zhou

NeurIPS2018-AI-for-Prosthetics-Challenge training code (#40)

* NeurIPS2018-AI-for-Prosthetics-Challenge training code

* remove model_zoo, provide download link

* remove model_zoo, provide download link

* add restore_from_one_head api, refine README, fix logger bug

* fix test bug

* fix rpm bug, refine ddpg train script

* fix rpm bug, refine Readme
Parent f8de849b
...@@ -61,13 +61,13 @@ three steps to build an agent
model = AtariModel(img_shape=(32, 32), action_dim=4)
algorithm = DQN(model)
-agent = AtariAgent(aglrotihm)
+agent = AtariAgent(algorithm)
```
# Install:
### Dependencies
- Python 2.7 or 3.5+.
-- PaddlePaddle >=1.0 (We try to make our repository always compatible with newest version PaddlePaddle)
+- PaddlePaddle >=1.2.1 (We try to make our repository always compatible with newest version PaddlePaddle)
```
...
...@@ -19,12 +19,10 @@ import time
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
from parl.algorithms import DDPG
-from parl.utils import logger, action_mapping
-from replay_memory import ReplayMemory
+from parl.utils import logger, action_mapping, ReplayMemory
MAX_EPISODES = 5000
-TEST_EVERY_EPISODES = 50
+TEST_EVERY_EPISODES = 20
-MAX_STEPS_EACH_EPISODE = 1000
ACTOR_LR = 1e-4
CRITIC_LR = 1e-3
GAMMA = 0.99
...@@ -39,7 +37,7 @@ ENV_SEED = 1
def run_train_episode(env, agent, rpm):
obs = env.reset()
total_reward = 0
-for j in range(MAX_STEPS_EACH_EPISODE):
+while True:
batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
...@@ -70,7 +68,7 @@ def run_train_episode(env, agent, rpm):
def run_evaluate_episode(env, agent):
obs = env.reset()
total_reward = 0
-for j in range(MAX_STEPS_EACH_EPISODE):
+while True:
batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
...@@ -112,7 +110,8 @@ def main():
logger.info('Episode: {} Reward: {}'.format(i, train_reward))
if (i + 1) % TEST_EVERY_EPISODES == 0:
evaluate_reward = run_evaluate_episode(env, agent)
-logger.info('Evaluate Reward: {}'.format(evaluate_reward))
+logger.info('Episode {}, Evaluate reward: {}'.format(
+i, evaluate_reward))
if __name__ == '__main__':
...
...@@ -64,6 +64,7 @@ def run_train_episode(env, agent, rpm):
state = next_state
if isOver:
break
+if all_cost:
logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
total_reward, np.mean(all_cost)))
return total_reward, step
...@@ -122,7 +123,7 @@ def main():
if total_step // args.test_every_steps == test_flag:
pbar.write("testing")
eval_rewards = []
-for _ in tqdm(range(30), desc='eval agent'):
+for _ in tqdm(range(3), desc='eval agent'):
eval_reward = run_evaluate_episode(test_env, agent)
eval_rewards.append(eval_reward)
test_flag += 1
...
# The Winning Solution for the NeurIPS 2018: AI for Prosthetics Challenge
This folder contains the code used to train the winning models for the [NeurIPS 2018: AI for Prosthetics Challenge](https://www.crowdai.org/challenges/neurips-2018-ai-for-prosthetics-challenge), along with the resulting models.

## Dependencies
- python3.6
- [paddlepaddle>=1.2.1](https://github.com/PaddlePaddle/Paddle)
- [osim-rl](https://github.com/stanfordnmbl/osim-rl)
- [grpcio==1.12.1](https://grpc.io/docs/quickstart/python.html)
- tqdm
- tensorflow (to use tensorboard)

## Result
| Avg reward of all episodes | Avg reward of complete episodes | Falldown % | Evaluate episodes |
|----------------------------|---------------------------------|------------|-------------------|
| 9968.5404                  | 9980.3952                       | 0.0026     | 500 CPUs * 10 episodes |
## Test our submitted models
- How to run

```bash
# cd into final_submit/
# download the submitted models file (saved_model.tar.gz), then:
tar zxvf saved_model.tar.gz
python test.py
```
> You can download the models file from [Baidu Pan](https://pan.baidu.com/s/1NN1auY2eDblGzUiqR8Bfqw) or [Google Drive](https://drive.google.com/open?id=1DQHrwtXzgFbl9dE7jGOe9ZbY0G9-qfq3)
## Start training
### Stage I: Curriculum learning
#### 1. Run Fastest
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 1
# client (Suggest: 200+ clients)
python simulator_client.py --port [PORT] --ip [IP] --reward_type RunFastest
```
#### 2. target speed 3.0 m/s
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \
--restore_model_path [RunFastest model]
# client (Suggest: 200+ clients)
python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 3.0 \
--act_penalty_lowerbound 1.5
```
#### 3. target speed 2.0 m/s

```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \
--restore_model_path [FixedTargetSpeed 3.0m/s model]

# client (Suggest: 200+ clients)
python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 2.0 \
--act_penalty_lowerbound 0.75
```
#### 4. target speed 1.25 m/s
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 1000 \
--restore_model_path [FixedTargetSpeed 2.0m/s model]
# client (Suggest: 200+ clients)
python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 1.25 \
--act_penalty_lowerbound 0.6
```
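For reference, the `FixedTargetSpeed` shaping reward that the `--target_v` and `--act_penalty_lowerbound` flags above feed into (see `FixedTargetSpeedReward.reward_shaping` in `env_wrapper.py` further down this page) boils down to the following minimal sketch; `pelvis_vel` stands in for `state_desc["body_vel"]["pelvis"]`, and the coefficient arguments correspond to the client flags:

```python
import numpy as np

def fixed_target_speed_shaping(pelvis_vel, action, target_v,
                               act_penalty_lowerbound,
                               act_penalty_coeff, vel_penalty_coeff):
    # condensed from FixedTargetSpeedReward.reward_shaping in env_wrapper.py
    alive_bonus = 36  # constant reward for not falling
    # squared gap between the pelvis velocity and the fixed target velocity
    # (the lateral target velocity is 0)
    vel_penalty = (pelvis_vel[0] - target_v) ** 2 + pelvis_vel[2] ** 2
    # the action-norm penalty is floored at the configured lower bound
    act_penalty = max(act_penalty_lowerbound, np.linalg.norm(action))
    return (alive_bonus
            - vel_penalty * vel_penalty_coeff
            - act_penalty * act_penalty_coeff)

# example: zero action at exactly the target speed still pays the floored
# action penalty (36 - 0 - 1.5 * 5.0 = 28.5)
print(fixed_target_speed_shaping([3.0, 0.0, 0.0], np.zeros(19), 3.0, 1.5, 5.0, 1.0))
```

Note that the action penalty is floored at `act_penalty_lowerbound`, so action norms below that bound all incur the same flat cost.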
### Stage II: Round2
> You can download resulting 1.25m/s model in Stage I from [Baidu Pan](https://pan.baidu.com/s/1PVDgIe3NuLB-4qI5iSxtKA) or [Google Drive](https://drive.google.com/open?id=1jWzs3wvq7_ierIwGZXc-M92bv1X5eqs7)
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 12 --warm_start_batchs 1000 \
--restore_model_path [FixedTargetSpeed 1.25m/s] --restore_from_one_head
# client (Suggest: 100+ clients)
python simulator_client.py --port [PORT] --ip [IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \
--act_penalty_coeff 7.0 --vel_penalty_coeff 20.0 --discrete_data --stage 3
```
> To get a higher score, you need to train a separate model for every stage (i.e., each time target_v changes), while keeping the trained model of the previous stage fixed. That step is omitted here.
### Test trained model
```bash
python test.py --restore_model_path [MODEL_PATH] --ensemble_num [ENSEMBLE_NUM]
```
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def get_server_args():
parser = argparse.ArgumentParser()
parser.add_argument('--port', required=True, type=int, help='server port')
parser.add_argument(
'--logdir', type=str, help='directory to save model/tensorboard data')
parser.add_argument(
'--restore_model_path',
type=str,
help='restore model path for warm start')
parser.add_argument(
'--restore_from_one_head',
action="store_true",
help=
'If set, restore the model from a one-head model. If ensemble_num > 1, parameters of model 0 will be assigned to the other models.'
)
parser.add_argument(
'--restore_rpm_path', type=str, help='restore rpm path for warm start')
parser.add_argument(
'--ensemble_num',
type=int,
required=True,
help='number of models to ensemble')
parser.add_argument(
'--warm_start_batchs',
type=int,
default=100,
help='how many batches of data to collect for warm start')
args = parser.parse_args()
return args
def get_client_args():
parser = argparse.ArgumentParser()
parser.add_argument(
'--stage',
default=0,
type=int,
help='''
stage number, which decides how many times the target velocity changes.
Eg: stage=0 will keep target_v at 1.25m/s;
stage=3 will change the target velocity 3 times, just like the Round2 env.'''
)
parser.add_argument('--ident', type=int, required=False, help='worker id')
parser.add_argument('--ip', type=str, required=True, help='server ip')
parser.add_argument('--port', type=int, required=True, help='server port')
parser.add_argument(
'--target_v', type=float, help='target velocity for training')
parser.add_argument(
'--act_penalty_lowerbound',
type=float,
help='lower bound of action l2 norm penalty')
parser.add_argument(
'--act_penalty_coeff',
type=float,
default=5.0,
help='coefficient of action l2 norm penalty')
parser.add_argument(
'--vel_penalty_coeff',
type=float,
default=1.0,
help='coefficient of velocity gap penalty')
parser.add_argument(
'--discrete_data',
action="store_true",
help=
'if set, discretize the target velocity in the last stage (args.stage) to make the target velocity more uniform.'
)
parser.add_argument(
'--discrete_bin',
type=int,
default=10,
help='how many intervals to discretize the last-stage target velocity into')
parser.add_argument(
'--reward_type',
type=str,
help=
"Choose reward type, 'RunFastest' or 'FixedTargetSpeed' or 'Round2'")
parser.add_argument(
'--debug',
action="store_true",
help='if set, will print debug information')
args = parser.parse_args()
assert args.reward_type in ['RunFastest', 'FixedTargetSpeed', 'Round2']
return args
...@@ -13,41 +13,218 @@
# limitations under the License.
import abc
+import copy
import gym
import math
import numpy as np
+import random
from collections import OrderedDict
from osim.env import ProstheticsEnv
from parl.utils import logger
+from tqdm import tqdm
MAXTIME_LIMIT = 1000
ProstheticsEnv.time_limit = MAXTIME_LIMIT
FRAME_SKIP = None
-FALL_PENALTY = 0
-class RemoteEnv(gym.Wrapper):
-def __init__(self, env):
-env.metadata = {}
-env.action_space = None
-env.observation_space = None
-env.reward_range = None
+class CustomR2Env(gym.Wrapper):
+"""Customize the target trajectory here; it currently supports 3 modes:
+1. fixed_speed, e.g. reset(..., fixed_speed=1.25)
+2. stage, e.g. reset(..., stage=1)
+3. boundary, e.g. reset(..., boundary=True)
+"""
def __init__(self,
env,
time_limit=MAXTIME_LIMIT,
discrete_data=False,
discrete_bin=10):
logger.info("[CustomR2Env]type:{}, time_limit:{}".format(
type(env), time_limit))
assert isinstance(env, ProstheticsEnv), type(env)
gym.Wrapper.__init__(self, env)
-self.remote_env = env
-self.first_time = True
-def step(self, act):
-return self.remote_env.env_step(act.tolist())
-def reset(self):
-if self.first_time:
-self.first_time = False
-return self.remote_env.env_create()
-obs = self.remote_env.env_reset()
-if not obs:
-return None
-return obs
+self.env.time_limit = time_limit
+self.env.spec.timestep_limit = time_limit
+self.time_limit = time_limit
+# boundary flag
+self._generate_boundary_target_flag = True
+self.discrete_data = discrete_data
+self.discrete_bin = discrete_bin
+def rect(self, row):
+r = row[0]
+theta = row[1]
x = r * math.cos(theta)
y = 0
z = r * math.sin(theta)
return np.array([x, y, z])
def _generate_boundary_table(self, ):
possible_combine = [(math.pi / 8, 0.5), (math.pi / 8, -0.5),
(-math.pi / 8, 0.5), (-math.pi / 8, -0.5)]
self._boundary_table = []
for a in possible_combine:
for b in possible_combine:
for c in possible_combine:
self._boundary_table.append([a, b, c])
assert len(self._boundary_table) == 64
def generate_boundary_target(self, poisson_lambda=300):
if self._generate_boundary_target_flag == True:
self._generate_boundary_target_flag = False
self._generate_boundary_table()
self._boundary_index = 0
nsteps = self.time_limit + 1
velocity = np.zeros(nsteps)
heading = np.zeros(nsteps)
velocity[0] = 1.25
heading[0] = 0
trajectory = self._boundary_table[self._boundary_index]
change = np.cumsum(np.random.poisson(poisson_lambda, 10))
target_change_times = 0
for i in range(1, nsteps):
velocity[i] = velocity[i - 1]
heading[i] = heading[i - 1]
if i in change:
velocity[i] += trajectory[target_change_times][1]
heading[i] += trajectory[target_change_times][0]
# trajectory has length 3, so target_change_times should not be larger than 2
target_change_times = min(2, target_change_times + 1)
self._boundary_index = (self._boundary_index + 1) % 64
def _generate_target_vel(self, stage, change_num):
target_vels = None
if stage == 0:
target_vels = [1.25]
elif stage == 1:
assert change_num >= 1
interval = 1.0 / self.discrete_bin
discrete_id = np.random.randint(self.discrete_bin)
min_vel = 0.75 + discrete_id * interval
max_vel = 0.75 + (discrete_id + 1) * interval
target_vels = [1.25]
for i in range(change_num):
if i == 0:
target_vels.append(random.uniform(min_vel, max_vel))
else:
target_vels.append(target_vels[-1] +
random.uniform(-0.5, 0.5))
elif stage == 2:
assert change_num >= 2
interval = 2.0 / self.discrete_bin
discrete_id = np.random.randint(self.discrete_bin)
min_vel = 0.25 + discrete_id * interval
max_vel = 0.25 + (discrete_id + 1) * interval
while True:
target_vels = [1.25]
for i in range(change_num):
target_vels.append(target_vels[-1] +
random.uniform(-0.5, 0.5))
if target_vels[2] >= min_vel and target_vels[2] <= max_vel:
break
elif stage == 3:
assert change_num >= 3
interval = 3.0 / self.discrete_bin
discrete_id = np.random.randint(self.discrete_bin)
min_vel = -0.25 + discrete_id * interval
max_vel = -0.25 + (discrete_id + 1) * interval
while True:
target_vels = [1.25]
for i in range(change_num):
target_vels.append(target_vels[-1] +
random.uniform(-0.5, 0.5))
if target_vels[3] >= min_vel and target_vels[3] <= max_vel:
break
else:
raise NotImplemented
logger.info('[CustomR2Env] stage: {}, target_vels: {}'.format(
stage, target_vels))
return target_vels
def generate_stage_targets(self, poisson_lambda=300, stage=None):
nsteps = self.time_limit + 1
velocity = np.zeros(nsteps)
heading = np.zeros(nsteps)
velocity[0] = 1.25
heading[0] = 0
change = np.cumsum(np.random.poisson(poisson_lambda, 10))
if stage == 0:
change = []
elif stage == 1:
change = change[:1]
elif stage == 2:
change = change[:2]
elif stage == 3:
if change[3] <= 1000:
change = change[:4]
else:
change = change[:3]
else:
raise NotImplemented
if self.discrete_data:
target_vels = self._generate_target_vel(
stage=stage, change_num=len(change))
change_cnt = 0
for i in range(1, nsteps):
velocity[i] = velocity[i - 1]
heading[i] = heading[i - 1]
if i in change:
change_cnt += 1
if self.discrete_data:
velocity[i] = target_vels[change_cnt]
else:
velocity[i] += random.choice([-1, 1]) * random.uniform(
-0.5, 0.5)
heading[i] += random.choice([-1, 1]) * random.uniform(
-math.pi / 8, math.pi / 8)
trajectory_polar = np.vstack((velocity, heading)).transpose()
targets = np.apply_along_axis(self.rect, 1, trajectory_polar)
return targets
def reset(self, **kwargs):
fixed_speed = None
if 'fixed_speed' in kwargs:
fixed_speed = kwargs.pop('fixed_speed', None)
stage = None
if 'stage' in kwargs:
stage = kwargs.pop('stage', None)
boundary = None
if 'boundary' in kwargs:
boundary = kwargs.pop('boundary', None)
_ = self.env.reset(**kwargs)
if fixed_speed is not None:
targets = np.zeros([self.time_limit + 1, 3], dtype=np.float32)
targets[:, 0] = fixed_speed
self.env.targets = targets
elif stage is not None:
self.env.targets = self.generate_stage_targets(stage=stage)
elif boundary is not None:
self.generate_boundary_target()
else:
# generate new target
self.env.generate_new_targets(
poisson_lambda=int(self.time_limit * (300 / 1000)))
if 'project' in kwargs:
if kwargs.get('project') == True:
return self.env.get_observation()
return self.env.get_state_desc()
def step(self, action, **kwargs):
return self.env.step(action, **kwargs)
def calc_vel_diff(state_desc):
...@@ -85,6 +262,7 @@ class ActionScale(gym.Wrapper):
class FrameSkip(gym.Wrapper):
def __init__(self, env, k):
+logger.info("[FrameSkip]type:{}".format(type(env)))
gym.Wrapper.__init__(self, env)
self.frame_skip = k
global FRAME_SKIP
...@@ -103,13 +281,13 @@ class FrameSkip(gym.Wrapper):
if 'reward' in key:
# to ensure that we don't ignore other rewards
# if a new reward is added, consider its logic here
-assert (key == 'shaping_reward') or (key == 'r2_reward')
+assert (key == 'shaping_reward') or (
+key == 'r2_reward') or (key == 'x_offset_reward')
merge_info[key] = merge_info.get(key, 0.0) + info[key]
else:
merge_info[key] = info[key]
if info['target_changed']:
-#merge_info['shaping_reward'] += info['shaping_reward'] * (self.frame_skip - k - 1)
logger.warn("[FrameSkip] early break since target was changed")
break
...@@ -128,6 +306,8 @@ class RewardShaping(gym.Wrapper):
def __init__(self, env):
logger.info("[RewardShaping]type:{}".format(type(env)))
+assert (isinstance(env, ProstheticsEnv)
+or isinstance(env, CustomR2Env)), type(env)
self.step_count = 0
self.pre_state_desc = None
...@@ -150,25 +330,7 @@ class RewardShaping(gym.Wrapper):
self.step_count += 1
obs, r, done, info = self.env.step(action, **kwargs)
info = self.reward_shaping(obs, r, done, action)
-if info['target_vel'] > 2.75:
-rate = math.sqrt((2.75**2) / (info['target_vel']**2))
-logger.warn('Changing targets, origin targets: {}'.format(
-obs['target_vel']))
-obs['target_vel'][0] = obs['target_vel'][0] * rate
-obs['target_vel'][2] = obs['target_vel'][2] * rate
-logger.warn('Changing targets, new targets: {}'.format(
-obs['target_vel']))
-info['target_vel'] = 2.75
-if info['target_vel'] < -0.25:
-rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2))
-logger.warn('Changing targets, origin targets: {}'.format(
-obs['target_vel']))
-obs['target_vel'][0] = obs['target_vel'][0] * rate
-obs['target_vel'][2] = obs['target_vel'][2] * rate
-logger.warn('Changing targets, new targets: {}'.format(
-obs['target_vel']))
-info['target_vel'] = -0.25
+#logger.info('Step {}: target_vel: {}'.format(self.step_count, obs['target_vel']))
delta = 0
if self.last_target_vel is not None:
delta = np.absolute(
...@@ -188,9 +350,7 @@ class RewardShaping(gym.Wrapper):
timeout = False
if self.step_count >= MAXTIME_LIMIT:
timeout = True
-if done and not timeout:
-# penalty for falling down
-info['shaping_reward'] += FALL_PENALTY
info['timeout'] = timeout
self.pre_state_desc = obs
return obs, r, done, info
...@@ -205,31 +365,182 @@ class RewardShaping(gym.Wrapper):
return obs
-class ForwardReward(RewardShaping):
-""" A reward shaping wraper"""
+class TestReward(RewardShaping):
+""" Reward shaping wrapper for test"""
def __init__(self, env):
RewardShaping.__init__(self, env)
def reward_shaping(self, state_desc, r2_reward, done, action):
return {'shaping_reward': 0}
class RunFastestReward(RewardShaping):
""" Reward shaping wrapper for fixed target speed"""
def __init__(self, env):
RewardShaping.__init__(self, env)
def reward_shaping(self, state_desc, r2_reward, done, action):
-target_vel = math.sqrt(state_desc["target_vel"][0]**2 +
-state_desc["target_vel"][2]**2)
-if state_desc["target_vel"][0] < 0:
-target_vel = -target_vel
+if self.pre_state_desc is None:
+x_offset = 0
+else:
+x_offset = state_desc["body_pos"]["pelvis"][
0] - self.pre_state_desc["body_pos"]["pelvis"][0]
ret_r = 0
if self.pre_state_desc is not None:
l_foot_reward = state_desc["body_pos"]["tibia_l"][
0] - self.pre_state_desc["body_pos"]["tibia_l"][0]
r_foot_reward = state_desc["body_pos"]["pros_tibia_r"][
0] - self.pre_state_desc["body_pos"]["pros_tibia_r"][0]
ret_r = max(l_foot_reward, r_foot_reward)
# penalty
headx = state_desc['body_pos']['head'][0]
px = state_desc['body_pos']['pelvis'][0]
headz = state_desc['body_pos']['head'][2]
pz = state_desc['body_pos']['pelvis'][2]
kneer = state_desc['joint_pos']['knee_r'][-1]
kneel = state_desc['joint_pos']['knee_l'][-1]
lean_x = min(0.3, max(0, px - headx - 0.15)) * 0.05
lean_z = min(0.3, max(0, pz - headz - 0.15)) * 0.05
joint = sum([max(0, k - 0.1) for k in [kneer, kneel]]) * 0.03
penalty = lean_x + lean_z + joint
ret_r -= penalty * 0.15
cur_vel_x = state_desc['body_vel']['pelvis'][0]
cur_vel_z = state_desc['body_vel']['pelvis'][2]
scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2)
info = {
'shaping_reward': ret_r,
'r2_reward': r2_reward,
'x_offset_reward': x_offset,
'scalar_vel': scalar_vel,
'mean_action_l2_penalty': 0,
}
return info
class FixedTargetSpeedReward(RewardShaping):
""" Reward shaping wrapper for fixed target speed"""
def __init__(self, env, target_v, act_penalty_lowerbound,
act_penalty_coeff, vel_penalty_coeff):
RewardShaping.__init__(self, env)
assert target_v is not None
assert act_penalty_lowerbound is not None
assert act_penalty_coeff is not None
assert vel_penalty_coeff is not None
self.target_v = target_v
self.act_penalty_lowerbound = act_penalty_lowerbound
self.act_penalty_coeff = act_penalty_coeff
self.vel_penalty_coeff = vel_penalty_coeff
def reward_shaping(self, state_desc, r2_reward, done, action):
if self.pre_state_desc is None:
x_offset = 0
else:
x_offset = state_desc["body_pos"]["pelvis"][
0] - self.pre_state_desc["body_pos"]["pelvis"][0]
# Reward for not falling
ret_r = 36
vel_penalty = ((state_desc["body_vel"]["pelvis"][0] - self.target_v)**2
+ (state_desc["body_vel"]["pelvis"][2] - 0)**2)
origin_action_l2_penalty = np.linalg.norm(action)
action_l2_penalty = max(self.act_penalty_lowerbound,
origin_action_l2_penalty)
ret_r = ret_r - vel_penalty * self.vel_penalty_coeff - action_l2_penalty * self.act_penalty_coeff
cur_vel_x = state_desc['body_vel']['pelvis'][0]
cur_vel_z = state_desc['body_vel']['pelvis'][2]
scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2)
info = {
'shaping_reward': ret_r,
'r2_reward': r2_reward,
'x_offset_reward': x_offset,
'scalar_vel': scalar_vel,
'mean_action_l2_penalty': origin_action_l2_penalty,
}
return info
class Round2Reward(RewardShaping):
""" Reward shaping wrapper for fixed target speed"""
def __init__(self, env, act_penalty_lowerbound, act_penalty_coeff,
vel_penalty_coeff):
RewardShaping.__init__(self, env)
assert act_penalty_lowerbound is not None
assert act_penalty_coeff is not None
assert vel_penalty_coeff is not None
self.act_penalty_lowerbound = act_penalty_lowerbound
self.act_penalty_coeff = act_penalty_coeff
self.vel_penalty_coeff = vel_penalty_coeff
def reward_shaping(self, state_desc, r2_reward, done, action):
if self.pre_state_desc is None:
x_offset = 0
else:
x_offset = state_desc["body_pos"]["pelvis"][
0] - self.pre_state_desc["body_pos"]["pelvis"][0]
# Reward for not falling
ret_r = 10
# Small penalty for too much activation (cost of transport)
muscle_activations = []
for muscle in sorted(state_desc["muscles"].keys()):
muscle_activations += [state_desc["muscles"][muscle]["activation"]]
muscle_penalty = np.sum(np.array(muscle_activations)**2) * 0.001
vel_penalty = (
(state_desc["target_vel"][0] - state_desc["body_vel"]["pelvis"][0])
**2 + (state_desc["target_vel"][2] -
state_desc["body_vel"]["pelvis"][2])**2)
origin_action_l2_penalty = np.linalg.norm(action)
action_l2_penalty = max(self.act_penalty_lowerbound,
origin_action_l2_penalty)
if self.step_count < 60 or (
self.step_count - self.last_target_change_step < 60):
ret_r = ret_r - vel_penalty * self.vel_penalty_coeff
else:
ret_r = ret_r - vel_penalty * self.vel_penalty_coeff - action_l2_penalty * self.act_penalty_coeff
ret_r -= muscle_penalty
cur_vel_x = state_desc['body_vel']['pelvis'][0]
cur_vel_z = state_desc['body_vel']['pelvis'][2]
scalar_vel = math.sqrt(cur_vel_z**2 + cur_vel_x**2)
info = {
-'shaping_reward': r2_reward,
-'target_vel': target_vel,
+'shaping_reward': ret_r,
'r2_reward': r2_reward,
'x_offset_reward': x_offset,
'scalar_vel': scalar_vel,
'mean_action_l2_penalty': origin_action_l2_penalty,
}
return info
class ObsTranformerBase(gym.Wrapper):
def __init__(self, env):
+logger.info("[ObsTranformerBase]type:{}".format(type(env)))
gym.Wrapper.__init__(self, env)
self.step_fea = MAXTIME_LIMIT
-self.raw_obs = None
global FRAME_SKIP
self.frame_skip = int(FRAME_SKIP)
...@@ -268,19 +579,13 @@ class ObsTranformerBase(gym.Wrapper):
self.step_fea -= FRAME_SKIP
-self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
-self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs, r, done, info
def reset(self, **kwargs):
obs = self.env.reset(**kwargs)
-if obs is None:
-return None
self.step_fea = MAXTIME_LIMIT
-self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
-self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs
...@@ -421,7 +726,6 @@ class PelvisBasedObs(ObsTranformerBase):
res = np.append(res, feet_dis)
remaining_time = (self.step_fea -
(MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0
-#logger.info('remaining_time fea: {}'.format(remaining_time))
res = np.append(res, remaining_time)
# target driven
...@@ -450,9 +754,10 @@ if __name__ == '__main__':
env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', difficulty=1, prosthetic=True)
-env = ForwardReward(env)
+env = CustomR2Env(env)
+env = RunFastestReward(env)
env = FrameSkip(env, 4)
env = ActionScale(env)
env = PelvisBasedObs(env)
for i in range(64):
-observation = env.reset(project=False)
+observation = env.reset(project=False, stage=0)
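As a minimal usage sketch, the wrappers above compose the same way as in the `__main__` block, with the reward wrapper swapped according to `--reward_type`. The actual `simulator_client.py` wiring is not shown in this commit, so the module name `env_wrapper` and the example coefficient values (taken from the client-argument defaults) are assumptions:

```python
from osim.env import ProstheticsEnv
# assumption: the training-side wrapper file above is importable as env_wrapper
from env_wrapper import (CustomR2Env, FixedTargetSpeedReward, FrameSkip,
                         ActionScale, PelvisBasedObs)

env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', difficulty=1, prosthetic=True)
env = CustomR2Env(env)                 # customized target trajectories
env = FixedTargetSpeedReward(          # or RunFastestReward / Round2Reward
    env, target_v=3.0, act_penalty_lowerbound=1.5,
    act_penalty_coeff=5.0, vel_penalty_coeff=1.0)
env = FrameSkip(env, 4)                # repeat each action for 4 simulator frames
env = ActionScale(env)                 # map actions from [-1, 1] to [0, 1]
env = PelvisBasedObs(env)              # needs ./pelvisBasedObs_scaler.npz

# CustomR2Env.reset supports the fixed_speed / stage / boundary modes
obs = env.reset(project=False, fixed_speed=3.0)
```

Note that the reward-shaping wrapper sits directly on top of `CustomR2Env`, matching the "must be the first wrapper" requirement in the `RewardShaping` docstring.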
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import copy
import gym
import math
import numpy as np
from collections import OrderedDict
from osim.env import ProstheticsEnv
from parl.utils import logger
MAXTIME_LIMIT = 1000
ProstheticsEnv.time_limit = MAXTIME_LIMIT
FRAME_SKIP = None
FALL_PENALTY = 0
class RemoteEnv(gym.Wrapper):
def __init__(self, env):
env.metadata = {}
env.action_space = None
env.observation_space = None
env.reward_range = None
gym.Wrapper.__init__(self, env)
self.remote_env = env
self.first_time = True
def step(self, act):
return self.remote_env.env_step(act.tolist())
def reset(self):
if self.first_time:
self.first_time = False
return self.remote_env.env_create()
obs = self.remote_env.env_reset()
if not obs:
return None
return obs
def calc_vel_diff(state_desc):
cur_vel_x = state_desc['body_vel']['pelvis'][0]
cur_vel_z = state_desc['body_vel']['pelvis'][2]
target_vel_x = state_desc['target_vel'][0]
target_vel_z = state_desc['target_vel'][2]
diff_vel_x = cur_vel_x - target_vel_x
diff_vel_z = cur_vel_z - target_vel_z
cur_vel = (cur_vel_x**2 + cur_vel_z**2)**0.5
target_vel = (target_vel_x**2 + target_vel_z**2)**0.5
diff_vel = cur_vel - target_vel
target_theta = math.atan(-1.0 * target_vel_z / target_vel_x)
# along the y axis
cur_theta = state_desc['body_pos_rot']['pelvis'][1]
diff_theta = cur_theta - target_theta
return cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel, diff_theta
class ActionScale(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
def step(self, action, **kwargs):
action = (np.copy(action) + 1.0) * 0.5
action = np.clip(action, 0.0, 1.0)
return self.env.step(action, **kwargs)
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class FrameSkip(gym.Wrapper):
def __init__(self, env, k):
gym.Wrapper.__init__(self, env)
self.frame_skip = k
global FRAME_SKIP
FRAME_SKIP = k
self.frame_count = 0
def step(self, action, **kwargs):
r = 0.0
merge_info = {}
for k in range(self.frame_skip):
self.frame_count += 1
obs, reward, done, info = self.env.step(action, **kwargs)
r += reward
for key in info.keys():
if 'reward' in key:
# to ensure that we don't ignore other rewards
# if a new reward is added, consider its logic here
assert (key == 'shaping_reward') or (key == 'r2_reward')
merge_info[key] = merge_info.get(key, 0.0) + info[key]
else:
merge_info[key] = info[key]
if info['target_changed']:
#merge_info['shaping_reward'] += info['shaping_reward'] * (self.frame_skip - k - 1)
logger.warn("[FrameSkip] early break since target was changed")
break
if done:
break
merge_info['frame_count'] = self.frame_count
return obs, r, done, merge_info
def reset(self, **kwargs):
self.frame_count = 0
return self.env.reset(**kwargs)
class RewardShaping(gym.Wrapper):
""" A wrapper for reward shaping, note this wrapper must be the first wrapper """
def __init__(self, env):
logger.info("[RewardShaping]type:{}".format(type(env)))
self.step_count = 0
self.pre_state_desc = None
self.last_target_vel = None
self.last_target_change_step = 0
self.target_change_times = 0
gym.Wrapper.__init__(self, env)
@abc.abstractmethod
def reward_shaping(self, state_desc, reward, done, action):
"""define your own reward computation function
Args:
state_desc(dict): state description for current model
reward(scalar): generic reward generated by env
done(bool): generic done flag generated by env
"""
pass
def step(self, action, **kwargs):
self.step_count += 1
obs, r, done, info = self.env.step(action, **kwargs)
info = self.reward_shaping(obs, r, done, action)
if info['target_vel'] > 2.75:
rate = math.sqrt((2.75**2) / (info['target_vel']**2))
logger.warn('Changing targets, origin targets: {}'.format(
obs['target_vel']))
obs['target_vel'][0] = obs['target_vel'][0] * rate
obs['target_vel'][2] = obs['target_vel'][2] * rate
logger.warn('Changing targets, new targets: {}'.format(
obs['target_vel']))
info['target_vel'] = 2.75
if info['target_vel'] < -0.25:
rate = math.sqrt(((-0.25)**2) / (info['target_vel']**2))
logger.warn('Changing targets, origin targets: {}'.format(
obs['target_vel']))
obs['target_vel'][0] = obs['target_vel'][0] * rate
obs['target_vel'][2] = obs['target_vel'][2] * rate
logger.warn('Changing targets, new targets: {}'.format(
obs['target_vel']))
info['target_vel'] = -0.25
delta = 0
if self.last_target_vel is not None:
delta = np.absolute(
np.array(self.last_target_vel) - np.array(obs['target_vel']))
if (self.last_target_vel is None) or np.all(delta < 1e-5):
info['target_changed'] = False
else:
info['target_changed'] = True
logger.info("[env_wrapper] target_changed, vx:{} vz:{}".format(
obs['target_vel'][0], obs['target_vel'][2]))
self.last_target_change_step = self.step_count
self.target_change_times += 1
info['target_change_times'] = self.target_change_times
self.last_target_vel = obs['target_vel']
assert 'shaping_reward' in info
timeout = False
if self.step_count >= MAXTIME_LIMIT:
timeout = True
if done and not timeout:
# penalty for falling down
info['shaping_reward'] += FALL_PENALTY
info['timeout'] = timeout
self.pre_state_desc = obs
return obs, r, done, info
def reset(self, **kwargs):
self.step_count = 0
self.last_target_vel = None
self.last_target_change_step = 0
self.target_change_times = 0
obs = self.env.reset(**kwargs)
self.pre_state_desc = obs
return obs
class ForwardReward(RewardShaping):
""" A reward shaping wraper"""
def __init__(self, env):
RewardShaping.__init__(self, env)
def reward_shaping(self, state_desc, r2_reward, done, action):
target_vel = math.sqrt(state_desc["target_vel"][0]**2 +
state_desc["target_vel"][2]**2)
if state_desc["target_vel"][0] < 0:
target_vel = -target_vel
info = {
'shaping_reward': r2_reward,
'target_vel': target_vel,
'r2_reward': r2_reward,
}
return info
class ObsTranformerBase(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.step_fea = MAXTIME_LIMIT
self.raw_obs = None
global FRAME_SKIP
self.frame_skip = int(FRAME_SKIP)
def get_observation(self, state_desc):
obs = self._get_observation(state_desc)
if not isinstance(self, PelvisBasedObs):
cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel, diff_theta = calc_vel_diff(
state_desc)
obs = np.append(obs, [
cur_vel_x, cur_vel_z, diff_vel_x, diff_vel_z, diff_vel,
diff_theta
])
else:
pass
return obs
@abc.abstractmethod
def _get_observation(self, state_desc):
pass
def feature_normalize(self, obs, mean, std, duplicate_id):
scaler_len = mean.shape[0]
assert obs.shape[0] >= scaler_len
obs[:scaler_len] = (obs[:scaler_len] - mean) / std
final_obs = []
for i in range(obs.shape[0]):
if i not in duplicate_id:
final_obs.append(obs[i])
return np.array(final_obs)
def step(self, action, **kwargs):
obs, r, done, info = self.env.step(action, **kwargs)
if info['target_changed']:
# reset step_fea when change target
self.step_fea = MAXTIME_LIMIT
self.step_fea -= FRAME_SKIP
self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs, r, done, info
def reset(self, **kwargs):
obs = self.env.reset(**kwargs)
if obs is None:
return None
self.step_fea = MAXTIME_LIMIT
self.raw_obs = copy.deepcopy(obs)
obs = self.get_observation(obs)
self.raw_obs['step_count'] = MAXTIME_LIMIT - self.step_fea
return obs
class PelvisBasedObs(ObsTranformerBase):
def __init__(self, env):
ObsTranformerBase.__init__(self, env)
data = np.load('./pelvisBasedObs_scaler.npz')
self.mean, self.std, self.duplicate_id = data['mean'], data[
'std'], data['duplicate_id']
self.duplicate_id = self.duplicate_id.astype(np.int32).tolist()
def get_core_matrix(self, yaw):
core_matrix = np.zeros(shape=(3, 3))
core_matrix[0][0] = math.cos(yaw)
core_matrix[0][2] = -1.0 * math.sin(yaw)
core_matrix[1][1] = 1
core_matrix[2][0] = math.sin(yaw)
core_matrix[2][2] = math.cos(yaw)
return core_matrix
def _get_observation(self, state_desc):
o = OrderedDict()
for body_part in [
'pelvis', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l',
'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head'
]:
# position
o[body_part + '_x'] = state_desc['body_pos'][body_part][0]
o[body_part + '_y'] = state_desc['body_pos'][body_part][1]
o[body_part + '_z'] = state_desc['body_pos'][body_part][2]
# velocity
o[body_part + '_v_x'] = state_desc["body_vel"][body_part][0]
o[body_part + '_v_y'] = state_desc["body_vel"][body_part][1]
o[body_part + '_v_z'] = state_desc["body_vel"][body_part][2]
o[body_part + '_x_r'] = state_desc["body_pos_rot"][body_part][0]
o[body_part + '_y_r'] = state_desc["body_pos_rot"][body_part][1]
o[body_part + '_z_r'] = state_desc["body_pos_rot"][body_part][2]
o[body_part + '_v_x_r'] = state_desc["body_vel_rot"][body_part][0]
o[body_part + '_v_y_r'] = state_desc["body_vel_rot"][body_part][1]
o[body_part + '_v_z_r'] = state_desc["body_vel_rot"][body_part][2]
for joint in [
'hip_r', 'knee_r', 'ankle_r', 'hip_l', 'knee_l', 'ankle_l',
'back'
]:
if 'hip' not in joint:
o[joint + '_joint_pos'] = state_desc['joint_pos'][joint][0]
o[joint + '_joint_vel'] = state_desc['joint_vel'][joint][0]
else:
for i in range(3):
o[joint + '_joint_pos_' +
str(i)] = state_desc['joint_pos'][joint][i]
o[joint + '_joint_vel_' +
str(i)] = state_desc['joint_vel'][joint][i]
# In NIPS2017, only use activation
for muscle in sorted(state_desc["muscles"].keys()):
activation = state_desc["muscles"][muscle]["activation"]
if isinstance(activation, float):
activation = [activation]
for i, val in enumerate(activation):
o[muscle + '_activation_' + str(i)] = activation[i]
fiber_length = state_desc["muscles"][muscle]["fiber_length"]
if isinstance(fiber_length, float):
fiber_length = [fiber_length]
for i, val in enumerate(fiber_length):
o[muscle + '_fiber_length_' + str(i)] = fiber_length[i]
fiber_velocity = state_desc["muscles"][muscle]["fiber_velocity"]
if isinstance(fiber_velocity, float):
fiber_velocity = [fiber_velocity]
for i, val in enumerate(fiber_velocity):
o[muscle + '_fiber_velocity_' + str(i)] = fiber_velocity[i]
# z axis of mass center has some problems now, delete it later
o['mass_x'] = state_desc["misc"]["mass_center_pos"][0]
o['mass_y'] = state_desc["misc"]["mass_center_pos"][1]
o['mass_z'] = state_desc["misc"]["mass_center_pos"][2]
o['mass_v_x'] = state_desc["misc"]["mass_center_vel"][0]
o['mass_v_y'] = state_desc["misc"]["mass_center_vel"][1]
o['mass_v_z'] = state_desc["misc"]["mass_center_vel"][2]
for key in ['talus_l_y', 'toes_l_y']:
o['touch_indicator_' + key] = np.clip(0.05 - o[key] * 10 + 0.5, 0.,
1.)
o['touch_indicator_2_' + key] = np.clip(0.1 - o[key] * 10 + 0.5,
0., 1.)
# Transform into the pelvis-based frame
core_matrix = self.get_core_matrix(o['pelvis_y_r'])
pelvis_pos = np.array([o['pelvis_x'], o['pelvis_y'],
o['pelvis_z']]).reshape((3, 1))
pelvis_vel = np.array(
[o['pelvis_v_x'], o['pelvis_v_y'], o['pelvis_v_z']]).reshape((3,
1))
for body_part in [
'mass', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l',
'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head'
]:
# rotation
if body_part != 'mass':
o[body_part + '_y_r'] -= o['pelvis_y_r']
o[body_part + '_v_y_r'] -= o['pelvis_v_y_r']
# position/velocity
global_pos = []
global_vel = []
for each in ['_x', '_y', '_z']:
global_pos.append(o[body_part + each])
global_vel.append(o[body_part + '_v' + each])
global_pos = np.array(global_pos).reshape((3, 1))
global_vel = np.array(global_vel).reshape((3, 1))
pelvis_rel_pos = core_matrix.dot(global_pos - pelvis_pos)
w = o['pelvis_v_y_r']
offset = np.array(
[-w * pelvis_rel_pos[2], 0, w * pelvis_rel_pos[0]])
pelvis_rel_vel = core_matrix.dot(global_vel - pelvis_vel) + offset
for i, each in enumerate(['_x', '_y', '_z']):
o[body_part + each] = pelvis_rel_pos[i][0]
o[body_part + '_v' + each] = pelvis_rel_vel[i][0]
for key in ['pelvis_x', 'pelvis_z', 'pelvis_y_r']:
del o[key]
current_v = np.array(state_desc['body_vel']['pelvis']).reshape((3, 1))
pelvis_current_v = core_matrix.dot(current_v)
o['pelvis_v_x'] = pelvis_current_v[0]
o['pelvis_v_z'] = pelvis_current_v[2]
res = np.array(list(o.values()))
res = self.feature_normalize(
res, mean=self.mean, std=self.std, duplicate_id=self.duplicate_id)
feet_dis = ((o['tibia_l_x'] - o['pros_tibia_r_x'])**2 +
(o['tibia_l_z'] - o['pros_tibia_r_z'])**2)**0.5
res = np.append(res, feet_dis)
remaining_time = (self.step_fea -
(MAXTIME_LIMIT / 2.0)) / (MAXTIME_LIMIT / 2.0) * -1.0
#logger.info('remaining_time fea: {}'.format(remaining_time))
res = np.append(res, remaining_time)
# target driven
target_v = np.array(state_desc['target_vel']).reshape((3, 1))
pelvis_target_v = core_matrix.dot(target_v)
diff_vel_x = pelvis_target_v[0] - pelvis_current_v[0]
diff_vel_z = pelvis_target_v[2] - pelvis_current_v[2]
diff_vel = np.sqrt(pelvis_target_v[0] ** 2 + pelvis_target_v[2] ** 2) - \
np.sqrt(pelvis_current_v[0] ** 2 + pelvis_current_v[2] ** 2)
target_vel_x = target_v[0]
target_vel_z = target_v[2]
target_theta = math.atan(-1.0 * target_vel_z / target_vel_x)
current_theta = state_desc['body_pos_rot']['pelvis'][1]
diff_theta = target_theta - current_theta
res = np.append(res, [
diff_vel_x[0] / 3.0, diff_vel_z[0] / 3.0, diff_vel[0] / 3.0,
diff_theta / (np.pi * 3 / 8)
])
return res
if __name__ == '__main__':
from osim.env import ProstheticsEnv
env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', difficulty=1, prosthetic=True)
env = ForwardReward(env)
env = FrameSkip(env, 4)
env = ActionScale(env)
env = PelvisBasedObs(env)
for i in range(64):
observation = env.reset(project=False)
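For reference, `get_core_matrix` above is simply a rotation about the vertical (y) axis by the pelvis yaw, which is what maps global positions and velocities into the pelvis-based frame. A standalone check (`standalone_core_matrix` is a hypothetical re-implementation for illustration, not part of the repo):

```python
import math
import numpy as np

def standalone_core_matrix(yaw):
    """Same matrix as PelvisBasedObs.get_core_matrix: rotation about the y axis."""
    return np.array([
        [math.cos(yaw), 0.0, -math.sin(yaw)],
        [0.0,           1.0,  0.0],
        [math.sin(yaw), 0.0,  math.cos(yaw)],
    ])

yaw = 0.3
m = standalone_core_matrix(yaw)
# rotating a global-frame velocity into the pelvis frame preserves its length
v_global = np.array([1.25, 0.0, 0.4]).reshape(3, 1)
v_pelvis = m.dot(v_global)
assert np.isclose(np.linalg.norm(v_global), np.linalg.norm(v_pelvis))
# the matrix is orthonormal, so its transpose undoes the rotation
assert np.allclose(m.T.dot(m), np.eye(3))
```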
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import time
from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, ForwardReward
from osim.env import ProstheticsEnv
from parl.utils import logger
from submit_model import SubmitModel
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
np.random.seed(seed)
env = ProstheticsEnv(visualize=vis)
env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed)
env = ForwardReward(env)
env = FrameSkip(env, 4)
env = ActionScale(env)
env = PelvisBasedObs(env)
all_reward = []
all_shaping_reward = 0
last_frames_count = 0
for e in range(episode_num):
t = time.time()
episode_reward = 0.0
episode_shaping_reward = 0.0
observation = env.reset(project=False)
target_change_times = 0
step = 0
loss = []
while True:
step += 1
action = submit_model.pred_batch(observation, target_change_times)
observation, reward, done, info = env.step(action, project=False)
step_frames = info['frame_count'] - last_frames_count
last_frames_count = info['frame_count']
episode_reward += reward
# we place it here to drop the first step after the target changes
if target_change_times >= 1:
loss.append(10 * step_frames - reward)
if info['target_changed']:
target_change_times = min(target_change_times + 1, 3)
logger.info("[step/{}]reward:{} info:{}".format(
step, reward, info))
episode_shaping_reward += info['shaping_reward']
if done:
break
all_reward.append(episode_reward)
all_shaping_reward += episode_shaping_reward
t = time.time() - t
logger.info(
"[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}"
.format(e, t, episode_reward, np.sum(loss[:15]), np.sum(loss[15:]),
np.mean(all_reward)))
logger.info("Mean reward:{}".format(np.mean(all_reward)))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--use_cuda', action="store_true", help='If set, will run in gpu 0')
parser.add_argument(
'--vis', action="store_true", help='If set, will visualize.')
parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument(
'--episode_num', type=int, default=1, help='Episode number to run.')
args = parser.parse_args()
submit_model = SubmitModel(use_cuda=args.use_cuda)
play_multi_episode(
submit_model,
episode_num=args.episode_num,
vis=args.vis,
seed=args.seed)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl.layers as layers
from copy import deepcopy
from paddle import fluid
from parl.framework.algorithm_base import Algorithm
__all__ = ['MultiHeadDDPG']
class MultiHeadDDPG(Algorithm):
def __init__(self, models, hyperparas):
""" model: should implement the function get_actor_params()
"""
self.models = models
self.target_models = []
for model in models:
target_model = deepcopy(model)
self.target_models.append(target_model)
# fetch hyper parameters
self.gamma = hyperparas['gamma']
self.tau = hyperparas['tau']
self.ensemble_num = hyperparas['ensemble_num']
def define_predict(self, obs, model_id):
""" use actor model of self.models[model_id] to predict the action
"""
return self.models[model_id].policy(obs)
def define_ensemble_predict(self, obs):
""" ensemble predict:
1. Each critic scores the actions of all actors and normalizes its scores;
2. Each actor's score is the average of the normalized scores
given by all critics;
3. Choose the action of the actor with the best score.
"""
actor_outputs = []
for i in range(self.ensemble_num):
actor_outputs.append(self.models[i].policy(obs))
batch_actions = layers.concat(actor_outputs, axis=0)
batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])
critic_outputs = []
for i in range(self.ensemble_num):
critic_output = self.models[i].value(batch_obs, batch_actions)
critic_output = layers.unsqueeze(critic_output, axes=[1])
critic_outputs.append(critic_output)
score_matrix = layers.concat(critic_outputs, axis=1)
# Normalize scores given by each critic
sum_critic_score = layers.reduce_sum(
score_matrix, dim=0, keep_dim=True)
sum_critic_score = layers.expand(
sum_critic_score, expand_times=[self.ensemble_num, 1])
norm_score_matrix = score_matrix / sum_critic_score
actions_mean_score = layers.reduce_mean(
norm_score_matrix, dim=1, keep_dim=True)
best_score_id = layers.argmax(actions_mean_score, axis=0)
best_score_id = layers.cast(best_score_id, dtype='int32')
ensemble_predict_action = layers.gather(batch_actions, best_score_id)
return ensemble_predict_action
def define_learn(self, obs, action, reward, next_obs, terminal, actor_lr,
critic_lr, model_id):
""" update actor and critic model of self.models[model_id] with DDPG algorithm
"""
actor_cost = self._actor_learn(obs, actor_lr, model_id)
critic_cost = self._critic_learn(obs, action, reward, next_obs,
terminal, critic_lr, model_id)
return actor_cost, critic_cost
def _actor_learn(self, obs, actor_lr, model_id):
action = self.models[model_id].policy(obs)
Q = self.models[model_id].value(obs, action)
cost = layers.reduce_mean(-1.0 * Q)
optimizer = fluid.optimizer.AdamOptimizer(actor_lr)
optimizer.minimize(
cost, parameter_list=self.models[model_id].get_actor_params())
return cost
def _critic_learn(self, obs, action, reward, next_obs, terminal, critic_lr,
model_id):
next_action = self.target_models[model_id].policy(next_obs)
next_Q = self.target_models[model_id].value(next_obs, next_action)
terminal = layers.cast(terminal, dtype='float32')
target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
target_Q.stop_gradient = True
Q = self.models[model_id].value(obs, action)
cost = layers.square_error_cost(Q, target_Q)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.AdamOptimizer(critic_lr)
optimizer.minimize(cost)
return cost
def sync_target(self,
gpu_id,
model_id,
decay=None,
share_vars_parallel_executor=None):
if decay is None:
decay = 1.0 - self.tau
self.models[model_id].sync_params_to(
self.target_models[model_id],
gpu_id=gpu_id,
decay=decay,
share_vars_parallel_executor=share_vars_parallel_executor)
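To make the ensemble action selection documented in `define_ensemble_predict` concrete, here is a plain-numpy sketch of the same scoring rule; `ensemble_select` is a hypothetical helper for illustration, not part of the repo:

```python
import numpy as np

def ensemble_select(actions, q_fns):
    """Plain-numpy version of the scoring in MultiHeadDDPG.define_ensemble_predict.

    actions: list of candidate actions, one per actor head (numpy arrays).
    q_fns:   list of critic callables; q_fns[j](action) -> scalar Q for a fixed obs.
    """
    # score_matrix[i, j] = Q_j(action_i): critic j scores actor i's action
    score_matrix = np.array([[q(a) for q in q_fns] for a in actions], dtype=np.float64)
    # normalize each critic's column so critics on different scales vote equally
    norm_scores = score_matrix / score_matrix.sum(axis=0, keepdims=True)
    # an action's final score is its mean normalized score over all critics
    mean_scores = norm_scores.mean(axis=1)
    return actions[int(np.argmax(mean_scores))]

# toy usage with 3 heads and hand-written critics that all prefer actions near 0.5
acts = [np.array([0.1]), np.array([0.5]), np.array([0.9])]
critics = [lambda a: float(1.0 - abs(a[0] - 0.5)) for _ in range(3)]
print(ensemble_select(acts, critics))  # -> [0.5]
```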
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl.layers as layers
import re
from paddle import fluid
from paddle.fluid.executor import _fetch_var
from parl.framework.agent_base import Agent
from parl.utils import logger
class OpenSimAgent(Agent):
def __init__(self, algorithm, obs_dim, act_dim, ensemble_num):
self.obs_dim = obs_dim
self.act_dim = act_dim
self.ensemble_num = ensemble_num
super(OpenSimAgent, self).__init__(algorithm)
# Use ParallelExecutor to make the program run faster
use_cuda = True if self.gpu_id >= 0 else False
self.learn_pe = []
self.pred_pe = []
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True
exec_strategy.num_threads = 4
build_strategy = fluid.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
for i in range(self.ensemble_num):
with fluid.scope_guard(fluid.global_scope().new_scope()):
pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.learn_programs[i],
exec_strategy=exec_strategy,
build_strategy=build_strategy)
self.learn_pe.append(pe)
with fluid.scope_guard(fluid.global_scope().new_scope()):
pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.predict_programs[i],
exec_strategy=exec_strategy,
build_strategy=build_strategy)
self.pred_pe.append(pe)
# Attention: fully sync the target model at the beginning.
self.alg.sync_target(
gpu_id=self.gpu_id,
model_id=i,
decay=1.0,
share_vars_parallel_executor=self.learn_pe[i])
# Cache in advance: create the ParallelExecutor used for syncing params here.
# Otherwise there are some issues when ensemble_num > 1.
self.alg.sync_target(
gpu_id=self.gpu_id,
model_id=i,
share_vars_parallel_executor=self.learn_pe[i])
with fluid.scope_guard(fluid.global_scope().new_scope()):
self.ensemble_predict_pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.ensemble_predict_program,
exec_strategy=exec_strategy,
build_strategy=build_strategy)
def build_program(self):
self.predict_programs = []
self.predict_outputs = []
self.learn_programs = []
self.learn_programs_output = []
for i in range(self.ensemble_num):
predict_program = fluid.Program()
with fluid.program_guard(predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = self.alg.define_predict(obs, model_id=i)
self.predict_programs.append(predict_program)
self.predict_outputs.append([act.name])
learn_program = fluid.Program()
with fluid.program_guard(learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = layers.data(
name='act', shape=[self.act_dim], dtype='float32')
reward = layers.data(name='reward', shape=[], dtype='float32')
next_obs = layers.data(
name='next_obs', shape=[self.obs_dim], dtype='float32')
terminal = layers.data(name='terminal', shape=[], dtype='bool')
actor_lr = layers.data(
name='actor_lr',
shape=[1],
dtype='float32',
append_batch_size=False)
critic_lr = layers.data(
name='critic_lr',
shape=[1],
dtype='float32',
append_batch_size=False)
actor_loss, critic_loss = self.alg.define_learn(
obs,
act,
reward,
next_obs,
terminal,
actor_lr,
critic_lr,
model_id=i)
self.learn_programs.append(learn_program)
self.learn_programs_output.append([critic_loss.name])
self.ensemble_predict_program = fluid.Program()
with fluid.program_guard(self.ensemble_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = self.alg.define_ensemble_predict(obs)
self.ensemble_predict_output = [act.name]
def predict(self, obs, model_id):
feed = {'obs': obs}
feed = [feed]
act = self.pred_pe[model_id].run(
feed=feed, fetch_list=self.predict_outputs[model_id])[0]
return act
def ensemble_predict(self, obs):
feed = {'obs': obs}
feed = [feed]
act = self.ensemble_predict_pe.run(
feed=feed, fetch_list=self.ensemble_predict_output)[0]
return act
def learn(self, obs, act, reward, next_obs, terminal, actor_lr, critic_lr,
model_id):
feed = {
'obs': obs,
'act': act,
'reward': reward,
'next_obs': next_obs,
'terminal': terminal,
'actor_lr': actor_lr,
'critic_lr': critic_lr
}
feed = [feed]
critic_loss = self.learn_pe[model_id].run(
feed=feed, fetch_list=self.learn_programs_output[model_id])[0]
self.alg.sync_target(
gpu_id=self.gpu_id,
model_id=model_id,
share_vars_parallel_executor=self.learn_pe[model_id])
return critic_loss
def save_params(self, dirname):
for i in range(self.ensemble_num):
fluid.io.save_params(
executor=self.fluid_executor,
dirname=dirname,
main_program=self.learn_programs[i])
def load_params(self, dirname, from_one_head):
if from_one_head:
logger.info('[From one head, extend to multi head:]')
# load model 0
fluid.io.load_params(
executor=self.fluid_executor,
dirname=dirname,
main_program=self.learn_programs[0])
# sync identity params of model/target_model 0 to other models/target_models
for i in range(1, self.ensemble_num):
params = list(
filter(
lambda x: 'identity' in x.name and '@GRAD' not in x.name,
self.learn_programs[i].list_vars()))
for param in params:
param_var = _fetch_var(param.name, return_numpy=False)
model0_name = re.sub(r"identity_\d+", "identity_0",
param.name)
model0_value = _fetch_var(model0_name, return_numpy=True)
logger.info('{} -> {}'.format(model0_name, param.name))
param_var.set(model0_value, self.place)
# sync shared params of target_model 0 to the other target models
# After deepcopy, the shared params between target models are different
for i in range(1, self.ensemble_num):
params = list(
filter(
lambda x: 'shared' in x.name and 'PARL_target' in x.name and '@GRAD' not in x.name,
self.learn_programs[i].list_vars()))
for param in params:
param_var = _fetch_var(param.name, return_numpy=False)
model0_name = re.sub(r"_\d+$", "_0", param.name)
model0_value = _fetch_var(model0_name, return_numpy=True)
logger.info('{} -> {}'.format(model0_name, param.name))
param_var.set(model0_value, self.place)
else:
for i in range(self.ensemble_num):
fluid.io.load_params(
executor=self.fluid_executor,
dirname=dirname,
main_program=self.learn_programs[i])
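    # Usage sketch (illustrative only; mirrors how simulator_server.py and test.py drive this agent,
    # assuming `alg`, OBS_DIM, ACT_DIM and ensemble_num are built as in those scripts):
    #   agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, ensemble_num)
    #   action = agent.predict(batch_obs, model_id)                 # rollout with a single head
    #   action = agent.ensemble_predict(batch_obs)                  # combined action of all heads (evaluation)
    #   critic_loss = agent.learn(obs, act, reward, next_obs, terminal,
    #                             actor_lr, critic_lr, model_id)    # update one head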
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl.layers as layers
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from parl.framework.model_base import Model
class OpenSimModel(Model):
def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id=0, shared=True):
self.actor_model = ActorModel(obs_dim, vel_obs_dim, act_dim, model_id,
shared)
self.critic_model = CriticModel(obs_dim, vel_obs_dim, act_dim,
model_id, shared)
def policy(self, obs):
return self.actor_model.policy(obs)
def value(self, obs, action):
return self.critic_model.value(obs, action)
def get_actor_params(self):
return self.actor_model.parameter_names
class ActorModel(Model):
def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id, shared):
hid0_size = 800
hid1_size = 400
hid2_size = 200
vel_hid0_size = 200
vel_hid1_size = 400
self.obs_dim = obs_dim
self.vel_obs_dim = vel_obs_dim
# bottom layers
if shared:
scope_name = 'policy_shared'
else:
scope_name = 'policy_identity_{}'.format(model_id)
self.fc0 = layers.fc(
size=hid0_size,
act='tanh',
param_attr=ParamAttr(name='{}/h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h0/b'.format(scope_name)))
self.fc1 = layers.fc(
size=hid1_size,
act='tanh',
param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
self.vel_fc0 = layers.fc(
size=vel_hid0_size,
act='tanh',
param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
self.vel_fc1 = layers.fc(
size=vel_hid1_size,
act='tanh',
param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name)))
# top layers
scope_name = 'policy_identity_{}'.format(model_id)
self.fc2 = layers.fc(
size=hid2_size,
act='tanh',
param_attr=ParamAttr(name='{}/h2/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h2/b'.format(scope_name)))
self.fc3 = layers.fc(
size=act_dim,
act='tanh',
param_attr=ParamAttr(name='{}/means/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/means/b'.format(scope_name)))
def policy(self, obs):
real_obs = layers.slice(
obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim])
        # target-related features (the last vel_obs_dim entries of obs)
vel_obs = layers.slice(
obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
hid0 = self.fc0(real_obs)
hid1 = self.fc1(hid0)
vel_hid0 = self.vel_fc0(vel_obs)
vel_hid1 = self.vel_fc1(vel_hid0)
concat = layers.concat([hid1, vel_hid1], axis=1)
hid2 = self.fc2(concat)
means = self.fc3(hid2)
return means
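    # Observation layout assumed by policy() above (and mirrored in CriticModel.value below):
    #   obs[:, :-vel_obs_dim]  -> body/pose features, processed by fc0/fc1
    #   obs[:, -vel_obs_dim:]  -> target-velocity features, processed by the separate vel_fc0/vel_fc1 branch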
class CriticModel(Model):
def __init__(self, obs_dim, vel_obs_dim, act_dim, model_id, shared):
super(CriticModel, self).__init__()
hid0_size = 800
hid1_size = 400
vel_hid0_size = 200
vel_hid1_size = 400
self.obs_dim = obs_dim
self.vel_obs_dim = vel_obs_dim
        # bottom layers
if shared:
scope_name = 'critic_shared'
else:
scope_name = 'critic_identity_{}'.format(model_id)
self.fc0 = layers.fc(
size=hid0_size,
act='selu',
param_attr=ParamAttr(name='{}/w1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/w1/b'.format(scope_name)))
self.fc1 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h1/b'.format(scope_name)))
self.vel_fc0 = layers.fc(
size=vel_hid0_size,
act='selu',
param_attr=ParamAttr(name='{}/vel_h0/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h0/b'.format(scope_name)))
self.vel_fc1 = layers.fc(
size=vel_hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/vel_h1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/vel_h1/b'.format(scope_name)))
self.act_fc0 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/a1/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/a1/b'.format(scope_name)))
# top layers
scope_name = 'critic_identity_{}'.format(model_id)
self.fc2 = layers.fc(
size=hid1_size,
act='selu',
param_attr=ParamAttr(name='{}/h3/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/h3/b'.format(scope_name)))
self.fc3 = layers.fc(
size=1,
act='selu',
param_attr=ParamAttr(name='{}/value/W'.format(scope_name)),
bias_attr=ParamAttr(name='{}/value/b'.format(scope_name)))
def value(self, obs, action):
real_obs = layers.slice(
obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim])
        # target-related features (the last vel_obs_dim entries of obs)
vel_obs = layers.slice(
obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
hid0 = self.fc0(real_obs)
hid1 = self.fc1(hid0)
vel_hid0 = self.vel_fc0(vel_obs)
vel_hid1 = self.vel_fc1(vel_hid0)
a1 = self.act_fc0(action)
concat = layers.concat([hid1, a1, vel_hid1], axis=1)
hid2 = self.fc2(concat)
Q = self.fc3(hid2)
Q = layers.squeeze(Q, axes=[1])
return Q
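# Construction sketch (illustrative; this is how simulator_server.py and test.py assemble the ensemble):
#   models = [OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i) for i in range(ensemble_num)]
#   alg = MultiHeadDDPG(models, {'gamma': GAMMA, 'tau': TAU, 'ensemble_num': ensemble_num})
#   agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, ensemble_num)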
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import grpc
import json
import numpy as np
import simulator_pb2
import simulator_pb2_grpc
from args import get_client_args
from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, MAXTIME_LIMIT, CustomR2Env, RunFastestReward, FixedTargetSpeedReward, Round2Reward
from osim.env import ProstheticsEnv
from parl.utils import logger
ProstheticsEnv.time_limit = MAXTIME_LIMIT
class Worker(object):
def __init__(self, server_ip='localhost', server_port=5007):
if args.ident is not None:
self.worker_id = args.ident
else:
self.worker_id = np.random.randint(int(1e18))
self.address = '{}:{}'.format(server_ip, server_port)
random_seed = int(self.worker_id % int(1e9))
np.random.seed(random_seed)
env = ProstheticsEnv(visualize=False, seed=random_seed)
env.change_model(
model='3D', difficulty=1, prosthetic=True, seed=random_seed)
env.spec.timestep_limit = MAXTIME_LIMIT
env = CustomR2Env(env)
if args.reward_type == 'RunFastest':
env = RunFastestReward(env)
elif args.reward_type == 'FixedTargetSpeed':
env = FixedTargetSpeedReward(
env, args.target_v, args.act_penalty_lowerbound,
args.act_penalty_coeff, args.vel_penalty_coeff)
elif args.reward_type == 'Round2':
env = Round2Reward(env, args.act_penalty_lowerbound,
args.act_penalty_coeff, args.vel_penalty_coeff)
else:
            assert False, 'Unsupported reward type!'
env = FrameSkip(env, 4)
env = ActionScale(env)
self.env = PelvisBasedObs(env)
def run(self):
observation = self.env.reset(project=False, stage=args.stage)
reward = 0
done = False
info = {'shaping_reward': 0.0}
info['first'] = True
with grpc.insecure_channel(self.address) as channel:
stub = simulator_pb2_grpc.SimulatorStub(channel)
while True:
response = stub.Send(
simulator_pb2.Request(
observation=observation,
reward=reward,
done=done,
info=json.dumps(info),
id=self.worker_id))
extra = json.loads(response.extra)
if 'reset' in extra and extra['reset']:
logger.info('Server require to reset!')
observation = self.env.reset(
project=False, stage=args.stage)
reward = 0
done = False
info = {'shaping_reward': 0.0}
continue
if 'shutdown' in extra and extra['shutdown']:
break
action = np.array(response.action)
next_observation, reward, done, info = self.env.step(
action, project=False)
# debug info
if args.debug:
logger.info("dim:{} obs:{} act:{} reward:{} done:{} info:{}".format(\
len(observation), np.sum(observation), np.sum(action), reward, done, info))
observation = next_observation
if done:
observation = self.env.reset(
project=False, stage=args.stage)
# the last observation should be used to compute append_value in simulator_server
info['last_obs'] = next_observation.tolist()
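    # Message flow of Worker.run() (matching SimulatorServer.Send on the server side):
    #   every Request carries (observation, reward, done, info, id) and every Reply carries (action, extra);
    #   the server may set extra['reset'] or extra['shutdown'] to control the episode lifecycle,
    #   info['first'] marks the first message of a connection, and info['last_obs'] carries the final
    #   observation of a finished episode so the server can store the last transition correctly.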
if __name__ == '__main__':
args = get_client_args()
worker = Worker(server_ip=args.ip, server_port=args.port)
worker.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: simulator.proto
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (
lambda x: x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='simulator.proto',
package='simulator',
syntax='proto3',
serialized_pb=_b(
'\n\x0fsimulator.proto\x12\tsimulator\"V\n\x07Request\x12\x13\n\x0bobservation\x18\x01 \x03(\x01\x12\x0e\n\x06reward\x18\x02 \x01(\x01\x12\x0c\n\x04\x64one\x18\x03 \x01(\x08\x12\x0c\n\x04info\x18\x04 \x01(\t\x12\n\n\x02id\x18\x05 \x01(\x03\"&\n\x05Reply\x12\x0e\n\x06\x61\x63tion\x18\x01 \x03(\x01\x12\r\n\x05\x65xtra\x18\x02 \x01(\t2;\n\tSimulator\x12.\n\x04Send\x12\x12.simulator.Request\x1a\x10.simulator.Reply\"\x00\x62\x06proto3'
))
_REQUEST = _descriptor.Descriptor(
name='Request',
full_name='simulator.Request',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='observation',
full_name='simulator.Request.observation',
index=0,
number=1,
type=1,
cpp_type=5,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='reward',
full_name='simulator.Request.reward',
index=1,
number=2,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='done',
full_name='simulator.Request.done',
index=2,
number=3,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='info',
full_name='simulator.Request.info',
index=3,
number=4,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='id',
full_name='simulator.Request.id',
index=4,
number=5,
type=3,
cpp_type=2,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=30,
serialized_end=116,
)
_REPLY = _descriptor.Descriptor(
name='Reply',
full_name='simulator.Reply',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='action',
full_name='simulator.Reply.action',
index=0,
number=1,
type=1,
cpp_type=5,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='extra',
full_name='simulator.Reply.extra',
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=118,
serialized_end=156,
)
DESCRIPTOR.message_types_by_name['Request'] = _REQUEST
DESCRIPTOR.message_types_by_name['Reply'] = _REPLY
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
Request = _reflection.GeneratedProtocolMessageType(
'Request',
(_message.Message, ),
dict(
DESCRIPTOR=_REQUEST,
__module__='simulator_pb2'
# @@protoc_insertion_point(class_scope:simulator.Request)
))
_sym_db.RegisterMessage(Request)
Reply = _reflection.GeneratedProtocolMessageType(
'Reply',
(_message.Message, ),
dict(
DESCRIPTOR=_REPLY,
__module__='simulator_pb2'
# @@protoc_insertion_point(class_scope:simulator.Reply)
))
_sym_db.RegisterMessage(Reply)
_SIMULATOR = _descriptor.ServiceDescriptor(
name='Simulator',
full_name='simulator.Simulator',
file=DESCRIPTOR,
index=0,
options=None,
serialized_start=158,
serialized_end=217,
methods=[
_descriptor.MethodDescriptor(
name='Send',
full_name='simulator.Simulator.Send',
index=0,
containing_service=None,
input_type=_REQUEST,
output_type=_REPLY,
options=None,
),
])
_sym_db.RegisterServiceDescriptor(_SIMULATOR)
DESCRIPTOR.services_by_name['Simulator'] = _SIMULATOR
# @@protoc_insertion_point(module_scope)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
import grpc
import simulator_pb2 as simulator__pb2
class SimulatorStub(object):
"""The greeting service definition.
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Send = channel.unary_unary(
'/simulator.Simulator/Send',
request_serializer=simulator__pb2.Request.SerializeToString,
response_deserializer=simulator__pb2.Reply.FromString,
)
class SimulatorServicer(object):
"""The greeting service definition.
"""
def Send(self, request, context):
"""Request Action
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_SimulatorServicer_to_server(servicer, server):
rpc_method_handlers = {
'Send':
grpc.unary_unary_rpc_method_handler(
servicer.Send,
request_deserializer=simulator__pb2.Request.FromString,
response_serializer=simulator__pb2.Reply.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'simulator.Simulator', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler, ))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import grpc
import json
import numpy as np
import os
import queue
import simulator_pb2
import simulator_pb2_grpc
import six
import time
import threading
from args import get_server_args
from collections import defaultdict
from concurrent import futures
from multi_head_ddpg import MultiHeadDDPG
from opensim_agent import OpenSimAgent
from opensim_model import OpenSimModel
from parl.utils import logger, ReplayMemory
from utils import calc_indicators, ScalarsManager, TransitionExperience
ACT_DIM = 19
VEL_DIM = 4
OBS_DIM = 185 + VEL_DIM
GAMMA = 0.96
TAU = 0.001
ACTOR_LR = 3e-5
CRITIC_LR = 3e-5
TRAIN_TIMES = 100
BATCH_SIZE = 128
NOISE_DECAY = 0.999998
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
class SimulatorServer(simulator_pb2_grpc.SimulatorServicer):
class ClientState(object):
def __init__(self):
self.memory = [] # list of Experience
self.ident = None
self.model_idx = np.random.randint(args.ensemble_num)
self.last_target_changed = 0
self.target_change_times = 0
def reset(self):
self.last_target_changed = 0
self.memory = []
self.model_idx = np.random.randint(args.ensemble_num)
self.target_change_times = 0
def update_last_target_changed(self):
self.last_target_changed = len(self.memory)
def __init__(self):
self.rpm = ReplayMemory(int(2e6), OBS_DIM, ACT_DIM)
        # Need to acquire the corresponding lock when a model is learning or predicting
self.locks = []
for i in range(args.ensemble_num):
self.locks.append(threading.Lock())
models = []
for i in range(args.ensemble_num):
models.append(OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i))
hyperparas = {
'gamma': GAMMA,
'tau': TAU,
'ensemble_num': args.ensemble_num
}
alg = MultiHeadDDPG(models, hyperparas)
self.agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, args.ensemble_num)
self.scalars_manager = ScalarsManager(logger.get_dir())
# add lock when appending data to rpm or writing scalars to tensorboard
self.MEMORY_LOCK = threading.Lock()
self.clients = defaultdict(self.ClientState)
self.ready_client_queue = queue.Queue()
self.noiselevel = 0.5
self.global_step = 0
# thread to keep training
t = threading.Thread(target=self.keep_training)
t.start()
def _new_ready_client(self):
""" The client is ready to start new episode,
but blocking until training thread call client_ready_event.set()
"""
client_ready_event = threading.Event()
self.ready_client_queue.put(client_ready_event)
logger.info(
"[new_ready_client] approximate size of ready clients:{}".format(
self.ready_client_queue.qsize()))
client_ready_event.wait()
def Send(self, request, context):
""" Implement Send function in SimulatorServicer
Everytime a request comming, will create a new thread to handle
"""
ident, obs, reward, done, info = request.id, request.observation, request.reward, request.done, request.info
client = self.clients[ident]
info = json.loads(info)
if 'first' in info:
            # Wait for the training thread to allow starting a new episode
self._new_ready_client()
obs = np.array(obs, dtype=np.float32)
self._process_msg(ident, obs, reward, done, info)
if done:
            # Wait for the training thread to allow starting a new episode
self._new_ready_client()
action = self.pred_batch(obs, client.model_idx)
step = len(client.memory) - client.last_target_changed
        # exploration noise is only added in the single-head setting (ensemble_num == 1)
if args.ensemble_num == 1:
current_noise = self.noiselevel * (0.98**(step - 1))
noise = np.zeros((ACT_DIM, ), dtype=np.float32)
if ident % 3 == 0:
if step % 5 == 0:
noise = np.random.randn(ACT_DIM) * current_noise
elif ident % 3 == 1:
if step % 5 == 0:
noise = np.random.randn(ACT_DIM) * current_noise * 2
action += noise
action = np.clip(action, -1, 1)
client.memory[-1].action = action
extra_info = {}
return simulator_pb2.Reply(action=action, extra=json.dumps(extra_info))
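    # Exploration-noise schedule used in Send() above (single-head setting only):
    #   the noise scale decays as noiselevel * 0.98**(step - 1), where step counts steps since the
    #   last target change; noise is re-drawn every 5 steps and its magnitude depends on the client
    #   id (ident % 3), so different workers explore with different amounts of noise.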
def _process_msg(self, ident, obs, reward, done, info):
client = self.clients[ident]
reward_scale = (1 - GAMMA)
info['shaping_reward'] *= reward_scale
if len(client.memory) > 0:
client.memory[-1].reward = reward
info['target_change_times'] = client.target_change_times
client.memory[-1].info = info
if info['target_changed']:
client.target_change_times = min(
client.target_change_times + 1, 3)
# re-sample model_idx after target was changed
client.model_idx = np.random.randint(args.ensemble_num)
if done:
assert 'last_obs' in info
self._parse_memory(client, ident, info['last_obs'])
client.memory.append(
TransitionExperience(obs=obs, action=None, reward=None, info=None))
if 'target_changed' in info and info['target_changed']:
client.update_last_target_changed()
return False
def _parse_memory(self, client, ident, last_obs):
mem = client.memory
n = len(mem)
# debug info
if ident == 1:
for i, exp in enumerate(mem):
logger.info(
"[step:{}] obs:{} action:{} reward:{} shaping_reward:{}".
format(i, np.sum(mem[i].obs), np.sum(mem[i].action),
mem[i].reward, mem[i].info['shaping_reward']))
episode_rpm = []
for i in range(n - 1):
if not mem[i].info['target_changed']:
episode_rpm.append([
mem[i].obs, mem[i].action, mem[i].info['shaping_reward'],
mem[i + 1].obs, False, mem[i].info['target_change_times']
])
if not mem[-1].info['target_changed']:
episode_rpm.append([
mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'],
last_obs, not mem[-1].info['timeout'],
mem[i].info['target_change_times']
])
indicators_dict = calc_indicators(mem)
indicators_dict['free_client_num'] = self.ready_client_queue.qsize()
indicators_dict['noiselevel'] = self.noiselevel
with self.MEMORY_LOCK:
self.add_episode_rpm(episode_rpm)
self.scalars_manager.record(indicators_dict, self.global_step)
self.global_step += 1
if self.global_step >= 50:
self.noiselevel = self.noiselevel * NOISE_DECAY
client.reset()
def learn(self):
result_q = queue.Queue()
th_list = []
for j in range(args.ensemble_num):
t = threading.Thread(
target=self.train_single_model, args=(j, result_q))
th_list.append(t)
start_time = time.time()
for t in th_list:
t.start()
for t in th_list:
t.join()
logger.info("[learn] {} heads, time consuming:{}".format(
args.ensemble_num,
time.time() - start_time))
for t in th_list:
result = result_q.get()
for critic_loss in result:
self.scalars_manager.feed_critic_loss(critic_loss)
def train_single_model(self, model_idx, result_q):
logger.info("[train_single_model] model_idx:{}".format(model_idx))
critic_loss_list = []
lock = self.locks[model_idx]
memory = self.rpm
actor_lr = ACTOR_LR * (1.0 - 0.05 * model_idx)
critic_lr = CRITIC_LR * (1.0 + 0.1 * model_idx)
for T in range(TRAIN_TIMES):
[states, actions, rewards, new_states,
dones] = memory.sample_batch(BATCH_SIZE)
lock.acquire()
critic_loss = self.agent.learn(states, actions, rewards,
new_states, dones, actor_lr,
critic_lr, model_idx)
lock.release()
critic_loss_list.append(critic_loss)
result_q.put(critic_loss_list)
def keep_training(self):
episode_count = 1000000
for T in range(episode_count):
if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs:
self.learn()
logger.info(
"[keep_training/{}] trying to acq a new env".format(T))
            # Keep training and data collection balanced:
            # after training, wait for a ready client and let it start a new episode
ready_client_event = self.ready_client_queue.get()
ready_client_event.set()
if np.mod(T, 100) == 0:
logger.info("saving models")
self.save(T)
if np.mod(T, 10000) == 0:
logger.info("saving rpm")
self.save_rpm()
def save_rpm(self):
save_path = os.path.join(logger.get_dir(), "rpm.npz")
self.rpm.save(save_path)
def restore_rpm(self, rpm_dir):
self.rpm.load(rpm_dir)
def save(self, T):
save_path = os.path.join(logger.get_dir(),
'model_every_100_episodes/step-{}'.format(T))
self.agent.save_params(save_path)
def restore(self, model_path, restore_from_one_head):
logger.info('restore model from {}'.format(model_path))
self.agent.load_params(model_path, restore_from_one_head)
def add_episode_rpm(self, episode_rpm):
for x in episode_rpm:
self.rpm.append(
obs=x[0], act=x[1], reward=x[2], next_obs=x[3], terminal=x[4])
def pred_batch(self, obs, model_idx=None):
assert model_idx is not None
batch_obs = np.expand_dims(obs, axis=0)
self.locks[model_idx].acquire()
action = self.agent.predict(batch_obs, model_idx)
self.locks[model_idx].release()
action = np.squeeze(action, axis=0)
return action
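    # Concurrency sketch of SimulatorServer (as implemented above):
    #   gRPC handler threads run Send() for each remote worker, append finished episodes to the replay
    #   memory under MEMORY_LOCK, and block on ready_client_queue before starting a new episode;
    #   the keep_training thread alternately calls learn() (one thread per ensemble head, each guarded
    #   by its per-model lock) and releases one waiting client, keeping training and data collection balanced.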
class SimulatorHandler(threading.Thread):
def __init__(self, simulator_server):
threading.Thread.__init__(self)
self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=400))
simulator_pb2_grpc.add_SimulatorServicer_to_server(
simulator_server, self.server)
self.server.add_insecure_port('[::]:{}'.format(args.port))
def run(self):
self.server.start()
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
self.server.stop(0)
if __name__ == '__main__':
args = get_server_args()
if args.logdir is not None:
logger.set_dir(args.logdir)
simulator_server = SimulatorServer()
if args.restore_rpm_path is not None:
simulator_server.restore_rpm(args.restore_rpm_path)
if args.restore_model_path is not None:
simulator_server.restore(args.restore_model_path,
args.restore_from_one_head)
    simulator_handler = SimulatorHandler(simulator_server=simulator_server)
    simulator_handler.run()
...@@ -15,74 +15,82 @@ ...@@ -15,74 +15,82 @@
import argparse import argparse
import numpy as np import numpy as np
import time import time
from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, ForwardReward from env_wrapper import FrameSkip, ActionScale, PelvisBasedObs, TestReward
from multi_head_ddpg import MultiHeadDDPG
from opensim_agent import OpenSimAgent
from opensim_model import OpenSimModel
from osim.env import ProstheticsEnv from osim.env import ProstheticsEnv
from parl.utils import logger from parl.utils import logger
from submit_model import SubmitModel """
Test model with ensemble predict
"""
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0): def play_multi_episode(agent, episode_num=2, vis=False, seed=0):
np.random.seed(seed) np.random.seed(seed)
env = ProstheticsEnv(visualize=vis) env = ProstheticsEnv(visualize=vis)
env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed) env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed)
env = ForwardReward(env) env = TestReward(env)
env = FrameSkip(env, 4) env = FrameSkip(env, 4)
env = ActionScale(env) env = ActionScale(env)
env = PelvisBasedObs(env) env = PelvisBasedObs(env)
all_reward = [] all_reward = []
all_shaping_reward = 0
last_frames_count = 0
for e in range(episode_num): for e in range(episode_num):
t = time.time() t = time.time()
episode_reward = 0.0 episode_reward = 0.0
episode_shaping_reward = 0.0 obs = env.reset(project=False)
observation = env.reset(project=False)
target_change_times = 0
step = 0 step = 0
loss = []
while True: while True:
step += 1 step += 1
action = submit_model.pred_batch(observation, target_change_times)
observation, reward, done, info = env.step(action, project=False) batch_obs = np.expand_dims(obs, axis=0)
step_frames = info['frame_count'] - last_frames_count
last_frames_count = info['frame_count'] action = agent.ensemble_predict(batch_obs)
action = np.squeeze(action, axis=0)
obs, reward, done, info = env.step(action, project=False)
episode_reward += reward episode_reward += reward
# we pacle it here to drop the first step after changing logger.info("[step/{}]reward:{}".format(step, reward))
if target_change_times >= 1:
loss.append(10 * step_frames - reward)
if info['target_changed']:
target_change_times = min(target_change_times + 1, 3)
logger.info("[step/{}]reward:{} info:{}".format(
step, reward, info))
episode_shaping_reward += info['shaping_reward']
if done: if done:
break break
all_reward.append(episode_reward) all_reward.append(episode_reward)
all_shaping_reward += episode_shaping_reward
t = time.time() - t t = time.time() - t
logger.info( logger.info(
"[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}" "[episode/{}] time: {} episode_reward:{} mean_reward:{}".format(
.format(e, t, episode_reward, np.sum(loss[:15]), np.sum(loss[15:]), e, t, episode_reward, np.mean(all_reward)))
np.mean(all_reward)))
logger.info("Mean reward:{}".format(np.mean(all_reward))) logger.info("Mean reward:{}".format(np.mean(all_reward)))
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'--use_cuda', action="store_true", help='If set, will run in gpu 0') '--restore_model_path', type=str, help='restore model path for test')
parser.add_argument( parser.add_argument(
'--vis', action="store_true", help='If set, will visualize.') '--vis', action="store_true", help='If set, will visualize.')
parser.add_argument('--seed', type=int, default=0, help='Random seed.') parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument( parser.add_argument(
'--episode_num', type=int, default=1, help='Episode number to run.') '--episode_num', type=int, default=1, help='Episode number to run.')
parser.add_argument('--ensemble_num', type=int, help='ensemble_num')
args = parser.parse_args() args = parser.parse_args()
submit_model = SubmitModel(use_cuda=args.use_cuda) ACT_DIM = 19
VEL_DIM = 4
OBS_DIM = 185 + VEL_DIM
GAMMA = 0.96
TAU = 0.001
models = []
for i in range(args.ensemble_num):
models.append(OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i))
hyperparas = {
'gamma': GAMMA,
'tau': TAU,
'ensemble_num': args.ensemble_num
}
alg = MultiHeadDDPG(models, hyperparas)
agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, args.ensemble_num)
agent.load_params(args.restore_model_path)
play_multi_episode( play_multi_episode(
submit_model, agent, episode_num=args.episode_num, vis=args.vis, seed=args.seed)
episode_num=args.episode_num,
vis=args.vis,
seed=args.seed)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import numpy as np
import tensorflow as tf
import os
import six
class Summary(object):
"""Logging in tensorboard without tensorflow ops.
Simple example on how to log scalars and images to tensorboard without tensor ops.
License: Copyleft
__author__ = "Michael Gygli"
"""
def __init__(self, logdir):
"""Creates a summary writer logging to logdir."""
self.writer = tf.summary.FileWriter(logdir)
def log_scalar(self, tag, value, step):
"""Log a scalar variable.
Parameter
----------
tag : basestring
Name of the scalar
value
step : int
training iteration
"""
summary = tf.Summary(
value=[tf.Summary.Value(tag=tag, simple_value=value)])
self.writer.add_summary(summary, step)
self.writer.flush()
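    # Usage sketch (illustrative; path and values are arbitrary examples):
    #   summary = Summary(logdir='./train_log')
    #   summary.log_scalar('performance/episode_r2_reward', 9850.0, step=100)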
class StatCounter(object):
""" A simple counter"""
def __init__(self, max_size=50):
self.reset()
self.max_size = max_size
def feed(self, v):
"""
Args:
v(float or np.ndarray): has to be the same shape between calls.
"""
self._values.append(v)
if len(self._values) > self.max_size:
self._values = self._values[-self.max_size:]
def reset(self):
self._values = []
@property
def count(self):
return len(self._values)
@property
def mean(self):
assert len(self._values)
return np.mean(self._values)
@property
def sum(self):
assert len(self._values)
return np.sum(self._values)
@property
def max(self):
assert len(self._values)
return max(self._values)
@property
def min(self):
assert len(self._values)
return min(self._values)
@property
def success_rate(self):
count = 0
for v in self._values:
if v > 35.0:
count += 1
return float(count) / len(self._values)
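    # Usage sketch (illustrative): StatCounter keeps a sliding window of the last `max_size` values.
    #   counter = StatCounter(max_size=500)
    #   counter.feed(3.0)
    #   counter.feed(5.0)
    #   counter.mean   # -> 4.0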
def calc_indicators(mem):
START_STEPS = 15
n = len(mem)
episode_shaping_reward = np.sum(
[exp.info['shaping_reward'] for exp in mem])
episode_r2_reward = np.sum([
exp.info['r2_reward'] for exp in mem if exp.info['frame_count'] <= 1000
])
x_offset_reward = np.sum(exp.info['x_offset_reward'] for exp in mem)
episode_length = mem[-1].info['frame_count']
scalar_vel = np.mean([exp.info['scalar_vel'] for exp in mem])
action_l2_penalty = np.mean(
[exp.info['mean_action_l2_penalty'] for exp in mem])
start_loss = 10 * START_STEPS * 4 - np.sum([exp.reward
for exp in mem][:START_STEPS])
all_start_loss = 0
for i in range(n):
if not mem[i].info['target_changed']:
frame_count = 4
if i - 1 >= 0:
frame_count = mem[i].info['frame_count'] - mem[
i - 1].info['frame_count']
all_start_loss += 10.0 * frame_count - mem[i].reward
else:
break
start_other_loss = all_start_loss - start_loss
first_change_loss = 0
second_change_loss = 0
third_change_loss = 0
first_change_other_loss = 0
second_change_other_loss = 0
third_change_other_loss = 0
first_stage_vel = 0.0
second_stage_vel = 0.0
third_stage_vel = 0.0
other_loss = 0
change_loss = []
all_change_loss = []
change_vel = []
for i in range(n - 1):
if mem[i].info['target_changed']:
change_loss.append(0.0)
all_change_loss.append(0.0)
change_vel.append([])
for j in range(START_STEPS):
idx = i + 1 + j
if idx >= n or mem[idx].info['target_changed']:
break
frame_count = 4
if idx - 1 >= 0:
frame_count = mem[idx].info['frame_count'] - mem[
idx - 1].info['frame_count']
change_loss[-1] += 10.0 * frame_count - mem[idx].reward
for j in range(n - i - 1):
idx = i + 1 + j
if idx >= n or mem[idx].info['target_changed']:
break
if idx - 1 >= 0:
frame_count = mem[idx].info['frame_count'] - mem[
idx - 1].info['frame_count']
all_change_loss[-1] += 10.0 * frame_count - mem[idx].reward
change_vel[-1].append(mem[idx].info['scalar_vel'])
if len(change_loss) >= 1:
first_change_loss = change_loss[0]
first_change_other_loss = all_change_loss[0] - change_loss[0]
if len(change_loss) >= 2:
second_change_loss = change_loss[1]
second_change_other_loss = all_change_loss[1] - change_loss[1]
if len(change_loss) >= 3:
third_change_loss = change_loss[2]
third_change_other_loss = all_change_loss[2] - change_loss[2]
other_loss = 10 * mem[-1].info[
'frame_count'] - start_loss - first_change_loss - second_change_loss - third_change_loss - episode_r2_reward
if len(change_vel) >= 1:
first_stage_vel = np.mean(change_vel[0])
if len(change_vel) >= 2:
second_stage_vel = np.mean(change_vel[1])
if len(change_vel) >= 3:
third_stage_vel = np.mean(change_vel[2])
indicators_dict = {
'episode_shaping_reward': episode_shaping_reward,
'episode_r2_reward': episode_r2_reward,
'x_offset_reward': x_offset_reward,
'episode_length': episode_length,
'scalar_vel': scalar_vel,
'mean_action_l2_penalty': action_l2_penalty,
'start_loss': start_loss,
'first_change_loss': first_change_loss,
'second_change_loss': second_change_loss,
'third_change_loss': third_change_loss,
'start_other_loss': start_other_loss,
'first_change_other_loss': first_change_other_loss,
'second_change_other_loss': second_change_other_loss,
'third_change_other_loss': third_change_other_loss,
'other_loss': other_loss,
'first_stage_vel': first_stage_vel,
'second_stage_vel': second_stage_vel,
'third_stage_vel': third_stage_vel
}
return indicators_dict
class ScalarsManager(object):
def __init__(self, logdir):
self.summary = Summary(logdir=logdir)
self.max_shaping_reward = 0
self.max_x_offset_reward = 0
self.max_r2_reward = 0
self.critic_loss_counter = StatCounter(max_size=500)
self.r2_reward_counter = StatCounter(max_size=500)
self.nofall_r2_reward_counter = StatCounter(max_size=500)
self.falldown_counter100 = StatCounter(max_size=100)
self.vel_keys = [
'scalar_vel', 'first_stage_vel', 'second_stage_vel',
'third_stage_vel'
]
self.vel_counter = {}
for key in self.vel_keys:
self.vel_counter[key] = StatCounter(max_size=500)
self.reward_loss_keys = [
'start_loss', 'first_change_loss', 'second_change_loss',
'third_change_loss', 'start_other_loss', 'first_change_other_loss',
'second_change_other_loss', 'third_change_other_loss', 'other_loss'
]
self.reward_loss_counter = {}
for key in self.reward_loss_keys:
self.reward_loss_counter[key] = StatCounter(max_size=500)
def feed_critic_loss(self, critic_loss):
self.critic_loss_counter.feed(critic_loss)
def record(self, record_dict, global_step):
self.max_shaping_reward = max(self.max_shaping_reward,
record_dict['episode_shaping_reward'])
self.max_x_offset_reward = max(self.max_x_offset_reward,
record_dict['x_offset_reward'])
self.max_r2_reward = max(self.max_r2_reward,
record_dict['episode_r2_reward'])
self.r2_reward_counter.feed(record_dict['episode_r2_reward'])
if record_dict['episode_length'] >= 1000: # no falldown
self.nofall_r2_reward_counter.feed(
record_dict['episode_r2_reward'])
self.falldown_counter100.feed(0.0)
else:
self.falldown_counter100.feed(1.0)
for key in self.reward_loss_keys:
self.reward_loss_counter[key].feed(record_dict[key])
for key in self.vel_keys:
self.vel_counter[key].feed(record_dict[key])
self.summary.log_scalar('performance/falldown_rate',
self.falldown_counter100.sum / 100.0,
global_step)
self.summary.log_scalar('performance/max_r2_reward',
self.max_r2_reward, global_step)
self.summary.log_scalar('performance/max_shaping_reward',
self.max_shaping_reward, global_step)
self.summary.log_scalar('performance/max_x_offset_reward',
self.max_x_offset_reward, global_step)
self.summary.log_scalar('performance/episode_r2_reward',
record_dict['episode_r2_reward'], global_step)
self.summary.log_scalar('performance/episode_shaping_reward',
record_dict['episode_shaping_reward'],
global_step)
self.summary.log_scalar('performance/x_offset_reward',
record_dict['x_offset_reward'], global_step)
self.summary.log_scalar('performance/episode_length',
record_dict['episode_length'], global_step)
self.summary.log_scalar('performance/mean_action_l2_penalty',
record_dict['mean_action_l2_penalty'],
global_step)
self.summary.log_scalar('server/free_client_num',
record_dict['free_client_num'], global_step)
self.summary.log_scalar('model/noiselevel', record_dict['noiselevel'],
global_step)
if self.critic_loss_counter.count > 0:
mean_critic_loss = self.critic_loss_counter.mean
self.summary.log_scalar('model/critic_loss', mean_critic_loss,
global_step)
if self.r2_reward_counter.count > 400:
mean_r2_reward = self.r2_reward_counter.mean
self.summary.log_scalar('performance/recent_r2_reward',
mean_r2_reward, global_step)
mean_nofall_r2_reward = self.nofall_r2_reward_counter.mean
self.summary.log_scalar('performance/recent_nofall_r2_reward',
mean_nofall_r2_reward, global_step)
for key in self.vel_keys:
self.summary.log_scalar('scalar_vel/{}'.format(key),
self.vel_counter[key].mean,
global_step)
for key in self.reward_loss_keys:
if 'first' in key:
self.summary.log_scalar('1_stage_loss_reward/' + key,
self.reward_loss_counter[key].mean,
global_step)
elif 'second' in key:
self.summary.log_scalar('2_stage_loss_reward/' + key,
self.reward_loss_counter[key].mean,
global_step)
elif 'third' in key:
self.summary.log_scalar('3_stage_loss_reward/' + key,
self.reward_loss_counter[key].mean,
global_step)
elif 'start' in key:
self.summary.log_scalar('0_stage_loss_reward/' + key,
self.reward_loss_counter[key].mean,
global_step)
else:
self.summary.log_scalar('loss_reward/' + key,
self.reward_loss_counter[key].mean,
global_step)
self.summary.log_scalar(
'0_stage_loss_reward/stage_loss',
self.reward_loss_counter['start_loss'].mean +
self.reward_loss_counter['start_other_loss'].mean, global_step)
self.summary.log_scalar(
'1_stage_loss_reward/stage_loss',
self.reward_loss_counter['first_change_loss'].mean +
self.reward_loss_counter['first_change_other_loss'].mean,
global_step)
self.summary.log_scalar(
'2_stage_loss_reward/stage_loss',
self.reward_loss_counter['second_change_loss'].mean +
self.reward_loss_counter['second_change_other_loss'].mean,
global_step)
self.summary.log_scalar(
'3_stage_loss_reward/stage_loss',
self.reward_loss_counter['third_change_loss'].mean +
self.reward_loss_counter['third_change_other_loss'].mean,
global_step)
class TransitionExperience(object):
""" A transition of state, or experience"""
def __init__(self, obs, action, reward, info, **kwargs):
""" kwargs: whatever other attribute you want to save"""
self.obs = obs
self.action = action
self.reward = reward
self.info = info
for k, v in six.iteritems(kwargs):
setattr(self, k, v)
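    # Usage note (as in simulator_server.py): one TransitionExperience is appended per environment step with
    #   TransitionExperience(obs=obs, action=None, reward=None, info=None)
    # and its action / reward / info fields are filled in once the next message from the worker arrives.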
...@@ -25,12 +25,11 @@ from utils import * ...@@ -25,12 +25,11 @@ from utils import *
def run_train_episode(env, agent, scaler): def run_train_episode(env, agent, scaler):
obs = env.reset() obs = env.reset()
observes, actions, rewards, unscaled_obs = [], [], [], [] observes, actions, rewards, unscaled_obs = [], [], [], []
done = False
step = 0.0 step = 0.0
scale, offset = scaler.get() scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature offset[-1] = 0.0 # don't offset time step feature
while not done: while True:
obs = obs.reshape((1, -1)) obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature obs = np.append(obs, [[step]], axis=1) # add time step feature
unscaled_obs.append(obs) unscaled_obs.append(obs)
...@@ -50,6 +49,9 @@ def run_train_episode(env, agent, scaler): ...@@ -50,6 +49,9 @@ def run_train_episode(env, agent, scaler):
rewards.append(reward) rewards.append(reward)
step += 1e-3 # increment time step feature step += 1e-3 # increment time step feature
if done:
break
return (np.concatenate(observes), np.concatenate(actions), return (np.concatenate(observes), np.concatenate(actions),
np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs)) np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
...@@ -75,6 +77,7 @@ def run_evaluate_episode(env, agent, scaler): ...@@ -75,6 +77,7 @@ def run_evaluate_episode(env, agent, scaler):
rewards.append(reward) rewards.append(reward)
step += 1e-3 # increment time step feature step += 1e-3 # increment time step feature
if done: if done:
break break
return np.sum(rewards) return np.sum(rewards)
...@@ -153,7 +156,6 @@ def main(): ...@@ -153,7 +156,6 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'--env', '--env',
...@@ -186,6 +188,5 @@ if __name__ == "__main__": ...@@ -186,6 +188,5 @@ if __name__ == "__main__":
default='CLIP') default='CLIP')
args = parser.parse_args() args = parser.parse_args()
import time
logger.set_dir('./log_dir/{}_{}'.format(args.loss_type, time.time()))
main() main()
...@@ -48,8 +48,9 @@ class Agent(object): ...@@ -48,8 +48,9 @@ class Agent(object):
if gpu_id is None: if gpu_id is None:
gpu_id = 0 if get_gpu_count() > 0 else -1 gpu_id = 0 if get_gpu_count() > 0 else -1
self.gpu_id = gpu_id self.gpu_id = gpu_id
place = fluid.CUDAPlace(gpu_id) if gpu_id >= 0 else fluid.CPUPlace() self.place = fluid.CUDAPlace(
self.fluid_executor = fluid.Executor(place) gpu_id) if gpu_id >= 0 else fluid.CPUPlace()
self.fluid_executor = fluid.Executor(self.place)
self.fluid_executor.run(fluid.default_startup_program()) self.fluid_executor.run(fluid.default_startup_program())
def build_program(self): def build_program(self):
......
...@@ -27,13 +27,19 @@ class Network(object): ...@@ -27,13 +27,19 @@ class Network(object):
A Network is an unordered set of LayerFuncs or Networks. A Network is an unordered set of LayerFuncs or Networks.
""" """
def sync_params_to(self, target_net, gpu_id=0, decay=0.0): def sync_params_to(self,
target_net,
gpu_id=0,
decay=0.0,
share_vars_parallel_executor=None):
""" """
Args: Args:
target_net: Network object deepcopy from source network target_net: Network object deepcopy from source network
gpu_id: gpu id of target_net gpu_id: gpu id of target_net
decay: Float. The decay to use. decay: Float. The decay to use.
target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights
share_vars_parallel_executor: if not None, will use fluid.ParallelExecutor
to run program instead of fluid.Executor
""" """
args_hash_id = hashlib.md5('{}_{}_{}'.format( args_hash_id = hashlib.md5('{}_{}_{}'.format(
id(target_net), gpu_id, decay).encode('utf-8')).hexdigest() id(target_net), gpu_id, decay).encode('utf-8')).hexdigest()
...@@ -59,9 +65,6 @@ class Network(object): ...@@ -59,9 +65,6 @@ class Network(object):
param_pairs = get_parameter_pairs(self, target_net) param_pairs = get_parameter_pairs(self, target_net)
place = fluid.CPUPlace() if gpu_id < 0 \
else fluid.CUDAPlace(gpu_id)
self._cached_fluid_executor = fluid.Executor(place)
self._cached_sync_params_program = fluid.Program() self._cached_sync_params_program = fluid.Program()
with fluid.program_guard(self._cached_sync_params_program): with fluid.program_guard(self._cached_sync_params_program):
...@@ -71,7 +74,34 @@ class Network(object): ...@@ -71,7 +74,34 @@ class Network(object):
fluid.layers.assign( fluid.layers.assign(
decay * target_var + (1 - decay) * src_var, target_var) decay * target_var + (1 - decay) * src_var, target_var)
if share_vars_parallel_executor is None:
# use fluid.Executor
place = fluid.CPUPlace() if gpu_id < 0 \
else fluid.CUDAPlace(gpu_id)
self._cached_fluid_executor = fluid.Executor(place)
else:
# use fluid.ParallelExecutor
use_cuda = True if gpu_id >= 0 else False
# specify strategy to make ParallelExecutor run faster
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True
exec_strategy.num_threads = 4
build_strategy = fluid.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
with fluid.scope_guard(fluid.global_scope().new_scope()):
self._cached_fluid_executor = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self._cached_sync_params_program,
share_vars_from=share_vars_parallel_executor,
exec_strategy=exec_strategy,
build_strategy=build_strategy,
)
if share_vars_parallel_executor is None:
self._cached_fluid_executor.run(self._cached_sync_params_program) self._cached_fluid_executor.run(self._cached_sync_params_program)
else:
self._cached_fluid_executor.run(fetch_list=[])
@property @property
def parameter_names(self): def parameter_names(self):
......
...@@ -14,3 +14,4 @@ ...@@ -14,3 +14,4 @@
from parl.utils.utils import * from parl.utils.utils import *
from parl.utils.gputils import * from parl.utils.gputils import *
from parl.utils.replay_memory import *
...@@ -101,7 +101,10 @@ for level in _LOGGING_LEVEL: ...@@ -101,7 +101,10 @@ for level in _LOGGING_LEVEL:
def _set_file(path): def _set_file(path):
global _FILE_HANDLER global _FILE_HANDLER
if os.path.isfile(path): if os.path.isfile(path):
try:
os.remove(path) os.remove(path)
except OSError:
pass
hdl = logging.FileHandler(filename=path, encoding='utf-8', mode='w') hdl = logging.FileHandler(filename=path, encoding='utf-8', mode='w')
hdl.setFormatter(_Formatter(datefmt='%m-%d %H:%M:%S')) hdl.setFormatter(_Formatter(datefmt='%m-%d %H:%M:%S'))
......
...@@ -13,37 +13,74 @@ ...@@ -13,37 +13,74 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
from parl.utils import logger
__all__ = ['ReplayMemory']
class ReplayMemory(object): class ReplayMemory(object):
def __init__(self, max_size, obs_dim, act_dim): def __init__(self, max_size, obs_dim, act_dim):
self.max_size = max_size self.max_size = int(max_size)
self.obs_memory = np.zeros((max_size, obs_dim), dtype='float32') self.obs_dim = obs_dim
self.act_memory = np.zeros((max_size, act_dim), dtype='float32') self.act_dim = act_dim
self.reward_memory = np.zeros((max_size, ), dtype='float32')
self.next_obs_memory = np.zeros((max_size, obs_dim), dtype='float32') self.obs = np.zeros((max_size, obs_dim), dtype='float32')
self.terminal_memory = np.zeros((max_size, ), dtype='bool') self.action = np.zeros((max_size, act_dim), dtype='float32')
self.reward = np.zeros((max_size, ), dtype='float32')
self.terminal = np.zeros((max_size, ), dtype='bool')
self.next_obs = np.zeros((max_size, obs_dim), dtype='float32')
self._curr_size = 0 self._curr_size = 0
self._curr_pos = 0 self._curr_pos = 0
def sample_batch(self, batch_size): def sample_batch(self, batch_size):
        batch_idx = np.random.choice(self._curr_size, size=batch_size) # offset indices so we do not sample transitions that are currently being appended
obs = self.obs_memory[batch_idx, :] batch_idx = np.random.randint(
act = self.act_memory[batch_idx, :] self._curr_size - 300 - 1, size=batch_size)
reward = self.reward_memory[batch_idx] batch_idx = (self._curr_pos + 300 + batch_idx) % self._curr_size
next_obs = self.next_obs_memory[batch_idx, :]
terminal = self.terminal_memory[batch_idx] obs = self.obs[batch_idx]
return obs, act, reward, next_obs, terminal reward = self.reward[batch_idx]
action = self.action[batch_idx]
next_obs = self.next_obs[batch_idx]
terminal = self.terminal[batch_idx]
return obs, action, reward, next_obs, terminal
def append(self, obs, act, reward, next_obs, terminal): def append(self, obs, act, reward, next_obs, terminal):
if self._curr_size < self.max_size: if self._curr_size < self.max_size:
self._curr_size += 1 self._curr_size += 1
self.obs_memory[self._curr_pos] = obs self.obs[self._curr_pos] = obs
self.act_memory[self._curr_pos] = act self.action[self._curr_pos] = act
self.reward_memory[self._curr_pos] = reward self.reward[self._curr_pos] = reward
self.next_obs_memory[self._curr_pos] = next_obs self.next_obs[self._curr_pos] = next_obs
self.terminal_memory[self._curr_pos] = terminal self.terminal[self._curr_pos] = terminal
self._curr_pos = (self._curr_pos + 1) % self.max_size self._curr_pos = (self._curr_pos + 1) % self.max_size
def size(self): def size(self):
return self._curr_size return self._curr_size
def save(self, pathname):
other = np.array([self._curr_size, self._curr_pos], dtype=np.int32)
np.savez(
pathname,
obs=self.obs,
action=self.action,
reward=self.reward,
terminal=self.terminal,
next_obs=self.next_obs,
other=other)
def load(self, pathname):
data = np.load(pathname)
other = data['other']
if int(other[0]) > self.max_size:
logger.warn('loading from a bigger size rpm!')
self._curr_size = min(int(other[0]), self.max_size)
self._curr_pos = min(int(other[1]), self.max_size - 1)
self.obs[:self._curr_size] = data['obs'][:self._curr_size]
self.action[:self._curr_size] = data['action'][:self._curr_size]
self.reward[:self._curr_size] = data['reward'][:self._curr_size]
self.terminal[:self._curr_size] = data['terminal'][:self._curr_size]
self.next_obs[:self._curr_size] = data['next_obs'][:self._curr_size]
logger.info("[load rpm]memory loade from {}".format(pathname))