Unverified commit bcface6a, authored by Bo Zhou, committed by GitHub

replace tensorboard with summary to support VDL (#276)

* replace tensorboard with summary to support VDL in the future

* unittest

* rename keys for record

* yapf
Parent 40a0ab3f
@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
 from parl.utils.window_stat import WindowStat
 from parl.utils.time_stat import TimeStat
 from parl.utils import machine_info
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
 from parl.algorithms import A2C
 from atari_model import ActorCritic
@@ -205,18 +205,18 @@ class Learner(object):
 }
 if metric['mean_episode_rewards'] is not None:
-tensorboard.add_scalar('train/mean_reward',
-metric['mean_episode_rewards'],
-self.sample_total_steps)
-tensorboard.add_scalar('train/total_loss', metric['total_loss'],
-self.sample_total_steps)
-tensorboard.add_scalar('train/pi_loss', metric['pi_loss'],
-self.sample_total_steps)
-tensorboard.add_scalar('train/vf_loss', metric['vf_loss'],
-self.sample_total_steps)
-tensorboard.add_scalar('train/entropy', metric['entropy'],
-self.sample_total_steps)
-tensorboard.add_scalar('train/learn_rate', metric['lr'],
-self.sample_total_steps)
+summary.add_scalar('train/mean_reward',
+metric['mean_episode_rewards'],
+self.sample_total_steps)
+summary.add_scalar('train/total_loss', metric['total_loss'],
+self.sample_total_steps)
+summary.add_scalar('train/pi_loss', metric['pi_loss'],
+self.sample_total_steps)
+summary.add_scalar('train/vf_loss', metric['vf_loss'],
+self.sample_total_steps)
+summary.add_scalar('train/entropy', metric['entropy'],
+self.sample_total_steps)
+summary.add_scalar('train/learn_rate', metric['lr'],
+self.sample_total_steps)
 logger.info(metric)
......
@@ -22,7 +22,7 @@ import parl
 import numpy as np
 from tqdm import tqdm
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
 from parl.algorithms import DQN, DDQN
 from agent import AtariAgent
@@ -152,18 +152,17 @@ def main():
 for _ in range(3):
 eval_rewards.append(run_evaluate_episode(test_env, agent))
-tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards),
-total_steps)
-tensorboard.add_scalar('dqn/score', total_reward, total_steps)
-tensorboard.add_scalar('dqn/loss', loss, total_steps)
-tensorboard.add_scalar('dqn/exploration', agent.exploration,
-total_steps)
-tensorboard.add_scalar('dqn/Q value',
-evaluate_fixed_Q(agent, fixed_obs),
-total_steps)
-tensorboard.add_scalar('dqn/grad_norm',
-get_grad_norm(agent.alg.model),
-total_steps)
+summary.add_scalar('dqn/eval', np.mean(eval_rewards),
+total_steps)
+summary.add_scalar('dqn/score', total_reward, total_steps)
+summary.add_scalar('dqn/loss', loss, total_steps)
+summary.add_scalar('dqn/exploration', agent.exploration,
+total_steps)
+summary.add_scalar('dqn/Q value',
+evaluate_fixed_Q(agent, fixed_obs),
+total_steps)
+summary.add_scalar('dqn/grad_norm',
+get_grad_norm(agent.alg.model), total_steps)
 if __name__ == '__main__':
......
@@ -15,7 +15,7 @@
 import gym
 import argparse
 import numpy as np
-from parl.utils import logger, tensorboard, ReplayMemory
+from parl.utils import logger, summary, ReplayMemory
 from mujoco_model import MujocoModel
 from mujoco_agent import MujocoAgent
@@ -103,8 +103,7 @@ def main():
 train_reward, steps = run_train_episode(env, agent, rpm)
 total_steps += steps
 logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
-tensorboard.add_scalar('train/episode_reward', train_reward,
-total_steps)
+summary.add_scalar('train/episode_reward', train_reward, total_steps)
 if total_steps // args.test_every_steps >= test_flag:
 while total_steps // args.test_every_steps >= test_flag:
@@ -112,7 +111,7 @@ def main():
 evaluate_reward = run_evaluate_episode(env, agent)
 logger.info('Steps {}, Evaluate reward: {}'.format(
 total_steps, evaluate_reward))
-tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
+summary.add_scalar('eval/episode_reward', evaluate_reward,
 total_steps)
......
-Tensorboard
+summary
 ===============
 Visualize the results with tensorboard.
@@ -8,7 +8,7 @@ add_scalar
 Common used arguments:
-* tensorboard.add_scalar(tag, scalar_value, global_step=None)
+* summary.add_scalar(tag, scalar_value, global_step=None)
 * tag *(string)* – Data identifier
 * scalar_value *(float or string/blobname)* – Value to save
 * global_step *(int)* – Global step value to record
@@ -17,11 +17,11 @@ Example:
 .. code-block:: python
-from parl.utils import tensorboard
+from parl.utils import summary
 x = range(100)
 for i in x:
-tensorboard.add_scalar('y=2x', i * 2, i)
+summary.add_scalar('y=2x', i * 2, i)
 Expected result:
@@ -33,7 +33,7 @@ add_histogram
 Common used arguments:
-* tensorboard.add_scalar(tag, scalar_value, global_step=None)
+* summary.add_scalar(tag, scalar_value, global_step=None)
 * tag *(string)* – Data identifier
 * values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
 * global_step *(int)* – Global step value to record
@@ -42,12 +42,12 @@ Example:
 .. code-block:: python
-from parl.utils import tensorboard
+from parl.utils import summary
 import numpy as np
 for i in range(10):
 x = np.random.random(1000)
-tensorboard.add_histogram('distribution centers', x + i, i)
+summary.add_histogram('distribution centers', x + i, i)
 Expected result:
......
@@ -25,7 +25,7 @@ from atari_agent import AtariAgent
 from collections import defaultdict
 from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
 from parl.utils.scheduler import PiecewiseScheduler
 from parl.utils.time_stat import TimeStat
 from parl.utils.window_stat import WindowStat
@@ -186,7 +186,7 @@ class Learner(object):
 min_episode_steps = np.min(np.array(episode_steps).flatten())
 metric = {
-'Sample steps': self.sample_total_steps,
+'sample_steps': self.sample_total_steps,
 'max_episode_rewards': max_episode_rewards,
 'mean_episode_rewards': mean_episode_rewards,
 'min_episode_rewards': min_episode_rewards,
@@ -205,7 +205,7 @@ class Learner(object):
 for key, value in metric.items():
 if value is not None:
-tensorboard.add_scalar(key, value, self.sample_total_steps)
+summary.add_scalar(key, value, self.sample_total_steps)
 logger.info(metric)
......
@@ -22,7 +22,7 @@ from atari_agent import AtariAgent
 from atari_model import AtariModel
 from datetime import datetime
 from replay_memory import ReplayMemory, Experience
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
 from tqdm import tqdm
 from utils import get_player
@@ -120,11 +120,9 @@ def main():
 total_reward, steps, loss = run_train_episode(env, agent, rpm)
 total_steps += steps
 pbar.set_description('[train]exploration:{}'.format(agent.exploration))
-tensorboard.add_scalar('dqn/score', total_reward, total_steps)
-tensorboard.add_scalar('dqn/loss', loss,
-total_steps)  # mean of total loss
-tensorboard.add_scalar('dqn/exploration', agent.exploration,
-total_steps)
+summary.add_scalar('dqn/score', total_reward, total_steps)
+summary.add_scalar('dqn/loss', loss, total_steps)  # mean of total loss
+summary.add_scalar('dqn/exploration', agent.exploration, total_steps)
 pbar.update(steps)
 if total_steps // args.test_every_steps >= test_flag:
@@ -139,7 +137,7 @@ def main():
 "eval_agent done, (steps, eval_reward): ({}, {})".format(
 total_steps, np.mean(eval_rewards)))
 eval_test = np.mean(eval_rewards)
-tensorboard.add_scalar('dqn/eval', eval_test, total_steps)
+summary.add_scalar('dqn/eval', eval_test, total_steps)
 pbar.close()
......
@@ -34,7 +34,7 @@ Then we can start the distributed training by running:
 python train.py
 ```
-Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data.
+Training result will be saved in `train_log` with training curve.
 ### Reference
 + [Ray](https://github.com/ray-project/ray)
......
@@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter
 from mujoco_agent import MujocoAgent
 from mujoco_model import MujocoModel
 from noise import SharedNoiseTable
-from parl.utils import logger, tensorboard
+from parl.utils import logger, summary
 from parl.utils.window_stat import WindowStat
 from six.moves import queue
 from actor import Actor
@@ -202,7 +202,7 @@ class Learner(object):
 logger.info(metrics)
 for k, v in metrics.items():
 if v is not None:
-tensorboard.add_scalar(k, v, self.sample_total_steps)
+summary.add_scalar(k, v, self.sample_total_steps)
 if __name__ == '__main__':
......
@@ -24,7 +24,7 @@ from atari_model import AtariModel
 from atari_agent import AtariAgent
 from collections import defaultdict
 from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
 from parl.utils.scheduler import PiecewiseScheduler
 from parl.utils.time_stat import TimeStat
 from parl.utils.window_stat import WindowStat
@@ -313,7 +313,7 @@ class Learner(object):
 for key, value in metric.items():
 if value is not None:
-tensorboard.add_scalar(key, value, self.sample_total_steps)
+summary.add_scalar(key, value, self.sample_total_steps)
 logger.info(metric)
......
@@ -22,7 +22,7 @@ import parl
 from atari_model import AtariModel
 from atari_agent import AtariAgent
 from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, tensorboard, get_gpu_count
+from parl.utils import logger, summary, get_gpu_count
 from parl.utils.scheduler import PiecewiseScheduler
 from parl.utils.time_stat import TimeStat
 from parl.utils.window_stat import WindowStat
@@ -221,7 +221,7 @@ class Learner(object):
 min_episode_steps = np.min(np.array(episode_steps).flatten())
 metric = {
-'Sample steps': self.sample_total_steps,
+'sample_steps': self.sample_total_steps,
 'max_episode_rewards': max_episode_rewards,
 'mean_episode_rewards': mean_episode_rewards,
 'min_episode_rewards': min_episode_rewards,
@@ -244,7 +244,7 @@ class Learner(object):
 for key, value in metric.items():
 if value is not None:
-tensorboard.add_scalar(key, value, self.sample_total_steps)
+summary.add_scalar(key, value, self.sample_total_steps)
 logger.info(metric)
......
# LiftSim Baseline
## Introduction
An implementation of the A2C algorithm with the PARL library, applied to [LiftSim][liftsim], the elevator-dispatching simulation environment in the [RLSchool][rlschool] library.
## Dependencies
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle)
+ [parl>=1.2.3](https://github.com/PaddlePaddle/PARL)
+ [rlschool>=0.1.1][rlschool]
## Distributed Training
First, start a local cluster with 5 CPUs:
```bash
xparl start --port 8010 --cpu_num 5
```
> Note: if you have already started a cluster, there is no need to run the command above again. For more information about PARL clusters, please refer to the [documentation](https://parl.readthedocs.io/en/latest/parallel_training/setup.html).
Then we can start distributed training by running:
```bash
python train.py
```
## Evaluation
A saved model can be evaluated with the following command:
```bash
python evaluate.py --model_path saved_models/[FILENAME]
```
Tensorboard and log files are saved in `./train_log/train/`; the tensorboard visualization can be viewed by running `tensorboard --logdir .` in that directory.
## Convergence
After about 30 hours of training, the evaluation metric reaches roughly -120 (the reward of running the LiftSim environment for 1 day).
<img src="performance.png"/>
## Visualization
<img src="effect.gif" width="400"/>
[rlschool]: https://github.com/PaddlePaddle/RLSchool
[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
config = {
#========== remote config ==========
'master_address': 'localhost:8010',
#========== actor config ==========
'actor_num': 5,
'env_num': 5,
'sample_batch_steps': 5,
#========== learner config ==========
'max_sample_steps': int(1e10),
'gamma': 0.998,
'lambda': 1.0, # GAE
# start learning rate
'start_lr': 1.0e-4,
# coefficient of policy entropy adjustment schedule: (train_step, coefficient)
'entropy_coeff_scheduler': [(0, -2.0e-4)],
'vf_loss_coeff': 0.5,
'get_remote_metrics_interval': 100,
'log_metrics_interval_s': 60,
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
from collections import defaultdict
from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper, MetricsWrapper
from parl.utils.rl_utils import calc_gae
from parl.env.vector_env import VectorEnv
from rlschool import LiftSim
from copy import deepcopy
from lift_model import LiftModel
from lift_agent import LiftAgent
@parl.remote_class
class Actor(object):
def __init__(self, config):
self.config = config
self.env_num = config['env_num']
self.envs = []
for _ in range(self.env_num):
env = LiftSim()
env = RewardWrapper(env)
env = ActionProcessWrapper(env)
env = ObsProcessWrapper(env)
env = MetricsWrapper(env)
self.envs.append(env)
self.vector_env = VectorEnv(self.envs)
# number of elevators
self.ele_num = self.envs[0].mansion_attr.ElevatorNumber
act_dim = self.envs[0].act_dim
self.obs_dim = self.envs[0].obs_dim
self.config['obs_dim'] = self.obs_dim
# nested list of shape (env_num, ele_num, obs_dim)
self.obs_batch = self.vector_env.reset()
# (env_num * ele_num, obs_dim)
self.obs_batch = np.array(self.obs_batch).reshape(
[self.env_num * self.ele_num, self.obs_dim])
model = LiftModel(act_dim)
algorithm = parl.algorithms.A3C(
model, vf_loss_coeff=config['vf_loss_coeff'])
self.agent = LiftAgent(algorithm, config)
def sample(self):
sample_data = defaultdict(list)
env_sample_data = {}
# treat each elevator in Liftsim as an independent env
for env_id in range(self.env_num * self.ele_num):
env_sample_data[env_id] = defaultdict(list)
for i in range(self.config['sample_batch_steps']):
actions_batch, values_batch = self.agent.sample(self.obs_batch)
vector_actions = np.array_split(actions_batch, self.env_num)
assert len(vector_actions[-1]) == self.ele_num
next_obs_batch, reward_batch, done_batch, info_batch = \
self.vector_env.step(vector_actions)
# (env_num, ele_num, obs_dim) -> (env_num * ele_num, obs_dim)
next_obs_batch = np.array(next_obs_batch).reshape(
[self.env_num * self.ele_num, self.obs_dim])
# repeat reward and done to ele_num times
# (env_num) -> (env_num, ele_num) -> (env_num * ele_num)
reward_batch = np.repeat(reward_batch, self.ele_num)
done_batch = np.repeat(done_batch, self.ele_num)
for env_id in range(self.env_num * self.ele_num):
env_sample_data[env_id]['obs'].append(self.obs_batch[env_id])
env_sample_data[env_id]['actions'].append(
actions_batch[env_id])
env_sample_data[env_id]['rewards'].append(reward_batch[env_id])
env_sample_data[env_id]['dones'].append(done_batch[env_id])
env_sample_data[env_id]['values'].append(values_batch[env_id])
# Calculate advantages when the episode is done or reaches max sample steps.
if done_batch[env_id] or i + 1 == self.config[
'sample_batch_steps']: # reach max sample steps
next_value = 0
if not done_batch[env_id]:
next_obs = np.expand_dims(next_obs_batch[env_id], 0)
next_value = self.agent.value(next_obs)
values = env_sample_data[env_id]['values']
rewards = env_sample_data[env_id]['rewards']
advantages = calc_gae(rewards, values, next_value,
self.config['gamma'],
self.config['lambda'])
target_values = advantages + values
sample_data['obs'].extend(env_sample_data[env_id]['obs'])
sample_data['actions'].extend(
env_sample_data[env_id]['actions'])
sample_data['advantages'].extend(advantages)
sample_data['target_values'].extend(target_values)
env_sample_data[env_id] = defaultdict(list)
self.obs_batch = deepcopy(next_obs_batch)
# size of sample_data[key]: env_num * ele_num * sample_batch_steps
for key in sample_data:
sample_data[key] = np.stack(sample_data[key])
return sample_data
def get_metrics(self):
metrics = defaultdict(list)
for metrics_env in self.envs:
assert isinstance(
metrics_env,
MetricsWrapper), "Put the MetricsWrapper in the last wrapper"
for env_reward_1h, env_reward_24h in metrics_env.next_episode_results(
):
metrics['env_reward_1h'].append(env_reward_1h)
metrics['env_reward_24h'].append(env_reward_24h)
return metrics
def set_weights(self, params):
self.agent.set_weights(params)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from copy import deepcopy
import numpy as np
from utils import discretize, linear_discretize
from rlschool import LiftSim
class BaseWrapper(object):
def __init__(self, env):
self.env = env
self._mansion = env._mansion
self.mansion_attr = self._mansion.attribute
@property
def obs_dim(self):
if hasattr(self.env, 'obs_dim'):
return self.env.obs_dim
else:
return None
@property
def act_dim(self):
if hasattr(self.env, 'act_dim'):
return self.env.act_dim
else:
return None
def seed(self, seed=None):
return self.env.seed(seed)
def step(self, action):
return self.env.step(action)
def reset(self):
return self.env.reset()
def render(self):
return self.env.render()
def close(self):
return self.env.close()
class ObsProcessWrapper(BaseWrapper):
"""Extract features of each elevator in LiftSim env
"""
def __init__(self, env, hour_distize_num=6):
super(ObsProcessWrapper, self).__init__(env)
self.hour_distize_num = hour_distize_num
self.total_steps = 0
@property
def obs_dim(self):
"""
NOTE:
Keep obs_dim consistent with the return size of function `_mansion_state_process`
"""
ele_dim = self.mansion_attr.NumberOfFloor * 3 + 34
obs_dim = (ele_dim + 1) * self.mansion_attr.ElevatorNumber + \
self.mansion_attr.NumberOfFloor * 2
obs_dim += self.hour_distize_num
return obs_dim
def reset(self):
"""
Returns:
obs(list): [[self.obs_dim]] * mansion_attr.ElevatorNumber, features array of all elevators
"""
obs = self.env.reset()
self.total_steps = 0
obs = self._mansion_state_process(obs)
return obs
def step(self, action):
"""
Returns:
obs(list): nested list, shape of [mansion_attr.ElevatorNumber, self.obs_dim],
features array of all elevators
reward(int): returned by self.env
done(bool): returned by self.env
info(dict): returned by self.env
"""
obs, reward, done, info = self.env.step(action)
self.total_steps += 1
obs = self._mansion_state_process(obs)
return obs, reward, done, info
def _mansion_state_process(self, mansion_state):
"""Extract features of env
"""
ele_features = list()
for ele_state in mansion_state.ElevatorStates:
ele_features.append(self._ele_state_process(ele_state))
max_floor = ele_state.MaximumFloor
target_floor_binaries_up = [0.0 for i in range(max_floor)]
target_floor_binaries_down = [0.0 for i in range(max_floor)]
for floor in mansion_state.RequiringUpwardFloors:
target_floor_binaries_up[floor - 1] = 1.0
for floor in mansion_state.RequiringDownwardFloors:
target_floor_binaries_down[floor - 1] = 1.0
target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down
raw_time = self.total_steps * 0.5 # timestep seconds
time_id = int(raw_time % 86400)
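# bucket the second-of-day into one of hour_distize_num equal time-of-day intervals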
time_id = time_id // (24 / self.hour_distize_num * 3600)
time_id_vec = discretize(time_id + 1, self.hour_distize_num, 1,
self.hour_distize_num)
man_features = list()
for idx in range(len(mansion_state.ElevatorStates)):
elevator_id_vec = discretize(idx + 1,
len(mansion_state.ElevatorStates), 1,
len(mansion_state.ElevatorStates))
idx_array = list(range(len(mansion_state.ElevatorStates)))
idx_array.remove(idx)
man_features.append(ele_features[idx])
for left_idx in idx_array:
man_features[idx] = man_features[idx] + ele_features[left_idx]
man_features[idx] = man_features[idx] + \
elevator_id_vec + target_floor_binaries
man_features[idx] = man_features[idx] + time_id_vec
return np.asarray(man_features, dtype='float32')
def _ele_state_process(self, ele_state):
"""Extract features of elevator
"""
ele_feature = []
# add floor information
ele_feature.extend(
linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0,
ele_state.MaximumFloor))
# add velocity information
ele_feature.extend(
linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed,
ele_state.MaximumSpeed))
# add door information
ele_feature.append(ele_state.DoorState)
ele_feature.append(float(ele_state.DoorIsOpening))
ele_feature.append(float(ele_state.DoorIsClosing))
# add direction information
ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1))
# add load weight information
ele_feature.extend(
linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5,
0.0, 1.0))
# add other information
target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)]
for target_floor in ele_state.ReservedTargetFloors:
target_floor_binaries[target_floor - 1] = 1.0
ele_feature.extend(target_floor_binaries)
dispatch_floor_binaries = [
0.0 for i in range(ele_state.MaximumFloor + 1)
]
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0
ele_feature.extend(dispatch_floor_binaries)
ele_feature.append(ele_state.DispatchTargetDirection)
return ele_feature
class ActionProcessWrapper(BaseWrapper):
def __init__(self, env):
"""Map action id predicted by model to action of LiftSim
"""
super(ActionProcessWrapper, self).__init__(env)
@property
def act_dim(self):
"""
NOTE:
keep act_dim in line with function `_action_idx_to_action`
Returns:
int: NumberOfFloor * (2 directions) + (-1 DispatchTarget) + (0 DispatchTarget)
"""
return self.mansion_attr.NumberOfFloor * 2 + 2
def step(self, action):
"""
Args:
action(list): action_id of all elevators (length = mansion_attr.ElevatorNumber)
"""
ele_actions = []
for action_id in action:
ele_actions.extend(self._action_idx_to_action(action_id))
# ele_action: list, formatted action for LiftSim env (length = 2 * mansion_attr.ElevatorNumber)
return self.env.step(ele_actions)
def _action_idx_to_action(self, action_idx):
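# Mapping implemented below (N = NumberOfFloor, so act_dim = 2N + 2):
#   idx in [0, N-1]   -> (idx + 1, +1)      dispatch to floor idx+1, upward
#   idx in [N, 2N-1]  -> (idx - N + 1, -1)  dispatch to floor idx-N+1, downward
#   idx == 2N         -> (0, 1)             DispatchTarget 0
#   idx == 2N+1       -> (-1, 1)            DispatchTarget -1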
action_idx = int(action_idx)
realdim = self.act_dim - 2
if (action_idx == realdim):
return (0, 1) # mapped to DispatchTarget=0
elif (action_idx == realdim + 1):
return (-1, 1) # mapped to DispatchTarget=-1
action = action_idx
if (action_idx < realdim / 2):
direction = 1 # up direction
action += 1
else:
direction = -1 # down direction
action -= int(realdim / 2)
action += 1
return (action, direction)
class RewardWrapper(BaseWrapper):
def __init__(self, env):
"""Design reward of LiftSim env.
"""
super(RewardWrapper, self).__init__(env)
self.ele_num = self.mansion_attr.ElevatorNumber
def step(self, action):
"""Here we return same reward for each elevator,
you alos can design different rewards of each elevator.
Returns:
obs: returned by self.env
reward: shaping reward
done: returned by self.env
info: returned by self.env
"""
obs, origin_reward, done, info = self.env.step(action)
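# shaped reward: weighted penalty on waiting time, energy consumption and
# abandoned passengers, scaled by 1e-3 and shared equally among elevators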
reward = -(30 * info['time_consume'] + 0.01 * info['energy_consume'] +
100 * info['given_up_persons']) * 1.0e-3 / self.ele_num
info['origin_reward'] = origin_reward
return obs, reward, done, info
class MetricsWrapper(BaseWrapper):
def __init__(self, env):
super(MetricsWrapper, self).__init__(env)
self._total_steps = 0
self._env_reward_1h = 0
self._env_reward_24h = 0
self._num_returned = 0
self._episode_result = []
def reset(self):
self._total_steps = 0
self._env_reward_1h = 0
self._env_reward_24h = 0
return self.env.reset()
def step(self, action):
obs, reward, done, info = self.env.step(action)
self._total_steps += 1
self._env_reward_1h += info['origin_reward']
self._env_reward_24h += info['origin_reward']
# Treat 1h in LiftSim env as an episode (1step = 0.5s)
if self._total_steps % (3600 * 2) == 0: # 1h
episode_env_reward_1h = self._env_reward_1h
self._env_reward_1h = 0
episode_env_reward_24h = None
if self._total_steps % (24 * 3600 * 2) == 0: # 24h
episode_env_reward_24h = self._env_reward_24h
self._env_reward_24h = 0
self._episode_result.append(
[episode_env_reward_1h, episode_env_reward_24h])
return obs, reward, done, info
def next_episode_results(self):
for i in range(self._num_returned, len(self._episode_result)):
yield self._episode_result[i]
self._num_returned = len(self._episode_result)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
from parl.utils import logger
from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper
from rlschool import LiftSim
from lift_model import LiftModel
from lift_agent import LiftAgent
from a2c_config import config
def evaluate_one_day(model_path):
env = LiftSim()
env = ActionProcessWrapper(env)
env = ObsProcessWrapper(env)
act_dim = env.act_dim
obs_dim = env.obs_dim
config['obs_dim'] = obs_dim
model = LiftModel(act_dim)
algorithm = parl.algorithms.A3C(
model, vf_loss_coeff=config['vf_loss_coeff'])
agent = LiftAgent(algorithm, config)
agent.restore(model_path)
reward_24h = 0
obs = env.reset()
for i in range(24 * 3600 * 2): # 24h, 1step = 0.5s
action, _ = agent.sample(obs)
#print(action)
obs, reward, done, info = env.step(action)
reward_24h += reward
if (i + 1) % (3600 * 2) == 0:
logger.info('hour {}, total_reward: {}'.format(
(i + 1) // (3600 * 2), reward_24h))
logger.info('model_path: {}, 24h reward: {}'.format(
model_path, reward_24h))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'--model_path', type=str, help='path of the model to evaluate.')
args = parser.parse_args()
evaluate_one_day(args.model_path)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import paddle.fluid as fluid
import numpy as np
from parl import layers
from parl.utils.scheduler import PiecewiseScheduler, LinearDecayScheduler
class LiftAgent(parl.Agent):
def __init__(self, algorithm, config):
"""
Args:
algorithm (`parl.Algorithm`): algorithm to be used in this agent.
config (dict): config describing the training hyper-parameters(see a2c_config.py)
"""
self.obs_dim = config['obs_dim']
super(LiftAgent, self).__init__(algorithm)
self.lr_scheduler = LinearDecayScheduler(config['start_lr'],
config['max_sample_steps'])
self.entropy_coeff_scheduler = PiecewiseScheduler(
config['entropy_coeff_scheduler'])
def build_program(self):
self.sample_program = fluid.Program()
self.predict_program = fluid.Program()
self.value_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.sample_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
sample_actions, values = self.alg.sample(obs)
self.sample_outputs = [sample_actions, values]
with fluid.program_guard(self.predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
self.predict_actions = self.alg.predict(obs)
with fluid.program_guard(self.value_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
self.values = self.alg.value(obs)
with fluid.program_guard(self.learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
actions = layers.data(name='actions', shape=[], dtype='int32')
advantages = layers.data(
name='advantages', shape=[], dtype='float32')
target_values = layers.data(
name='target_values', shape=[], dtype='float32')
lr = layers.data(
name='lr', shape=[1], dtype='float32', append_batch_size=False)
entropy_coeff = layers.data(
name='entropy_coeff',
shape=[1],
dtype='float32',
append_batch_size=False)
total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
obs, actions, advantages, target_values, lr, entropy_coeff)
self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
self.learn_program = parl.compile(self.learn_program, total_loss)
def sample(self, obs_np):
"""
Args:
obs_np: a numpy float32 array of shape (B, obs_dim).
Returns:
sample_ids: a numpy int64 array of shape [B]
values: a numpy float32 array of shape [B]
"""
obs_np = obs_np.astype('float32')
sample_actions, values = self.fluid_executor.run(
self.sample_program,
feed={'obs': obs_np},
fetch_list=self.sample_outputs)
return sample_actions, values
def predict(self, obs_np):
"""
Args:
obs_np: a numpy float32 array of shape (B, obs_dim).
Returns:
predict_actions: a numpy int64 array of shape [B]
"""
obs_np = obs_np.astype('float32')
predict_actions = self.fluid_executor.run(
self.predict_program,
feed={'obs': obs_np},
fetch_list=[self.predict_actions])[0]
return predict_actions
def value(self, obs_np):
"""
Args:
obs_np: a numpy float32 array of shape (B, obs_dim).
Returns:
values: a numpy float32 array of shape [B]
"""
obs_np = obs_np.astype('float32')
values = self.fluid_executor.run(
self.value_program, feed={'obs': obs_np},
fetch_list=[self.values])[0]
return values
def learn(self, obs_np, actions_np, advantages_np, target_values_np):
"""
Args:
obs_np: a numpy float32 array of shape (B, obs_dim).
actions_np: a numpy int64 array of shape [B]
advantages_np: a numpy float32 array of shape [B]
target_values_np: a numpy float32 array of shape [B]
"""
obs_np = obs_np.astype('float32')
actions_np = actions_np.astype('int64')
advantages_np = advantages_np.astype('float32')
target_values_np = target_values_np.astype('float32')
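# the learning rate decays linearly with the number of consumed samples,
# and the entropy coefficient follows the piecewise schedule given in the config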
lr = self.lr_scheduler.step(step_num=obs_np.shape[0])
entropy_coeff = self.entropy_coeff_scheduler.step()
total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run(
self.learn_program,
feed={
'obs': obs_np,
'actions': actions_np,
'advantages': advantages_np,
'target_values': target_values_np,
'lr': np.array([lr], dtype='float32'),
'entropy_coeff': np.array([entropy_coeff], dtype='float32')
},
fetch_list=self.learn_outputs)
return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import paddle.fluid as fluid
from parl import layers
class LiftModel(parl.Model):
def __init__(self, act_dim):
self.act_dim = act_dim
self.fc_1 = layers.fc(size=512, act='relu')
self.fc_2 = layers.fc(size=256, act='relu')
self.fc_3 = layers.fc(size=128, act='tanh')
self.value_fc = layers.fc(size=1)
self.policy_fc = layers.fc(size=act_dim)
def policy(self, obs):
"""
Args:
obs(float32 tensor): shape of (B * obs_dim)
Returns:
policy_logits(float32 tensor): shape of (B * act_dim)
"""
h_1 = self.fc_1(obs)
h_2 = self.fc_2(h_1)
h_3 = self.fc_3(h_2)
policy_logits = self.policy_fc(h_3)
return policy_logits
def value(self, obs):
"""
Args:
obs(float32 tensor): shape of (B * obs_dim)
Returns:
values(float32 tensor): shape of (B,)
"""
h_1 = self.fc_1(obs)
h_2 = self.fc_2(h_1)
h_3 = self.fc_3(h_2)
values = self.value_fc(h_3)
values = layers.squeeze(values, axes=[1])
return values
def policy_and_value(self, obs):
"""
Args:
obs(float32 tensor): shape (B * obs_dim)
Returns:
policy_logits(float32 tensor): shape of (B * act_dim)
values(float32 tensor): shape of (B,)
"""
h_1 = self.fc_1(obs)
h_2 = self.fc_2(h_1)
h_3 = self.fc_3(h_2)
policy_logits = self.policy_fc(h_3)
values = self.value_fc(h_3)
values = layers.squeeze(values, axes=[1])
return policy_logits, values
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import parl
import queue
import six
import time
import threading
from actor import Actor
from collections import defaultdict
from env_wrapper import ObsProcessWrapper, ActionProcessWrapper
from parl.utils import logger, get_gpu_count, tensorboard, machine_info
from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat
from rlschool import LiftSim
from lift_model import LiftModel
from lift_agent import LiftAgent
class Learner(object):
def __init__(self, config):
self.config = config
#=========== Create Agent ==========
env = LiftSim()
env = ActionProcessWrapper(env)
env = ObsProcessWrapper(env)
obs_dim = env.obs_dim
act_dim = env.act_dim
self.config['obs_dim'] = obs_dim
model = LiftModel(act_dim)
algorithm = parl.algorithms.A3C(
model, vf_loss_coeff=config['vf_loss_coeff'])
self.agent = LiftAgent(algorithm, config)
if machine_info.is_gpu_available():
assert get_gpu_count() == 1, 'Only support training in single GPU,\
Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .'
#========== Learner ==========
self.entropy_stat = WindowStat(100)
self.target_values = None
self.learn_time_stat = TimeStat(100)
self.start_time = None
#========== Remote Actor ===========
self.remote_count = 0
self.sample_data_queue = queue.Queue()
self.remote_metrics_queue = queue.Queue()
self.sample_total_steps = 0
self.params_queues = []
self.create_actors()
self.log_steps = 0
def create_actors(self):
""" Connect to the cluster and start sampling of the remote actor.
"""
parl.connect(self.config['master_address'])
logger.info('Waiting for {} remote actors to connect.'.format(
self.config['actor_num']))
for i in six.moves.range(self.config['actor_num']):
params_queue = queue.Queue()
self.params_queues.append(params_queue)
self.remote_count += 1
logger.info('Remote actor count: {}'.format(self.remote_count))
remote_thread = threading.Thread(
target=self.run_remote_sample, args=(params_queue, ))
remote_thread.setDaemon(True)
remote_thread.start()
self.start_time = time.time()
def run_remote_sample(self, params_queue):
""" Sample data from remote actor and update parameters of remote actor.
"""
remote_actor = Actor(self.config)
cnt = 0
while True:
latest_params = params_queue.get()
remote_actor.set_weights(latest_params)
batch = remote_actor.sample()
self.sample_data_queue.put(batch)
cnt += 1
if cnt % self.config['get_remote_metrics_interval'] == 0:
metrics = remote_actor.get_metrics()
if metrics:
self.remote_metrics_queue.put(metrics)
def step(self):
"""
1. kick off all actors to synchronize parameters and sample data;
2. collect sample data of all actors;
3. update parameters.
"""
latest_params = self.agent.get_weights()
for params_queue in self.params_queues:
params_queue.put(latest_params)
train_batch = defaultdict(list)
for i in range(self.config['actor_num']):
sample_data = self.sample_data_queue.get()
for key, value in sample_data.items():
train_batch[key].append(value)
self.sample_total_steps += sample_data['obs'].shape[0]
for key, value in train_batch.items():
train_batch[key] = np.concatenate(value)
with self.learn_time_stat:
total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff = self.agent.learn(
obs_np=train_batch['obs'],
actions_np=train_batch['actions'],
advantages_np=train_batch['advantages'],
target_values_np=train_batch['target_values'])
self.entropy_stat.add(entropy)
self.target_values = np.mean(train_batch['target_values'])
tensorboard.add_scalar('model/entropy', entropy,
self.sample_total_steps)
tensorboard.add_scalar('model/q_value', self.target_values,
self.sample_total_steps)
def log_metrics(self):
""" Log metrics of learner and actors
"""
if self.start_time is None:
return
metrics = []
while True:
try:
metric = self.remote_metrics_queue.get_nowait()
metrics.append(metric)
except queue.Empty:
break
env_reward_1h, env_reward_24h = [], []
for x in metrics:
env_reward_1h.extend(x['env_reward_1h'])
env_reward_24h.extend(x['env_reward_24h'])
env_reward_1h = [x for x in env_reward_1h if x is not None]
env_reward_24h = [x for x in env_reward_24h if x is not None]
mean_reward_1h, mean_reward_24h = None, None
if env_reward_1h:
mean_reward_1h = np.mean(np.array(env_reward_1h).flatten())
tensorboard.add_scalar('performance/env_rewards_1h',
mean_reward_1h, self.sample_total_steps)
if env_reward_24h:
mean_reward_24h = np.mean(np.array(env_reward_24h).flatten())
tensorboard.add_scalar('performance/env_rewards_24h',
mean_reward_24h, self.sample_total_steps)
metric = {
'Sample steps': self.sample_total_steps,
'env_reward_1h': mean_reward_1h,
'env_reward_24h': mean_reward_24h,
'target_values': self.target_values,
'entropy': self.entropy_stat.mean,
'learn_time_s': self.learn_time_stat.mean,
'elapsed_time_s': int(time.time() - self.start_time),
}
logger.info(metric)
self.log_steps += 1
save_interval_step = 7200 // max(1,
self.config['log_metrics_interval_s'])
if self.log_steps % save_interval_step == 0:
self.save_model() # save model every 2h
def should_stop(self):
return self.sample_total_steps >= self.config['max_sample_steps']
def save_model(self):
time_str = time.strftime(".%Y%m%d_%H%M%S", time.localtime())
self.agent.save(os.path.join('saved_models', 'model.ckpt' + time_str))
if __name__ == '__main__':
from a2c_config import config
learner = Learner(config)
assert config['log_metrics_interval_s'] > 0
while not learner.should_stop():
start = time.time()
while time.time() - start < config['log_metrics_interval_s']:
learner.step()
learner.log_metrics()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def discretize(value, n_dim, min_val, max_val):
'''
discretize a value into a vector of n_dim dimension 1-hot representation
with the value below min_val being [1, 0, 0, ..., 0]
and the value above max_val being [0, 0, ..., 0, 1]
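e.g. discretize(2.4, 5, 0, 4) returns [0, 0, 1.0, 0, 0]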
'''
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 1, active_pos)
active_pos = max(0, active_pos)
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0
return ret_array
def linear_discretize(value, n_dim, min_val, max_val):
'''
discretize a value into a vector of n_dim dimensional representation
with the value below min_val being [1, 0, 0, ..., 0]
and the value above max_val being [0, 0, ..., 0, 1]
e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0
if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8]
'''
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 2, active_pos)
active_pos = max(0, active_pos)
anchor_pt = active_pos * delta + min_val
if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta):
anchor_pt -= delta
active_pos -= 1
weight = (value - anchor_pt) / delta
weight = min(1.0, max(0.0, weight))
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0 - weight
ret_array[active_pos + 1] = weight
return ret_array
# LiftSim Baseline
## Introduction
An implementation of the Deep Q-Network algorithm with the PARL library, applied to [LiftSim][liftsim], the elevator-dispatching simulation environment in the [RLSchool][rlschool] library.
## Dependencies
+ [paddlepaddle==1.5.1](https://github.com/PaddlePaddle/Paddle)
+ [parl==1.1.2](https://github.com/PaddlePaddle/PARL)
+ [rlschool>=0.0.1][rlschool]
## Run
```python
python demo.py
```
## Benchmark
<img src="rl_10.png" width="400"/>
Accumulated Reward: the sum of rewards over every 3600 steps, which reflects the efficiency of elevator dispatching per unit of time (0.5 hours in the simulated environment).
[rlschool]: https://github.com/PaddlePaddle/RLSchool
[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import numpy.random as random
import paddle.fluid as fluid
from parl import layers
from parl import Agent
from parl.utils import get_gpu_count, machine_info
class ElevatorAgent(Agent):
def __init__(self, algorithm, obs_dim, action_dim):
self._action_dim = action_dim
self._obs_dim = obs_dim
self._update_target_steps = 1000
self._global_step = 0
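# epsilon-greedy exploration: the ratio decays slightly on every sample() call
# until it reaches exploration_min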
self.exploration_ratio = 0.9
self.exploration_decre = 1e-7
self.exploration_min = 0.1
super(ElevatorAgent, self).__init__(algorithm)
use_cuda = machine_info.is_gpu_available()
if self.gpu_id >= 0:
assert get_gpu_count() == 1, 'Only support training in single GPU,\
Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .'
else:
os.environ['CPU_NUM'] = str(1)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10
build_strategy = fluid.BuildStrategy()
build_strategy.remove_unnecessary_lock = False
self.learn_pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.learn_program,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
)
def build_program(self):
self.pred_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.pred_program):
obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32')
self._value = self.alg.define_predict(obs)
with fluid.program_guard(self.learn_program):
obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32')
action = layers.data(name='act', shape=[1], dtype='int32')
reward = layers.data(name='reward', shape=[], dtype='float32')
next_obs = layers.data(
name='next_obs', shape=[self._obs_dim], dtype='float32')
terminal = layers.data(name='terminal', shape=[], dtype='bool')
self._cost = self.alg.define_learn(obs, action, reward, next_obs,
terminal)
def sample(self, obs):
if self.exploration_ratio > self.exploration_min:
self.exploration_ratio -= self.exploration_decre
q_values = self.predict(obs)
ret_actions = list()
for i in range(len(q_values)): # number of elevators
if (random.random() < self.exploration_ratio):
action = random.randint(0, self._action_dim)
else:
action = np.argmax(q_values[i])
ret_actions.append(int(action))
return ret_actions
def predict(self, obs):
pred_Q = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self._value])
return pred_Q[0]
def learn(self, obs, act, reward, next_obs, terminal):
self._global_step += 1
if self._global_step % self._update_target_steps == 0:
self.alg.sync_target(self.gpu_id)
feed = {
'obs': obs.astype('float32'),
'act': act.astype('int32'),
'reward': reward,
'next_obs': next_obs.astype('float32'),
'terminal': terminal
}
cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0]
return cost
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import parl
import numpy as np
import numpy.random as random
from copy import deepcopy
from collections import deque
from rlschool import EPSILON, HUGE
from rl_benchmark.model import RLDispatcherModel
from rl_benchmark.agent import ElevatorAgent
from parl.algorithms import DQN
from parl.utils import ReplayMemory
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
class RL_dispatcher():
"""
An RL benchmark for elevator system
"""
def __init__(self, env, max_episode):
self.env = env
self._obs_dim = env.observation_space
self._act_dim = env.action_space
self._global_step = 0
self.max_episode = max_episode
self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
self._model = RLDispatcherModel(self._act_dim)
hyperparas = {
'action_dim': self._act_dim,
'lr': 5.0e-4,
'gamma': 0.998
}
self._algorithm = DQN(self._model, hyperparas)
self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
self._act_dim)
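# learning starts only after the replay memory holds more than _warm_up_size transitions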
self._warm_up_size = 2000
self._statistic_freq = 1000
self._loss_queue = deque()
def run_episode(self):
self.env.reset()
acc_reward = 0.0
while self._global_step < self.max_episode:
# self.env.render()
state = self.env.state
action = self._agent.sample(state)
state_, reward, done, info = self.env.step(action)
output_info = self.learn_step(state, action, reward)
acc_reward += reward
if (isinstance(output_info, dict) and len(output_info) > 0):
self.env.log_notice("%s", output_info)
if (self._global_step % 3600 == 0):
self.env.log_notice(
"Accumulated Reward: %f, Mansion Status: %s", acc_reward,
self.env.statistics)
acc_reward = 0.0
self._agent.save('./model.ckpt')
def learn_step(self, state, action, r):
self._global_step += 1
if (self._global_step > self._warm_up_size):
for i in range(self.env.elevator_num):
self._rpm.append(self._last_observation_array[i],
self._last_action[i], self._last_reward,
deepcopy(state[i]), False)
self._last_observation_array = deepcopy(state)
self._last_action = deepcopy(action)
self._last_reward = r
ret_dict = {}
if self._rpm.size() > self._warm_up_size:
batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
self._rpm.sample_batch(BATCH_SIZE)
cost = self._agent.learn(batch_obs, batch_action, batch_reward,
batch_next_obs, batch_terminal)
self._loss_queue.appendleft(cost)
if (len(self._loss_queue) > self._statistic_freq):
self._loss_queue.pop()
if (self._global_step % self._statistic_freq == 0):
ret_dict["Temporal Difference Error(Average)"] = \
float(sum(self._loss_queue)) / float(len(self._loss_queue))
return ret_dict
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid as fluid
from parl import layers
import numpy as np
import parl
class RLDispatcherModel(parl.Model):
def __init__(self, act_dim):
self._act_dim = act_dim
self._fc_1 = layers.fc(size=512, act='relu')
self._fc_2 = layers.fc(size=256, act='relu')
self._fc_3 = layers.fc(size=128, act='tanh')
self._output = layers.fc(size=act_dim)
def value(self, obs):
_h_1 = self._fc_1(obs)
_h_2 = self._fc_2(_h_1)
_h_3 = self._fc_3(_h_2)
self._pred = self._output(_h_3)
return self._pred
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# wrapper part modified from
# https://github.com/openai/gym/blob/master/gym/core.py
from rlschool import LiftSim
from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing
from wrapper_utils import action_idx_to_action
class Wrapper(LiftSim):
def __init__(self, env):
self.env = env
self._mansion = env._mansion
self.mansion_attr = self._mansion.attribute
self.elevator_num = self.mansion_attr.ElevatorNumber
self.observation_space = obs_dim(self.mansion_attr)
self.action_space = act_dim(self.mansion_attr)
self.viewer = env.viewer
def __getattr__(self, name):
if name.startswith('_'):
raise AttributeError(
"attempted to get missing private attribute '{}'".format(name))
return getattr(self.env, name)
def seed(self, seed=None):
return self.env.seed(seed)
def step(self, action):
return self.env.step(action)
def reset(self):
return self.env.reset()
def render(self):
return self.env.render()
def close(self):
return self.env.close()
class RewardWrapper(Wrapper):
pass
class ActionWrapper(Wrapper):
def reset(self):
return self.env.reset()
def step(self, action):
act = []
for a in action:
act.extend(self.action(a, self.action_space))
return self.env.step(act)
def action(self, action, action_space):
return action_idx_to_action(action, action_space)
class ObservationWrapper(Wrapper):
def reset(self):
self.env.reset()
return self.observation(self._mansion.state)
def step(self, action):
observation, reward, done, info = self.env.step(action)
return (self.observation(observation), reward, done, info)
def observation(self, observation):
return mansion_state_preprocessing(observation)
@property
def state(self):
return self.observation(self._mansion.state)
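# A minimal composition sketch (illustrative, not part of the original file),
# mirroring the demo entry point included in this commit: the raw LiftSim env
# is wrapped so that integer action indices expand to ElevatorAction pairs and
# the mansion state is flattened into a numpy feature array.
if __name__ == '__main__':
    env = ObservationWrapper(ActionWrapper(Wrapper(LiftSim())))
    obs = env.state  # numpy array with one feature row per elevator
    print(obs.shape)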
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import random
import numpy as np
from rlschool import ElevatorState, ElevatorAction
from rlschool import MansionAttribute, MansionState
from rlschool import EPSILON, HUGE
from rlschool import MansionConfig
from rlschool import MansionManager
def discretize(value, n_dim, min_val, max_val):
"""
discretize a value into a vector of n_dim dimension 1-hot representation
with the value below min_val being [1, 0, 0, ..., 0]
and the value above max_val being [0, 0, ..., 0, 1]
Args:
value: the value that needs to be discretized into 1-hot format
n_dim: number of dimensions
min_val: minimal value in the result
man_val: maximum value in the result
Returns:
the discretized vector
"""
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 1, active_pos)
active_pos = max(0, active_pos)
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0
return ret_array
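# Worked examples for discretize() (hand-checked against the implementation
# above); the 3-way case is exactly how the Direction feature is encoded below:
#   discretize(0, 3, -1, 1)  -> [0, 1.0, 0]
#   discretize(-5, 3, -1, 1) -> [1.0, 0, 0]  (values below min_val clip to the first bucket)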
def linear_discretize(value, n_dim, min_val, max_val):
"""
discretize a value into a vector of n_dim dimensional representation
with the value below min_val being [1, 0, 0, ..., 0]
and the value above max_val being [0, 0, ..., 0, 1]
e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0
if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8]
Args:
value: the value that needs to be discretized
n_dim: number of dimensions
min_val: minimal value in the result
man_val: maximum value in the result
Returns:
the discretized vector
"""
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 2, active_pos)
active_pos = max(0, active_pos)
anchor_pt = active_pos * delta + min_val
if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta):
anchor_pt -= delta
active_pos -= 1
weight = (value - anchor_pt) / delta
weight = min(1.0, max(0.0, weight))
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0 - weight
ret_array[active_pos + 1] = weight
return ret_array
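# Worked examples for linear_discretize(), matching the docstring:
#   linear_discretize(1.5, 2, 1.0, 2.0) -> [0.5, 0.5]
#   linear_discretize(1.8, 2, 1.0, 2.0) -> approximately [0.2, 0.8] (float rounding)
# The velocity feature below uses linear_discretize(Velocity, 21, -MaximumSpeed, MaximumSpeed).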
def ele_state_preprocessing(ele_state):
"""Process elevator state, make it usable for network
Args:
ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py
Returns:
ele_feature: list of elevator state
"""
ele_feature = []
# add floor information
ele_feature.extend(
linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0,
ele_state.MaximumFloor))
# add velocity information
ele_feature.extend(
linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed,
ele_state.MaximumSpeed))
# add door information
ele_feature.append(ele_state.DoorState)
ele_feature.append(float(ele_state.DoorIsOpening))
ele_feature.append(float(ele_state.DoorIsClosing))
# add direction information
ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1))
# add load weight information
ele_feature.extend(
linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0,
1.0))
# add other information
target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)]
for target_floor in ele_state.ReservedTargetFloors:
target_floor_binaries[target_floor - 1] = 1.0
ele_feature.extend(target_floor_binaries)
dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)]
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0
ele_feature.extend(dispatch_floor_binaries)
ele_feature.append(ele_state.DispatchTargetDirection)
return ele_feature
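# Feature length check (hand-derived from the code above): floor one-hot
# (MaximumFloor) + velocity (21) + door state/opening/closing (3) + direction
# one-hot (3) + load ratio (5) + reserved target floors (MaximumFloor)
# + dispatch target (MaximumFloor + 1) + dispatch direction (1)
# = 3 * MaximumFloor + 34, which matches ele_dim in obs_dim() below.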
def obs_dim(mansion_attr):
"""Calculate the observation dimension
Args:
mansion_attr: MansionAttribute, attribute of mansion_manager
Returns:
observation dimension
"""
assert isinstance(mansion_attr, MansionAttribute)
ele_dim = mansion_attr.NumberOfFloor * 3 + 34
obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \
mansion_attr.NumberOfFloor * 2
return obs_dim
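# This matches the length of each row returned by mansion_state_preprocessing()
# below: features of all ElevatorNumber elevators (ele_dim each) + a one-hot
# elevator id (ElevatorNumber) + upward/downward hall-call binaries
# (2 * NumberOfFloor) = (ele_dim + 1) * ElevatorNumber + NumberOfFloor * 2.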
def act_dim(mansion_attr):
"""Calculate the action dimension, which is number of floor times 2 plus 2.
The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0,
the original dispatch_target does not change if dispatch_target is -1. See implementation in
method action_idx_to_action below.
Args:
mansion_attr: MansionAttribute, attribute of mansion_manager
Returns:
action dimension
"""
assert isinstance(mansion_attr, MansionAttribute)
return mansion_attr.NumberOfFloor * 2 + 2
def mansion_state_preprocessing(mansion_state):
"""Process mansion_state to make it usable for networks, convert it into a numpy array
Args:
mansion_state: namedtuple of mansion state,
defined in rlschool/liftsim/environment/mansion/utils.py
Returns:
the converted numpy array
"""
ele_features = list()
for ele_state in mansion_state.ElevatorStates:
ele_features.append(ele_state_preprocessing(ele_state))
max_floor = ele_state.MaximumFloor
target_floor_binaries_up = [0.0 for i in range(max_floor)]
target_floor_binaries_down = [0.0 for i in range(max_floor)]
for floor in mansion_state.RequiringUpwardFloors:
target_floor_binaries_up[floor - 1] = 1.0
for floor in mansion_state.RequiringDownwardFloors:
target_floor_binaries_down[floor - 1] = 1.0
target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down
idx = 0
man_features = list()
for idx in range(len(mansion_state.ElevatorStates)):
elevator_id_vec = discretize(idx + 1,
len(mansion_state.ElevatorStates), 1,
len(mansion_state.ElevatorStates))
idx_array = list(range(len(mansion_state.ElevatorStates)))
idx_array.remove(idx)
# random.shuffle(idx_array)
man_features.append(ele_features[idx])
for left_idx in idx_array:
man_features[idx] = man_features[idx] + ele_features[left_idx]
man_features[idx] = man_features[idx] + \
elevator_id_vec + target_floor_binaries
return np.asarray(man_features, dtype='float32')
def action_idx_to_action(action_idx, act_dim):
"""Convert action_inx to action
Args:
action_idx: the index needed to be converted
act_dim: action dimension
Returns:
the converted namedtuple
"""
assert isinstance(action_idx, int)
assert isinstance(act_dim, int)
realdim = act_dim - 2
if (action_idx == realdim):
return ElevatorAction(0, 1)
elif (action_idx == realdim + 1):
return ElevatorAction(-1, 1)
action = action_idx
if (action_idx < realdim / 2):
direction = 1
action += 1
else:
direction = -1
action -= int(realdim / 2)
action += 1
return [action, direction]
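# Example mapping (hand-checked) for a hypothetical 10-floor building,
# i.e. act_dim = 22 and realdim = 20:
#   idx 0..9   -> [idx + 1, 1]           (dispatch to floor idx + 1, moving up)
#   idx 10..19 -> [idx - 9, -1]          (dispatch to floor idx - 9, moving down)
#   idx 20     -> ElevatorAction(0, 1)   (stop at once)
#   idx 21     -> ElevatorAction(-1, 1)  (keep the current dispatch target)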
def action_to_action_idx(action, act_dim):
"""Convert action to number according to act_dim.
Args:
action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py
act_dim: action dimension
Returns:
action_idx: the result index
"""
assert isinstance(action, ElevatorAction)
assert isinstance(act_dim, int)
realdim = act_dim - 2
if (action.TargetFloor == 0):
return realdim
elif (action.TargetFloor < 0):
return realdim + 1
action_idx = 0
if (action.DirectionIndicator < 0):
action_idx += int(realdim / 2)
action_idx += action.TargetFloor - 1
return action_idx
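# Round-trip example (hand-checked) with act_dim = 22:
#   action_to_action_idx(ElevatorAction(3, 1), 22)  -> 2
#   action_to_action_idx(ElevatorAction(3, -1), 22) -> 12
#   action_idx_to_action(12, 22)                    -> [3, -1]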
...@@ -20,7 +20,7 @@ from simple_model import MAModel ...@@ -20,7 +20,7 @@ from simple_model import MAModel
from simple_agent import MAAgent from simple_agent import MAAgent
import parl import parl
from parl.env.multiagent_simple_env import MAenv from parl.env.multiagent_simple_env import MAenv
from parl.utils import logger, tensorboard from parl.utils import logger, summary
def run_episode(env, agents): def run_episode(env, agents):
...@@ -62,7 +62,7 @@ def run_episode(env, agents): ...@@ -62,7 +62,7 @@ def run_episode(env, agents):
# learn policy # learn policy
for i, agent in enumerate(agents): for i, agent in enumerate(agents):
critic_loss = agent.learn(agents) critic_loss = agent.learn(agents)
tensorboard.add_scalar('critic_loss_%d' % i, critic_loss, summary.add_scalar('critic_loss_%d' % i, critic_loss,
agent.global_train_step) agent.global_train_step)
return total_reward, agents_reward, steps return total_reward, agents_reward, steps
...@@ -155,11 +155,11 @@ def train_agent(): ...@@ -155,11 +155,11 @@ def train_agent():
format(total_steps, total_episodes, mean_episode_reward, format(total_steps, total_episodes, mean_episode_reward,
use_time)) use_time))
t_start = time.time() t_start = time.time()
tensorboard.add_scalar('mean_episode_reward/episode', summary.add_scalar('mean_episode_reward/episode',
mean_episode_reward, total_episodes) mean_episode_reward, total_episodes)
tensorboard.add_scalar('mean_episode_reward/steps', summary.add_scalar('mean_episode_reward/steps',
mean_episode_reward, total_steps) mean_episode_reward, total_steps)
tensorboard.add_scalar('use_time/1000episode', use_time, summary.add_scalar('use_time/1000episode', use_time,
total_episodes) total_episodes)
# save model # save model
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor from actor import Actor
from opensim_model import OpenSimModel from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent from opensim_agent import OpenSimAgent
from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client from parl.remote.client import get_global_client
from parl.utils import machine_info from parl.utils import machine_info
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor from actor import Actor
from opensim_model import OpenSimModel from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent from opensim_agent import OpenSimAgent
from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client from parl.remote.client import get_global_client
from parl.utils import machine_info from parl.utils import machine_info
...@@ -97,7 +97,7 @@ class Learner(object): ...@@ -97,7 +97,7 @@ class Learner(object):
# add lock between training and predicting # add lock between training and predicting
self.model_lock = threading.Lock() self.model_lock = threading.Lock()
# add lock when appending data to rpm or writing scalars to tensorboard # add lock when appending data to rpm or writing scalars to summary
self.memory_lock = threading.Lock() self.memory_lock = threading.Lock()
self.ready_actor_queue = queue.Queue() self.ready_actor_queue = queue.Queue()
...@@ -246,23 +246,23 @@ class Learner(object): ...@@ -246,23 +246,23 @@ class Learner(object):
episode_env_reward) episode_env_reward)
if self.env_reward_stat.count > 500: if self.env_reward_stat.count > 500:
tensorboard.add_scalar('recent_env_reward', summary.add_scalar('recent_env_reward',
self.env_reward_stat.mean, self.env_reward_stat.mean,
self.total_steps) self.total_steps)
tensorboard.add_scalar('recent_shaping_reward', summary.add_scalar('recent_shaping_reward',
self.shaping_reward_stat.mean, self.shaping_reward_stat.mean,
self.total_steps) self.total_steps)
if self.critic_loss_stat.count > 500: if self.critic_loss_stat.count > 500:
tensorboard.add_scalar('recent_critic_loss', summary.add_scalar('recent_critic_loss',
self.critic_loss_stat.mean, self.critic_loss_stat.mean,
self.total_steps) self.total_steps)
tensorboard.add_scalar('episode_length', n, self.total_steps) summary.add_scalar('episode_length', n, self.total_steps)
tensorboard.add_scalar('max_env_reward', self.max_env_reward, summary.add_scalar('max_env_reward', self.max_env_reward,
self.total_steps) self.total_steps)
tensorboard.add_scalar('ready_actor_num', summary.add_scalar('ready_actor_num',
self.ready_actor_queue.qsize(), self.ready_actor_queue.qsize(),
self.total_steps) self.total_steps)
tensorboard.add_scalar('episode_time', episode_time, summary.add_scalar('episode_time', episode_time,
self.total_steps) self.total_steps)
self.noiselevel = self.noiselevel * NOISE_DECAY self.noiselevel = self.noiselevel * NOISE_DECAY
......
...@@ -21,7 +21,7 @@ import time ...@@ -21,7 +21,7 @@ import time
import parl import parl
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
from mujoco_model import ActorModel, CriticModel from mujoco_model import ActorModel, CriticModel
from parl.utils import logger, tensorboard, action_mapping, ReplayMemory from parl.utils import logger, summary, action_mapping, ReplayMemory
ACTOR_LR = 1e-3 ACTOR_LR = 1e-3
CRITIC_LR = 1e-3 CRITIC_LR = 1e-3
...@@ -111,8 +111,7 @@ def main(): ...@@ -111,8 +111,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -120,7 +119,7 @@ def main(): ...@@ -120,7 +119,7 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
......
...@@ -19,7 +19,7 @@ import time ...@@ -19,7 +19,7 @@ import time
import parl import parl
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from parl.utils import logger, tensorboard, action_mapping, ReplayMemory from parl.utils import logger, summary, action_mapping, ReplayMemory
MAX_EPISODES = 5000 MAX_EPISODES = 5000
ACTOR_LR = 3e-4 ACTOR_LR = 3e-4
...@@ -117,8 +117,7 @@ def main(): ...@@ -117,8 +117,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -126,7 +125,7 @@ def main(): ...@@ -126,7 +125,7 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
......
...@@ -22,7 +22,7 @@ from tqdm import tqdm ...@@ -22,7 +22,7 @@ from tqdm import tqdm
import parl import parl
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.utils import get_gpu_count from parl.utils import get_gpu_count
from parl.utils import tensorboard, logger from parl.utils import summary, logger
from dqn import DQN # slight changes from parl.algorithms.DQN from dqn import DQN # slight changes from parl.algorithms.DQN
from atari_agent import AtariAgent from atari_agent import AtariAgent
......
...@@ -90,7 +90,7 @@ class IMPALA(Algorithm): ...@@ -90,7 +90,7 @@ class IMPALA(Algorithm):
vf_loss_coeff=None, vf_loss_coeff=None,
clip_rho_threshold=None, clip_rho_threshold=None,
clip_pg_rho_threshold=None): clip_pg_rho_threshold=None):
""" IMPALA algorithm r""" IMPALA algorithm
Args: Args:
model (parl.Model): forward network of policy and value model (parl.Model): forward network of policy and value
......
...@@ -12,37 +12,34 @@ ...@@ -12,37 +12,34 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from rlschool import LiftSim
from wrapper import Wrapper, ActionWrapper, ObservationWrapper
from rl_benchmark.dispatcher import RL_dispatcher
import sys
import argparse


# run main program with args
def run_main(args):
    parser = argparse.ArgumentParser(description='demo configuration')
    parser.add_argument(
        '--iterations',
        type=int,
        default=100000000,
        help='total number of iterations')
    args = parser.parse_args(args)
    print('iterations:', args.iterations)

    mansion_env = LiftSim()
    # mansion_env.seed(1988)

    mansion_env = Wrapper(mansion_env)
    mansion_env = ActionWrapper(mansion_env)
    mansion_env = ObservationWrapper(mansion_env)
    dispatcher = RL_dispatcher(mansion_env, args.iterations)
    dispatcher.run_episode()
    return 0


if __name__ == "__main__":
    run_main(sys.argv[1:])


from tensorboardX import SummaryWriter
from parl.utils import logger

__all__ = []

_writer = None
_WRITTER_METHOD = ['add_scalar', 'add_histogram', 'close', 'flush']


def create_file_after_first_call(func_name):
    def call(*args, **kwargs):
        global _writer
        if _writer is None:
            logdir = logger.get_dir()
            if logdir is None:
                logdir = logger.auto_set_dir(action='d')
                logger.warning(
                    "[tensorboard] logdir is None, will save tensorboard files to {}"
                    .format(logdir))
            _writer = SummaryWriter(logdir=logger.get_dir())
        func = getattr(_writer, func_name)
        func(*args, **kwargs)
        _writer.flush()

    return call


# export writter functions
for func_name in _WRITTER_METHOD:
    locals()[func_name] = create_file_after_first_call(func_name)
    __all__.append(func_name)
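# A minimal usage sketch of the summary module above (tag name and values are
# illustrative only), matching how the examples and the unit test in this
# commit call it once it is exposed as parl.utils.summary; the first call
# lazily creates the SummaryWriter under the logger's output directory.
from parl.utils import summary

for step in range(100):
    summary.add_scalar('train/episode_reward', float(step), step)
summary.flush()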
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
from parl.utils import tensorboard from parl.utils import summary
import numpy as np import numpy as np
from parl.utils import logger from parl.utils import logger
import os import os
...@@ -20,18 +20,18 @@ import os ...@@ -20,18 +20,18 @@ import os
class TestUtils(unittest.TestCase): class TestUtils(unittest.TestCase):
def tearDown(self): def tearDown(self):
tensorboard.flush() summary.flush()
def test_add_scalar(self): def test_add_scalar(self):
x = range(100) x = range(100)
for i in x: for i in x:
tensorboard.add_scalar('y=2x', i * 2, i) summary.add_scalar('y=2x', i * 2, i)
self.assertTrue(os.path.exists('./train_log/tensorboard_test')) self.assertTrue(os.path.exists('./train_log/summary_test'))
def test_add_histogram(self): def test_add_histogram(self):
for i in range(10): for i in range(10):
x = np.random.random(1000) x = np.random.random(1000)
tensorboard.add_histogram('distribution centers', x + i, i) summary.add_histogram('distribution centers', x + i, i)
if __name__ == '__main__': if __name__ == '__main__':
......