diff --git a/benchmark/torch/a2c/train.py b/benchmark/torch/a2c/train.py index f2985367f8304edb6bccc93f894a7d04f5f305c8..9a498023988bc72a0a0aa43d4850c25ced8d2856 100644 --- a/benchmark/torch/a2c/train.py +++ b/benchmark/torch/a2c/train.py @@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind from parl.utils.window_stat import WindowStat from parl.utils.time_stat import TimeStat from parl.utils import machine_info -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.algorithms import A2C from atari_model import ActorCritic @@ -205,19 +205,19 @@ class Learner(object): } if metric['mean_episode_rewards'] is not None: - tensorboard.add_scalar('train/mean_reward', - metric['mean_episode_rewards'], - self.sample_total_steps) - tensorboard.add_scalar('train/total_loss', metric['total_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/pi_loss', metric['pi_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/vf_loss', metric['vf_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/entropy', metric['entropy'], - self.sample_total_steps) - tensorboard.add_scalar('train/learn_rate', metric['lr'], - self.sample_total_steps) + summary.add_scalar('train/mean_reward', + metric['mean_episode_rewards'], + self.sample_total_steps) + summary.add_scalar('train/total_loss', metric['total_loss'], + self.sample_total_steps) + summary.add_scalar('train/pi_loss', metric['pi_loss'], + self.sample_total_steps) + summary.add_scalar('train/vf_loss', metric['vf_loss'], + self.sample_total_steps) + summary.add_scalar('train/entropy', metric['entropy'], + self.sample_total_steps) + summary.add_scalar('train/learn_rate', metric['lr'], + self.sample_total_steps) logger.info(metric) diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py index 26d24d8b1a3cc88bdb26aec954094f282e463003..ba64b95c93a9b4879621331ad30cce3cbcbcac16 100644 --- a/benchmark/torch/dqn/train.py +++ b/benchmark/torch/dqn/train.py @@ -22,7 +22,7 @@ import parl import numpy as np from tqdm import tqdm -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from parl.algorithms import DQN, DDQN from agent import AtariAgent @@ -152,18 +152,17 @@ def main(): for _ in range(3): eval_rewards.append(run_evaluate_episode(test_env, agent)) - tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards), - total_steps) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, total_steps) - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) - tensorboard.add_scalar('dqn/Q value', - evaluate_fixed_Q(agent, fixed_obs), - total_steps) - tensorboard.add_scalar('dqn/grad_norm', - get_grad_norm(agent.alg.model), - total_steps) + summary.add_scalar('dqn/eval', np.mean(eval_rewards), + total_steps) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) + summary.add_scalar('dqn/exploration', agent.exploration, + total_steps) + summary.add_scalar('dqn/Q value', + evaluate_fixed_Q(agent, fixed_obs), + total_steps) + summary.add_scalar('dqn/grad_norm', + get_grad_norm(agent.alg.model), total_steps) if __name__ == '__main__': diff --git a/benchmark/torch/td3/train.py b/benchmark/torch/td3/train.py index c844d8c079a4b10e1e0ade957202cd7d2dcd27fb..48bd1f77103f1e50bd28f55cc12bee09315496e7 100644 --- a/benchmark/torch/td3/train.py +++ b/benchmark/torch/td3/train.py @@ -15,7 +15,7 @@ 
import gym import argparse import numpy as np -from parl.utils import logger, tensorboard, ReplayMemory +from parl.utils import logger, summary, ReplayMemory from mujoco_model import MujocoModel from mujoco_agent import MujocoAgent @@ -103,8 +103,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -112,8 +111,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/docs/tutorial/tensorboard.rst b/docs/tutorial/tensorboard.rst index 35869af6506dfebcdb64aa4bf1d7bc876f101a0b..8952a5e00b624e1c02b74c451da0d168ee6a4817 100644 --- a/docs/tutorial/tensorboard.rst +++ b/docs/tutorial/tensorboard.rst @@ -1,14 +1,14 @@ -Tensorboard +summary =============== -Visualize the results with tensorboard. +Visualize the results with tensorboard. add_scalar ------------- Common used arguments: -* tensorboard.add_scalar(tag, scalar_value, global_step=None) +* summary.add_scalar(tag, scalar_value, global_step=None) * tag *(string)* – Data identifier * scalar_value *(float or string/blobname)* – Value to save * global_step *(int)* – Global step value to record @@ -17,11 +17,11 @@ Example: .. code-block:: python - from parl.utils import tensorboard + from parl.utils import summary x = range(100) for i in x: - tensorboard.add_scalar('y=2x', i * 2, i) + summary.add_scalar('y=2x', i * 2, i) Expected result: @@ -33,7 +33,7 @@ add_histogram Common used arguments: -* tensorboard.add_scalar(tag, scalar_value, global_step=None) +* summary.add_histogram(tag, values, global_step=None) * tag *(string)* – Data identifier * values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram * global_step *(int)* – Global step value to record @@ -42,12 +42,12 @@ Example: ..
code-block:: python - from parl.utils import tensorboard + from parl.utils import summary import numpy as np for i in range(10): x = np.random.random(1000) - tensorboard.add_histogram('distribution centers', x + i, i) + summary.add_histogram('distribution centers', x + i, i) Expected result: diff --git a/examples/A2C/train.py b/examples/A2C/train.py index 1daba4736f8e19e9f50d425c12868a43ee0933db..4050a413f0262ec62f76bbc07062578d6a398d5c 100755 --- a/examples/A2C/train.py +++ b/examples/A2C/train.py @@ -25,7 +25,7 @@ from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -186,7 +186,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, @@ -205,7 +205,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/DQN_variant/train.py b/examples/DQN_variant/train.py index 0bd22797892783758b1059a4df2a06e4096736f6..5ca16df135c346a08f87efc6a694e1e289b8192c 100644 --- a/examples/DQN_variant/train.py +++ b/examples/DQN_variant/train.py @@ -22,7 +22,7 @@ from atari_agent import AtariAgent from atari_model import AtariModel from datetime import datetime from replay_memory import ReplayMemory, Experience -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from tqdm import tqdm from utils import get_player @@ -120,11 +120,9 @@ def main(): total_reward, steps, loss = run_train_episode(env, agent, rpm) total_steps += steps pbar.set_description('[train]exploration:{}'.format(agent.exploration)) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, - total_steps) # mean of total loss - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) # mean of total loss + summary.add_scalar('dqn/exploration', agent.exploration, total_steps) pbar.update(steps) if total_steps // args.test_every_steps >= test_flag: @@ -139,7 +137,7 @@ def main(): "eval_agent done, (steps, eval_reward): ({}, {})".format( total_steps, np.mean(eval_rewards))) eval_test = np.mean(eval_rewards) - tensorboard.add_scalar('dqn/eval', eval_test, total_steps) + summary.add_scalar('dqn/eval', eval_test, total_steps) pbar.close() diff --git a/examples/ES/README.md b/examples/ES/README.md index 207ae2dafa68c5f7d2eb30f956355b07c1bd5d61..d868202753fa34c0799c8c58975c958aa1ffe001 100644 --- a/examples/ES/README.md +++ b/examples/ES/README.md @@ -34,7 +34,7 @@ Then we can start the distributed training by running: python train.py ``` -Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data. +Training results will be saved in `train_log`, together with the training curve.
### Reference + [Ray](https://github.com/ray-project/ray) diff --git a/examples/ES/train.py b/examples/ES/train.py index be2c7d703eeba39931312491274f554ee9a76562..eadf26ea6e7d736abe45e7c08d25a5c7ae8dda2e 100644 --- a/examples/ES/train.py +++ b/examples/ES/train.py @@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from noise import SharedNoiseTable -from parl.utils import logger, tensorboard +from parl.utils import logger, summary from parl.utils.window_stat import WindowStat from six.moves import queue from actor import Actor @@ -202,7 +202,7 @@ class Learner(object): logger.info(metrics) for k, v in metrics.items(): if v is not None: - tensorboard.add_scalar(k, v, self.sample_total_steps) + summary.add_scalar(k, v, self.sample_total_steps) if __name__ == '__main__': diff --git a/examples/GA3C/train.py b/examples/GA3C/train.py index edc7f33344bc484fff640700dfd80bfc35987843..30f3a415b77cfa83d8868606498379b528ad1c31 100755 --- a/examples/GA3C/train.py +++ b/examples/GA3C/train.py @@ -24,7 +24,7 @@ from atari_model import AtariModel from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -313,7 +313,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/IMPALA/train.py b/examples/IMPALA/train.py index cf9e55c54d1df8a14fc1751ac75303c6adab42ad..9f2a3e65a7962d0aed103318c4a1979520004f8f 100755 --- a/examples/IMPALA/train.py +++ b/examples/IMPALA/train.py @@ -22,7 +22,7 @@ import parl from atari_model import AtariModel from atari_agent import AtariAgent from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, tensorboard, get_gpu_count +from parl.utils import logger, summary, get_gpu_count from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -221,7 +221,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, @@ -244,7 +244,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/LiftSim_baseline/A2C/README.md b/examples/LiftSim_baseline/A2C/README.md deleted file mode 100644 index 235e1aff2c956fd1ec60d999e0c34328f949d80c..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# LiftSim基线 - -## 简介 - -基于PARL库实现A2C算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。 - -## 依赖库 - -+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) -+ [parl>=1.2.3](https://github.com/PaddlePaddle/PARL) -+ [rlschool>=0.1.1][rlschool] - - -## 分布式训练 - -首先,启动一个具有5个CPU资源的本地集群: - -```bash -xparl start --port 
8010 --cpu_num 5 -``` - -> 注意,如果你已经启动了一个集群,则不需要重复运行上面命令。关于PARL集群更多信息,可以参考[文档](https://parl.readthedocs.io/en/latest/parallel_training/setup.html)。 - -然后我们就可以通过运行下面命令进行分布式训练: - -```bash -python train.py -``` - - -## 评估 -可以通过下面命令来评估保存的模型 -```bash -python evaluate.py --model_path saved_models/[FILENAME] -``` - -tensorboard和log文件会保存在`./train_log/train/`;可以通过运行命令`tensorboard --logdir .`查看tensorboard可视化界面。 - -## 收敛指标 -训练30h左右,评估指标能达到-120分左右(LiftSim环境运行1天reward) - - -## 可视化效果 - - -[rlschool]: https://github.com/PaddlePaddle/RLSchool -[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim diff --git a/examples/LiftSim_baseline/A2C/a2c_config.py b/examples/LiftSim_baseline/A2C/a2c_config.py deleted file mode 100644 index e3e776b2f1033bc6bd911ed929437f91c44bf938..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/a2c_config.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -config = { - #========== remote config ========== - 'master_address': 'localhost:8010', - - #========== actor config ========== - 'actor_num': 5, - 'env_num': 5, - 'sample_batch_steps': 5, - - #========== learner config ========== - 'max_sample_steps': int(1e10), - 'gamma': 0.998, - 'lambda': 1.0, # GAE - - # start learning rate - 'start_lr': 1.0e-4, - - # coefficient of policy entropy adjustment schedule: (train_step, coefficient) - 'entropy_coeff_scheduler': [(0, -2.0e-4)], - 'vf_loss_coeff': 0.5, - 'get_remote_metrics_interval': 100, - 'log_metrics_interval_s': 60, -} diff --git a/examples/LiftSim_baseline/A2C/actor.py b/examples/LiftSim_baseline/A2C/actor.py deleted file mode 100644 index 4286e7f9c4dd64b444c2ca34d5ebffe870b2769f..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/actor.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import parl -from collections import defaultdict -from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper, MetricsWrapper -from parl.utils.rl_utils import calc_gae -from parl.env.vector_env import VectorEnv -from rlschool import LiftSim -from copy import deepcopy -from lift_model import LiftModel -from lift_agent import LiftAgent - - -@parl.remote_class -class Actor(object): - def __init__(self, config): - self.config = config - self.env_num = config['env_num'] - - self.envs = [] - for _ in range(self.env_num): - env = LiftSim() - env = RewardWrapper(env) - env = ActionProcessWrapper(env) - env = ObsProcessWrapper(env) - env = MetricsWrapper(env) - self.envs.append(env) - self.vector_env = VectorEnv(self.envs) - - # number of elevators - self.ele_num = self.envs[0].mansion_attr.ElevatorNumber - - act_dim = self.envs[0].act_dim - self.obs_dim = self.envs[0].obs_dim - self.config['obs_dim'] = self.obs_dim - - # nested list of shape (env_num, ele_num, obs_dim) - self.obs_batch = self.vector_env.reset() - # (env_num * ele_num, obs_dim) - self.obs_batch = np.array(self.obs_batch).reshape( - [self.env_num * self.ele_num, self.obs_dim]) - - model = LiftModel(act_dim) - algorithm = parl.algorithms.A3C( - model, vf_loss_coeff=config['vf_loss_coeff']) - self.agent = LiftAgent(algorithm, config) - - def sample(self): - sample_data = defaultdict(list) - - env_sample_data = {} - # treat each elevator in Liftsim as an independent env - for env_id in range(self.env_num * self.ele_num): - env_sample_data[env_id] = defaultdict(list) - - for i in range(self.config['sample_batch_steps']): - actions_batch, values_batch = self.agent.sample(self.obs_batch) - - vector_actions = np.array_split(actions_batch, self.env_num) - assert len(vector_actions[-1]) == self.ele_num - next_obs_batch, reward_batch, done_batch, info_batch = \ - self.vector_env.step(vector_actions) - - # (env_num, ele_num, obs_dim) -> (env_num * ele_num, obs_dim) - next_obs_batch = np.array(next_obs_batch).reshape( - [self.env_num * self.ele_num, self.obs_dim]) - # repeat reward and done to ele_num times - # (env_num) -> (env_num, ele_num) -> (env_num * ele_num) - reward_batch = np.repeat(reward_batch, self.ele_num) - done_batch = np.repeat(done_batch, self.ele_num) - - for env_id in range(self.env_num * self.ele_num): - env_sample_data[env_id]['obs'].append(self.obs_batch[env_id]) - env_sample_data[env_id]['actions'].append( - actions_batch[env_id]) - env_sample_data[env_id]['rewards'].append(reward_batch[env_id]) - env_sample_data[env_id]['dones'].append(done_batch[env_id]) - env_sample_data[env_id]['values'].append(values_batch[env_id]) - - # Calculate advantages when the episode is done or reaches max sample steps. 
- if done_batch[env_id] or i + 1 == self.config[ - 'sample_batch_steps']: # reach max sample steps - next_value = 0 - if not done_batch[env_id]: - next_obs = np.expand_dims(next_obs_batch[env_id], 0) - next_value = self.agent.value(next_obs) - - values = env_sample_data[env_id]['values'] - rewards = env_sample_data[env_id]['rewards'] - advantages = calc_gae(rewards, values, next_value, - self.config['gamma'], - self.config['lambda']) - target_values = advantages + values - - sample_data['obs'].extend(env_sample_data[env_id]['obs']) - sample_data['actions'].extend( - env_sample_data[env_id]['actions']) - sample_data['advantages'].extend(advantages) - sample_data['target_values'].extend(target_values) - - env_sample_data[env_id] = defaultdict(list) - - self.obs_batch = deepcopy(next_obs_batch) - - # size of sample_data[key]: env_num * ele_num * sample_batch_steps - for key in sample_data: - sample_data[key] = np.stack(sample_data[key]) - - return sample_data - - def get_metrics(self): - metrics = defaultdict(list) - for metrics_env in self.envs: - assert isinstance( - metrics_env, - MetricsWrapper), "Put the MetricsWrapper in the last wrapper" - for env_reward_1h, env_reward_24h in metrics_env.next_episode_results( - ): - metrics['env_reward_1h'].append(env_reward_1h) - metrics['env_reward_24h'].append(env_reward_24h) - return metrics - - def set_weights(self, params): - self.agent.set_weights(params) diff --git a/examples/LiftSim_baseline/A2C/effect.gif b/examples/LiftSim_baseline/A2C/effect.gif deleted file mode 100644 index b796acbf3c872f0e960e731fb99848c774446165..0000000000000000000000000000000000000000 Binary files a/examples/LiftSim_baseline/A2C/effect.gif and /dev/null differ diff --git a/examples/LiftSim_baseline/A2C/env_wrapper.py b/examples/LiftSim_baseline/A2C/env_wrapper.py deleted file mode 100644 index c8d08fac40d1cc6fff6f46e97e2bac649a79bc1b..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/env_wrapper.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from copy import deepcopy -import numpy as np -from utils import discretize, linear_discretize -from rlschool import LiftSim - - -class BaseWrapper(object): - def __init__(self, env): - self.env = env - self._mansion = env._mansion - self.mansion_attr = self._mansion.attribute - - @property - def obs_dim(self): - if hasattr(self.env, 'obs_dim'): - return self.env.obs_dim - else: - return None - - @property - def act_dim(self): - if hasattr(self.env, 'act_dim'): - return self.env.act_dim - else: - return None - - def seed(self, seed=None): - return self.env.seed(seed) - - def step(self, action): - return self.env.step(action) - - def reset(self): - return self.env.reset() - - def render(self): - return self.env.render() - - def close(self): - return self.env.close() - - -class ObsProcessWrapper(BaseWrapper): - """Extract features of each elevator in LiftSim env - """ - - def __init__(self, env, hour_distize_num=6): - super(ObsProcessWrapper, self).__init__(env) - self.hour_distize_num = hour_distize_num - self.total_steps = 0 - - @property - def obs_dim(self): - """ - NOTE: - Keep obs_dim to the return size of function `_mansion_state_process` - """ - ele_dim = self.mansion_attr.NumberOfFloor * 3 + 34 - obs_dim = (ele_dim + 1) * self.mansion_attr.ElevatorNumber + \ - self.mansion_attr.NumberOfFloor * 2 - obs_dim += self.hour_distize_num - return obs_dim - - def reset(self): - """ - - Returns: - obs(list): [[self.obs_dim]] * mansion_attr.ElevatorNumber, features array of all elevators - """ - obs = self.env.reset() - self.total_steps = 0 - obs = self._mansion_state_process(obs) - return obs - - def step(self, action): - """ - Returns: - obs(list): nested list, shape of [mansion_attr.ElevatorNumber, self.obs_dim], - features array of all elevators - reward(int): returned by self.env - done(bool): returned by self.env - info(dict): returned by self.env - """ - obs, reward, done, info = self.env.step(action) - self.total_steps += 1 - obs = self._mansion_state_process(obs) - return obs, reward, done, info - - def _mansion_state_process(self, mansion_state): - """Extract features of env - """ - ele_features = list() - for ele_state in mansion_state.ElevatorStates: - ele_features.append(self._ele_state_process(ele_state)) - max_floor = ele_state.MaximumFloor - - target_floor_binaries_up = [0.0 for i in range(max_floor)] - target_floor_binaries_down = [0.0 for i in range(max_floor)] - for floor in mansion_state.RequiringUpwardFloors: - target_floor_binaries_up[floor - 1] = 1.0 - for floor in mansion_state.RequiringDownwardFloors: - target_floor_binaries_down[floor - 1] = 1.0 - target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down - - raw_time = self.total_steps * 0.5 # timestep seconds - time_id = int(raw_time % 86400) - time_id = time_id // (24 / self.hour_distize_num * 3600) - time_id_vec = discretize(time_id + 1, self.hour_distize_num, 1, - self.hour_distize_num) - - man_features = list() - for idx in range(len(mansion_state.ElevatorStates)): - elevator_id_vec = discretize(idx + 1, - len(mansion_state.ElevatorStates), 1, - len(mansion_state.ElevatorStates)) - idx_array = list(range(len(mansion_state.ElevatorStates))) - idx_array.remove(idx) - man_features.append(ele_features[idx]) - for left_idx in idx_array: - man_features[idx] = man_features[idx] + ele_features[left_idx] - man_features[idx] = man_features[idx] + \ - elevator_id_vec + target_floor_binaries - man_features[idx] = man_features[idx] + time_id_vec - return np.asarray(man_features, dtype='float32') - - def 
_ele_state_process(self, ele_state): - """Extract features of elevator - """ - ele_feature = [] - - # add floor information - ele_feature.extend( - linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0, - ele_state.MaximumFloor)) - - # add velocity information - ele_feature.extend( - linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed, - ele_state.MaximumSpeed)) - - # add door information - ele_feature.append(ele_state.DoorState) - ele_feature.append(float(ele_state.DoorIsOpening)) - ele_feature.append(float(ele_state.DoorIsClosing)) - - # add direction information - ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1)) - - # add load weight information - ele_feature.extend( - linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, - 0.0, 1.0)) - - # add other information - target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)] - for target_floor in ele_state.ReservedTargetFloors: - target_floor_binaries[target_floor - 1] = 1.0 - ele_feature.extend(target_floor_binaries) - - dispatch_floor_binaries = [ - 0.0 for i in range(ele_state.MaximumFloor + 1) - ] - dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0 - ele_feature.extend(dispatch_floor_binaries) - ele_feature.append(ele_state.DispatchTargetDirection) - - return ele_feature - - -class ActionProcessWrapper(BaseWrapper): - def __init__(self, env): - """Map action id predicted by model to action of LiftSim - - """ - super(ActionProcessWrapper, self).__init__(env) - - @property - def act_dim(self): - """ - NOTE: - keep act_dim in line with function `_action_idx_to_action` - - Returns: - int: NumberOfFloor * (2 directions) + (-1 DispatchTarget) + (0 DispatchTarget) - """ - return self.mansion_attr.NumberOfFloor * 2 + 2 - - def step(self, action): - """ - Args: - action(list): action_id of all elevators (length = mansion_attr.ElevatorNumber) - """ - ele_actions = [] - for action_id in action: - ele_actions.extend(self._action_idx_to_action(action_id)) - - # ele_action: list, formatted action for LiftSim env (length = 2 * mansion_attr.ElevatorNumber) - return self.env.step(ele_actions) - - def _action_idx_to_action(self, action_idx): - action_idx = int(action_idx) - realdim = self.act_dim - 2 - if (action_idx == realdim): - return (0, 1) # mapped to DispatchTarget=0 - elif (action_idx == realdim + 1): - return (-1, 1) # mapped to DispatchTarget=-1 - action = action_idx - if (action_idx < realdim / 2): - direction = 1 # up direction - action += 1 - else: - direction = -1 # down direction - action -= int(realdim / 2) - action += 1 - return (action, direction) - - -class RewardWrapper(BaseWrapper): - def __init__(self, env): - """Design reward of LiftSim env. - """ - super(RewardWrapper, self).__init__(env) - self.ele_num = self.mansion_attr.ElevatorNumber - - def step(self, action): - """Here we return same reward for each elevator, - you alos can design different rewards of each elevator. 
- - Returns: - obs: returned by self.env - reward: shaping reward - done: returned by self.env - info: returned by self.env - """ - obs, origin_reward, done, info = self.env.step(action) - - reward = -(30 * info['time_consume'] + 0.01 * info['energy_consume'] + - 100 * info['given_up_persons']) * 1.0e-3 / self.ele_num - - info['origin_reward'] = origin_reward - - return obs, reward, done, info - - -class MetricsWrapper(BaseWrapper): - def __init__(self, env): - super(MetricsWrapper, self).__init__(env) - - self._total_steps = 0 - self._env_reward_1h = 0 - self._env_reward_24h = 0 - - self._num_returned = 0 - self._episode_result = [] - - def reset(self): - self._total_steps = 0 - self._env_reward_1h = 0 - self._env_reward_24h = 0 - return self.env.reset() - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self._total_steps += 1 - - self._env_reward_1h += info['origin_reward'] - self._env_reward_24h += info['origin_reward'] - - # Treat 1h in LiftSim env as an episode (1step = 0.5s) - if self._total_steps % (3600 * 2) == 0: # 1h - episode_env_reward_1h = self._env_reward_1h - self._env_reward_1h = 0 - - episode_env_reward_24h = None - if self._total_steps % (24 * 3600 * 2) == 0: # 24h - episode_env_reward_24h = self._env_reward_24h - self._env_reward_24h = 0 - - self._episode_result.append( - [episode_env_reward_1h, episode_env_reward_24h]) - - return obs, reward, done, info - - def next_episode_results(self): - for i in range(self._num_returned, len(self._episode_result)): - yield self._episode_result[i] - self._num_returned = len(self._episode_result) diff --git a/examples/LiftSim_baseline/A2C/evaluate.py b/examples/LiftSim_baseline/A2C/evaluate.py deleted file mode 100644 index d004aa4255e7f54d123e2f11de80158e60319e1c..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/evaluate.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import parl -from parl.utils import logger -from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper -from rlschool import LiftSim -from lift_model import LiftModel -from lift_agent import LiftAgent -from a2c_config import config - - -def evaluate_one_day(model_path): - env = LiftSim() - env = ActionProcessWrapper(env) - env = ObsProcessWrapper(env) - act_dim = env.act_dim - obs_dim = env.obs_dim - config['obs_dim'] = obs_dim - - model = LiftModel(act_dim) - algorithm = parl.algorithms.A3C( - model, vf_loss_coeff=config['vf_loss_coeff']) - agent = LiftAgent(algorithm, config) - agent.restore(model_path) - - reward_24h = 0 - obs = env.reset() - for i in range(24 * 3600 * 2): # 24h, 1step = 0.5s - action, _ = agent.sample(obs) - #print(action) - obs, reward, done, info = env.step(action) - reward_24h += reward - if (i + 1) % (3600 * 2) == 0: - logger.info('hour {}, total_reward: {}'.format( - (i + 1) // (3600 * 2), reward_24h)) - - logger.info('model_path: {}, 24h reward: {}'.format( - model_path, reward_24h)) - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - '--model_path', type=str, help='path of the model to evaluate.') - args = parser.parse_args() - - evaluate_one_day(args.model_path) diff --git a/examples/LiftSim_baseline/A2C/lift_agent.py b/examples/LiftSim_baseline/A2C/lift_agent.py deleted file mode 100644 index 1dd35e59a3c840b6468f5d95bcc6fccc07445bc8..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/lift_agent.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import parl -import paddle.fluid as fluid -import numpy as np -from parl import layers -from parl.utils.scheduler import PiecewiseScheduler, LinearDecayScheduler - - -class LiftAgent(parl.Agent): - def __init__(self, algorithm, config): - """ - Args: - algorithm (`parl.Algorithm`): algorithm to be used in this agent. 
- config (dict): config describing the training hyper-parameters(see a2c_config.py) - """ - self.obs_dim = config['obs_dim'] - super(LiftAgent, self).__init__(algorithm) - - self.lr_scheduler = LinearDecayScheduler(config['start_lr'], - config['max_sample_steps']) - self.entropy_coeff_scheduler = PiecewiseScheduler( - config['entropy_coeff_scheduler']) - - def build_program(self): - self.sample_program = fluid.Program() - self.predict_program = fluid.Program() - self.value_program = fluid.Program() - self.learn_program = fluid.Program() - - with fluid.program_guard(self.sample_program): - obs = layers.data( - name='obs', shape=[self.obs_dim], dtype='float32') - sample_actions, values = self.alg.sample(obs) - self.sample_outputs = [sample_actions, values] - - with fluid.program_guard(self.predict_program): - obs = layers.data( - name='obs', shape=[self.obs_dim], dtype='float32') - self.predict_actions = self.alg.predict(obs) - - with fluid.program_guard(self.value_program): - obs = layers.data( - name='obs', shape=[self.obs_dim], dtype='float32') - self.values = self.alg.value(obs) - - with fluid.program_guard(self.learn_program): - obs = layers.data( - name='obs', shape=[self.obs_dim], dtype='float32') - actions = layers.data(name='actions', shape=[], dtype='int32') - advantages = layers.data( - name='advantages', shape=[], dtype='float32') - target_values = layers.data( - name='target_values', shape=[], dtype='float32') - lr = layers.data( - name='lr', shape=[1], dtype='float32', append_batch_size=False) - entropy_coeff = layers.data( - name='entropy_coeff', - shape=[1], - dtype='float32', - append_batch_size=False) - - total_loss, pi_loss, vf_loss, entropy = self.alg.learn( - obs, actions, advantages, target_values, lr, entropy_coeff) - self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy] - self.learn_program = parl.compile(self.learn_program, total_loss) - - def sample(self, obs_np): - """ - Args: - obs_np: a numpy float32 array of shape (B, obs_dim). - - Returns: - sample_ids: a numpy int64 array of shape [B] - values: a numpy float32 array of shape [B] - """ - obs_np = obs_np.astype('float32') - - sample_actions, values = self.fluid_executor.run( - self.sample_program, - feed={'obs': obs_np}, - fetch_list=self.sample_outputs) - return sample_actions, values - - def predict(self, obs_np): - """ - Args: - obs_np: a numpy float32 array of shape (B, obs_dim). - - Returns: - predict_actions: a numpy int64 array of shape [B] - """ - obs_np = obs_np.astype('float32') - - predict_actions = self.fluid_executor.run( - self.predict_program, - feed={'obs': obs_np}, - fetch_list=[self.predict_actions])[0] - return predict_actions - - def value(self, obs_np): - """ - Args: - obs_np: a numpy float32 array of shape (B, obs_dim). - - Returns: - values: a numpy float32 array of shape [B] - """ - obs_np = obs_np.astype('float32') - - values = self.fluid_executor.run( - self.value_program, feed={'obs': obs_np}, - fetch_list=[self.values])[0] - return values - - def learn(self, obs_np, actions_np, advantages_np, target_values_np): - """ - Args: - obs_np: a numpy float32 array of shape (B, obs_dim). 
- actions_np: a numpy int64 array of shape [B] - advantages_np: a numpy float32 array of shape [B] - target_values_np: a numpy float32 array of shape [B] - """ - - obs_np = obs_np.astype('float32') - actions_np = actions_np.astype('int64') - advantages_np = advantages_np.astype('float32') - target_values_np = target_values_np.astype('float32') - - lr = self.lr_scheduler.step(step_num=obs_np.shape[0]) - entropy_coeff = self.entropy_coeff_scheduler.step() - - total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run( - self.learn_program, - feed={ - 'obs': obs_np, - 'actions': actions_np, - 'advantages': advantages_np, - 'target_values': target_values_np, - 'lr': np.array([lr], dtype='float32'), - 'entropy_coeff': np.array([entropy_coeff], dtype='float32') - }, - fetch_list=self.learn_outputs) - return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff diff --git a/examples/LiftSim_baseline/A2C/lift_model.py b/examples/LiftSim_baseline/A2C/lift_model.py deleted file mode 100644 index 6da63723b5ab058ef7333c590725f90268ce52b6..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/lift_model.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import parl -import paddle.fluid as fluid -from parl import layers - - -class LiftModel(parl.Model): - def __init__(self, act_dim): - self.act_dim = act_dim - self.fc_1 = layers.fc(size=512, act='relu') - self.fc_2 = layers.fc(size=256, act='relu') - self.fc_3 = layers.fc(size=128, act='tanh') - - self.value_fc = layers.fc(size=1) - self.policy_fc = layers.fc(size=act_dim) - - def policy(self, obs): - """ - Args: - obs(float32 tensor): shape of (B * obs_dim) - - Returns: - policy_logits(float32 tensor): shape of (B * act_dim) - """ - h_1 = self.fc_1(obs) - h_2 = self.fc_2(h_1) - h_3 = self.fc_3(h_2) - policy_logits = self.policy_fc(h_3) - return policy_logits - - def value(self, obs): - """ - Args: - obs(float32 tensor): shape of (B * obs_dim) - - Returns: - values(float32 tensor): shape of (B,) - """ - h_1 = self.fc_1(obs) - h_2 = self.fc_2(h_1) - h_3 = self.fc_3(h_2) - values = self.value_fc(h_3) - values = layers.squeeze(values, axes=[1]) - return values - - def policy_and_value(self, obs): - """ - Args: - obs(float32 tensor): shape (B * obs_dim) - - Returns: - policy_logits(float32 tensor): shape of (B * act_dim) - values(float32 tensor): shape of (B,) - """ - h_1 = self.fc_1(obs) - h_2 = self.fc_2(h_1) - h_3 = self.fc_3(h_2) - policy_logits = self.policy_fc(h_3) - values = self.value_fc(h_3) - values = layers.squeeze(values, axes=[1]) - - return policy_logits, values diff --git a/examples/LiftSim_baseline/A2C/performance.png b/examples/LiftSim_baseline/A2C/performance.png deleted file mode 100644 index 153da4eb12bd3219ed5030516bcc001188c980b2..0000000000000000000000000000000000000000 Binary files a/examples/LiftSim_baseline/A2C/performance.png and /dev/null differ diff --git a/examples/LiftSim_baseline/A2C/train.py b/examples/LiftSim_baseline/A2C/train.py deleted file mode 100644 index 3663cb841f676efcc157fccd15fa816cdaae662c..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/train.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import os -import parl -import queue -import six -import time -import threading - -from actor import Actor -from collections import defaultdict -from env_wrapper import ObsProcessWrapper, ActionProcessWrapper -from parl.utils import logger, get_gpu_count, tensorboard, machine_info -from parl.utils.scheduler import PiecewiseScheduler -from parl.utils.time_stat import TimeStat -from parl.utils.window_stat import WindowStat -from rlschool import LiftSim -from lift_model import LiftModel -from lift_agent import LiftAgent - - -class Learner(object): - def __init__(self, config): - self.config = config - - #=========== Create Agent ========== - env = LiftSim() - env = ActionProcessWrapper(env) - env = ObsProcessWrapper(env) - - obs_dim = env.obs_dim - act_dim = env.act_dim - self.config['obs_dim'] = obs_dim - - model = LiftModel(act_dim) - algorithm = parl.algorithms.A3C( - model, vf_loss_coeff=config['vf_loss_coeff']) - self.agent = LiftAgent(algorithm, config) - - if machine_info.is_gpu_available(): - assert get_gpu_count() == 1, 'Only support training in single GPU,\ - Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .' - - #========== Learner ========== - - self.entropy_stat = WindowStat(100) - self.target_values = None - - self.learn_time_stat = TimeStat(100) - self.start_time = None - - #========== Remote Actor =========== - self.remote_count = 0 - self.sample_data_queue = queue.Queue() - - self.remote_metrics_queue = queue.Queue() - self.sample_total_steps = 0 - - self.params_queues = [] - self.create_actors() - - self.log_steps = 0 - - def create_actors(self): - """ Connect to the cluster and start sampling of the remote actor. - """ - parl.connect(self.config['master_address']) - - logger.info('Waiting for {} remote actors to connect.'.format( - self.config['actor_num'])) - - for i in six.moves.range(self.config['actor_num']): - params_queue = queue.Queue() - self.params_queues.append(params_queue) - - self.remote_count += 1 - logger.info('Remote actor count: {}'.format(self.remote_count)) - - remote_thread = threading.Thread( - target=self.run_remote_sample, args=(params_queue, )) - remote_thread.setDaemon(True) - remote_thread.start() - - self.start_time = time.time() - - def run_remote_sample(self, params_queue): - """ Sample data from remote actor and update parameters of remote actor. - """ - remote_actor = Actor(self.config) - - cnt = 0 - while True: - latest_params = params_queue.get() - remote_actor.set_weights(latest_params) - batch = remote_actor.sample() - - self.sample_data_queue.put(batch) - - cnt += 1 - if cnt % self.config['get_remote_metrics_interval'] == 0: - metrics = remote_actor.get_metrics() - if metrics: - self.remote_metrics_queue.put(metrics) - - def step(self): - """ - 1. kick off all actors to synchronize parameters and sample data; - 2. collect sample data of all actors; - 3. update parameters. 
- """ - latest_params = self.agent.get_weights() - for params_queue in self.params_queues: - params_queue.put(latest_params) - - train_batch = defaultdict(list) - for i in range(self.config['actor_num']): - sample_data = self.sample_data_queue.get() - for key, value in sample_data.items(): - train_batch[key].append(value) - - self.sample_total_steps += sample_data['obs'].shape[0] - - for key, value in train_batch.items(): - train_batch[key] = np.concatenate(value) - - with self.learn_time_stat: - total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff = self.agent.learn( - obs_np=train_batch['obs'], - actions_np=train_batch['actions'], - advantages_np=train_batch['advantages'], - target_values_np=train_batch['target_values']) - - self.entropy_stat.add(entropy) - self.target_values = np.mean(train_batch['target_values']) - - tensorboard.add_scalar('model/entropy', entropy, - self.sample_total_steps) - tensorboard.add_scalar('model/q_value', self.target_values, - self.sample_total_steps) - - def log_metrics(self): - """ Log metrics of learner and actors - """ - if self.start_time is None: - return - - metrics = [] - while True: - try: - metric = self.remote_metrics_queue.get_nowait() - metrics.append(metric) - except queue.Empty: - break - - env_reward_1h, env_reward_24h = [], [] - for x in metrics: - env_reward_1h.extend(x['env_reward_1h']) - env_reward_24h.extend(x['env_reward_24h']) - env_reward_1h = [x for x in env_reward_1h if x is not None] - env_reward_24h = [x for x in env_reward_24h if x is not None] - - mean_reward_1h, mean_reward_24h = None, None - if env_reward_1h: - mean_reward_1h = np.mean(np.array(env_reward_1h).flatten()) - tensorboard.add_scalar('performance/env_rewards_1h', - mean_reward_1h, self.sample_total_steps) - if env_reward_24h: - mean_reward_24h = np.mean(np.array(env_reward_24h).flatten()) - tensorboard.add_scalar('performance/env_rewards_24h', - mean_reward_24h, self.sample_total_steps) - - metric = { - 'Sample steps': self.sample_total_steps, - 'env_reward_1h': mean_reward_1h, - 'env_reward_24h': mean_reward_24h, - 'target_values': self.target_values, - 'entropy': self.entropy_stat.mean, - 'learn_time_s': self.learn_time_stat.mean, - 'elapsed_time_s': int(time.time() - self.start_time), - } - logger.info(metric) - - self.log_steps += 1 - save_interval_step = 7200 // max(1, - self.config['log_metrics_interval_s']) - if self.log_steps % save_interval_step == 0: - self.save_model() # save model every 2h - - def should_stop(self): - return self.sample_total_steps >= self.config['max_sample_steps'] - - def save_model(self): - time_str = time.strftime(".%Y%m%d_%H%M%S", time.localtime()) - self.agent.save(os.path.join('saved_models', 'model.ckpt' + time_str)) - - -if __name__ == '__main__': - from a2c_config import config - - learner = Learner(config) - assert config['log_metrics_interval_s'] > 0 - - while not learner.should_stop(): - start = time.time() - while time.time() - start < config['log_metrics_interval_s']: - learner.step() - learner.log_metrics() diff --git a/examples/LiftSim_baseline/A2C/utils.py b/examples/LiftSim_baseline/A2C/utils.py deleted file mode 100644 index 05c081eaa39e4a2366ebf986092c4a9ec11a2c2f..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/A2C/utils.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def discretize(value, n_dim, min_val, max_val): - ''' - discretize a value into a vector of n_dim dimension 1-hot representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - ''' - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 1, active_pos) - active_pos = max(0, active_pos) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - return ret_array - - -def linear_discretize(value, n_dim, min_val, max_val): - ''' - discretize a value into a vector of n_dim dimensional representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0 - if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8] - ''' - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 2, active_pos) - active_pos = max(0, active_pos) - anchor_pt = active_pos * delta + min_val - if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta): - anchor_pt -= delta - active_pos -= 1 - weight = (value - anchor_pt) / delta - weight = min(1.0, max(0.0, weight)) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - weight - ret_array[active_pos + 1] = weight - return ret_array diff --git a/examples/LiftSim_baseline/DQN/README.md b/examples/LiftSim_baseline/DQN/README.md deleted file mode 100644 index d75b549f8f3b1c433a915f025ab676115749ddd3..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# LiftSim基线 - -## 简介 - -基于PARL库实现Deep Q-network算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。 - -## 依赖库 - -+ [paddlepaddle==1.5.1](https://github.com/PaddlePaddle/Paddle) -+ [parl==1.1.2](https://github.com/PaddlePaddle/PARL) -+ [rlschool>=0.0.1](rlschool) - -## 运行 - -```python -python demo.py -``` - -## Benchmark - - - -Accumulated Reward:每3600 steps内reward的总和,可体现电梯调度在单位时间(模拟环境0.5小时)内的效率。 - -[rlschool]: https://github.com/PaddlePaddle/RLSchool -[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim diff --git a/examples/LiftSim_baseline/DQN/__init__.py b/examples/LiftSim_baseline/DQN/__init__.py deleted file mode 100644 index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/examples/LiftSim_baseline/DQN/demo.py b/examples/LiftSim_baseline/DQN/demo.py deleted file mode 100644 index cecbf6c1a34d9060dac90abf6e4d648aa0f9a870..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/demo.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from rlschool import LiftSim -from wrapper import Wrapper, ActionWrapper, ObservationWrapper -from rl_benchmark.dispatcher import RL_dispatcher -import sys -import argparse - - -# run main program with args -def run_main(args): - - parser = argparse.ArgumentParser(description='demo configuration') - parser.add_argument( - '--iterations', - type=int, - default=100000000, - help='total number of iterations') - args = parser.parse_args(args) - print('iterations:', args.iterations) - - mansion_env = LiftSim() - # mansion_env.seed(1988) - - mansion_env = Wrapper(mansion_env) - mansion_env = ActionWrapper(mansion_env) - mansion_env = ObservationWrapper(mansion_env) - - dispatcher = RL_dispatcher(mansion_env, args.iterations) - dispatcher.run_episode() - - return 0 - - -if __name__ == "__main__": - run_main(sys.argv[1:]) diff --git a/examples/LiftSim_baseline/DQN/rl_10.png b/examples/LiftSim_baseline/DQN/rl_10.png deleted file mode 100644 index b8f9eef1d10c0a617d8dd462f1d66e5d26484622..0000000000000000000000000000000000000000 Binary files a/examples/LiftSim_baseline/DQN/rl_10.png and /dev/null differ diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py b/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py deleted file mode 100644 index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py b/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py deleted file mode 100644 index 846bcf318090916141a4216abb3a889d2548d2ff..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -import numpy.random as random -import paddle.fluid as fluid -from parl import layers -from parl import Agent -from parl.utils import get_gpu_count, machine_info - - -class ElevatorAgent(Agent): - def __init__(self, algorithm, obs_dim, action_dim): - self._action_dim = action_dim - self._obs_dim = obs_dim - self._update_target_steps = 1000 - - self._global_step = 0 - self.exploration_ratio = 0.9 - self.exploration_decre = 1e-7 - self.exploration_min = 0.1 - super(ElevatorAgent, self).__init__(algorithm) - - use_cuda = machine_info.is_gpu_available() - if self.gpu_id >= 0: - assert get_gpu_count() == 1, 'Only support training in single GPU,\ - Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .' - - else: - os.environ['CPU_NUM'] = str(1) - - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 1 - exec_strategy.num_iteration_per_drop_scope = 10 - build_strategy = fluid.BuildStrategy() - build_strategy.remove_unnecessary_lock = False - - self.learn_pe = fluid.ParallelExecutor( - use_cuda=use_cuda, - main_program=self.learn_program, - build_strategy=build_strategy, - exec_strategy=exec_strategy, - ) - - def build_program(self): - self.pred_program = fluid.Program() - self.learn_program = fluid.Program() - - with fluid.program_guard(self.pred_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - self._value = self.alg.define_predict(obs) - - with fluid.program_guard(self.learn_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - action = layers.data(name='act', shape=[1], dtype='int32') - reward = layers.data(name='reward', shape=[], dtype='float32') - next_obs = layers.data( - name='next_obs', shape=[self._obs_dim], dtype='float32') - terminal = layers.data(name='terminal', shape=[], dtype='bool') - self._cost = self.alg.define_learn(obs, action, reward, next_obs, - terminal) - - def sample(self, obs): - if self.exploration_ratio > self.exploration_min: - self.exploration_ratio -= self.exploration_decre - q_values = self.predict(obs) - - ret_actions = list() - for i in range(len(q_values)): # number of elevators - if (random.random() < self.exploration_ratio): - action = random.randint(0, self._action_dim) - else: - action = np.argmax(q_values[i]) - ret_actions.append(int(action)) - return ret_actions - - def predict(self, obs): - pred_Q = self.fluid_executor.run( - self.pred_program, - feed={'obs': obs.astype('float32')}, - fetch_list=[self._value]) - return pred_Q[0] - - def learn(self, obs, act, reward, 
next_obs, terminal): - self._global_step += 1 - if self._global_step % self._update_target_steps == 0: - self.alg.sync_target(self.gpu_id) - - feed = { - 'obs': obs.astype('float32'), - 'act': act.astype('int32'), - 'reward': reward, - 'next_obs': next_obs.astype('float32'), - 'terminal': terminal - } - cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0] - return cost diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py b/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py deleted file mode 100644 index a2561ee6d3d9f2c6b8f39c886ef4a1f2f01fb1ea..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import parl -import numpy as np -import numpy.random as random - -from copy import deepcopy -from collections import deque - -from rlschool import EPSILON, HUGE -from rl_benchmark.model import RLDispatcherModel -from rl_benchmark.agent import ElevatorAgent -from parl.algorithms import DQN -from parl.utils import ReplayMemory - -MEMORY_SIZE = 1000000 -BATCH_SIZE = 64 - - -class RL_dispatcher(): - """ - An RL benchmark for elevator system - """ - - def __init__(self, env, max_episode): - self.env = env - - self._obs_dim = env.observation_space - self._act_dim = env.action_space - self._global_step = 0 - self.max_episode = max_episode - self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1) - self._model = RLDispatcherModel(self._act_dim) - hyperparas = { - 'action_dim': self._act_dim, - 'lr': 5.0e-4, - 'gamma': 0.998 - } - - self._algorithm = DQN(self._model, hyperparas) - self._agent = ElevatorAgent(self._algorithm, self._obs_dim, - self._act_dim) - self._warm_up_size = 2000 - self._statistic_freq = 1000 - self._loss_queue = deque() - - def run_episode(self): - self.env.reset() - acc_reward = 0.0 - - while self._global_step < self.max_episode: - # self.env.render() - state = self.env.state - action = self._agent.sample(state) - state_, reward, done, info = self.env.step(action) - output_info = self.learn_step(state, action, reward) - acc_reward += reward - if (isinstance(output_info, dict) and len(output_info) > 0): - self.env.log_notice("%s", output_info) - if (self._global_step % 3600 == 0): - self.env.log_notice( - "Accumulated Reward: %f, Mansion Status: %s", acc_reward, - self.env.statistics) - acc_reward = 0.0 - - self._agent.save('./model.ckpt') - - def learn_step(self, state, action, r): - self._global_step += 1 - if (self._global_step > self._warm_up_size): - for i in range(self.env.elevator_num): - self._rpm.append(self._last_observation_array[i], - self._last_action[i], self._last_reward, - deepcopy(state[i]), False) - self._last_observation_array = deepcopy(state) - self._last_action = deepcopy(action) - self._last_reward = r - - ret_dict = {} - if self._rpm.size() > self._warm_up_size: - batch_obs, batch_action, batch_reward, batch_next_obs, 
batch_terminal = \ - self._rpm.sample_batch(BATCH_SIZE) - cost = self._agent.learn(batch_obs, batch_action, batch_reward, - batch_next_obs, batch_terminal) - self._loss_queue.appendleft(cost) - if (len(self._loss_queue) > self._statistic_freq): - self._loss_queue.pop() - if (self._global_step % self._statistic_freq == 0): - ret_dict["Temporal Difference Error(Average)"] = \ - float(sum(self._loss_queue)) / float(len(self._loss_queue)) - - return ret_dict diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/model.py b/examples/LiftSim_baseline/DQN/rl_benchmark/model.py deleted file mode 100644 index 3b2364df90565565f5d4e3286b6662c134cb4c08..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/rl_benchmark/model.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import paddle.fluid as fluid -from parl import layers -import numpy as np -import parl - - -class RLDispatcherModel(parl.Model): - def __init__(self, act_dim): - self._act_dim = act_dim - self._fc_1 = layers.fc(size=512, act='relu') - self._fc_2 = layers.fc(size=256, act='relu') - self._fc_3 = layers.fc(size=128, act='tanh') - self._output = layers.fc(size=act_dim) - - def value(self, obs): - _h_1 = self._fc_1(obs) - _h_2 = self._fc_2(_h_1) - _h_3 = self._fc_3(_h_2) - self._pred = self._output(_h_3) - return self._pred diff --git a/examples/LiftSim_baseline/DQN/wrapper.py b/examples/LiftSim_baseline/DQN/wrapper.py deleted file mode 100644 index 55d525deaeecb76df4f2ba9183ed5ea6c119e5d8..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/wrapper.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# wrapper part modified from -# https://github.com/openai/gym/blob/master/gym/core.py - -from rlschool import LiftSim -from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing -from wrapper_utils import action_idx_to_action - - -class Wrapper(LiftSim): - def __init__(self, env): - self.env = env - self._mansion = env._mansion - self.mansion_attr = self._mansion.attribute - self.elevator_num = self.mansion_attr.ElevatorNumber - self.observation_space = obs_dim(self.mansion_attr) - self.action_space = act_dim(self.mansion_attr) - self.viewer = env.viewer - - def __getattr__(self, name): - if name.startswith('_'): - raise AttributeError( - "attempted to get missing private attribute '{}'".format(name)) - return getattr(self.env, name) - - def seed(self, seed=None): - return self.env.seed(seed) - - def step(self, action): - return self.env.step(action) - - def reset(self): - return self.env.reset() - - def render(self): - return self.env.render() - - def close(self): - return self.env.close() - - -class RewardWrapper(Wrapper): - pass - - -class ActionWrapper(Wrapper): - def reset(self): - return self.env.reset() - - def step(self, action): - act = [] - for a in action: - act.extend(self.action(a, self.action_space)) - return self.env.step(act) - - def action(self, action, action_space): - return action_idx_to_action(action, action_space) - - -class ObservationWrapper(Wrapper): - def reset(self): - self.env.reset() - return self.observation(self._mansion.state) - - def step(self, action): - observation, reward, done, info = self.env.step(action) - return (self.observation(observation), reward, done, info) - - def observation(self, observation): - return mansion_state_preprocessing(observation) - - @property - def state(self): - return self.observation(self._mansion.state) diff --git a/examples/LiftSim_baseline/DQN/wrapper_utils.py b/examples/LiftSim_baseline/DQN/wrapper_utils.py deleted file mode 100644 index 45afcefbf9ebdbacc2841bd54b1756a1213be5bf..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/DQN/wrapper_utils.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import random -import numpy as np -from rlschool import ElevatorState, ElevatorAction -from rlschool import MansionAttribute, MansionState -from rlschool import EPSILON, HUGE -from rlschool import MansionConfig -from rlschool import MansionManager - - -def discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimension 1-hot representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - Args: - value: the value that needs to be discretized into 1-hot format - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 1, active_pos) - active_pos = max(0, active_pos) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - return ret_array - - -def linear_discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimensional representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0 - if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8] - Args: - value: the value that needs to be discretized - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 2, active_pos) - active_pos = max(0, active_pos) - anchor_pt = active_pos * delta + min_val - if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta): - anchor_pt -= delta - active_pos -= 1 - weight = (value - anchor_pt) / delta - weight = min(1.0, max(0.0, weight)) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - weight - ret_array[active_pos + 1] = weight - return ret_array - - -def ele_state_preprocessing(ele_state): - """Process elevator state, make it usable for network - Args: - ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - ele_feature: list of elevator state - """ - ele_feature = [] - - # add floor information - ele_feature.extend( - linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0, - ele_state.MaximumFloor)) - - # add velocity information - ele_feature.extend( - linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed, - ele_state.MaximumSpeed)) - - # add door information - ele_feature.append(ele_state.DoorState) - ele_feature.append(float(ele_state.DoorIsOpening)) - ele_feature.append(float(ele_state.DoorIsClosing)) - - # add direction information - ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1)) - - # add load weight information - ele_feature.extend( - linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0, - 1.0)) - - # add other information - target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)] - for target_floor in ele_state.ReservedTargetFloors: - target_floor_binaries[target_floor - 1] = 1.0 - ele_feature.extend(target_floor_binaries) - - dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)] - 
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0 - ele_feature.extend(dispatch_floor_binaries) - ele_feature.append(ele_state.DispatchTargetDirection) - - return ele_feature - - -def obs_dim(mansion_attr): - """Calculate the observation dimension - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - observation dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - ele_dim = mansion_attr.NumberOfFloor * 3 + 34 - obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \ - mansion_attr.NumberOfFloor * 2 - return obs_dim - - -def act_dim(mansion_attr): - """Calculate the action dimension, which is number of floor times 2 plus 2. - The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0, - the original dispatch_target does not change if dispatch_target is -1. See implementation in - method action_idx_to_action below. - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - action dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - return mansion_attr.NumberOfFloor * 2 + 2 - - -def mansion_state_preprocessing(mansion_state): - """Process mansion_state to make it usable for networks, convert it into a numpy array - Args: - mansion_state: namedtuple of mansion state, - defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - the converted numpy array - """ - ele_features = list() - for ele_state in mansion_state.ElevatorStates: - ele_features.append(ele_state_preprocessing(ele_state)) - max_floor = ele_state.MaximumFloor - - target_floor_binaries_up = [0.0 for i in range(max_floor)] - target_floor_binaries_down = [0.0 for i in range(max_floor)] - for floor in mansion_state.RequiringUpwardFloors: - target_floor_binaries_up[floor - 1] = 1.0 - for floor in mansion_state.RequiringDownwardFloors: - target_floor_binaries_down[floor - 1] = 1.0 - target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down - - idx = 0 - man_features = list() - for idx in range(len(mansion_state.ElevatorStates)): - elevator_id_vec = discretize(idx + 1, - len(mansion_state.ElevatorStates), 1, - len(mansion_state.ElevatorStates)) - idx_array = list(range(len(mansion_state.ElevatorStates))) - idx_array.remove(idx) - # random.shuffle(idx_array) - man_features.append(ele_features[idx]) - for left_idx in idx_array: - man_features[idx] = man_features[idx] + ele_features[left_idx] - man_features[idx] = man_features[idx] + \ - elevator_id_vec + target_floor_binaries - return np.asarray(man_features, dtype='float32') - - -def action_idx_to_action(action_idx, act_dim): - """Convert action_inx to action - Args: - action_idx: the index needed to be converted - act_dim: action dimension - Returns: - the converted namedtuple - """ - assert isinstance(action_idx, int) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action_idx == realdim): - return ElevatorAction(0, 1) - elif (action_idx == realdim + 1): - return ElevatorAction(-1, 1) - action = action_idx - if (action_idx < realdim / 2): - direction = 1 - action += 1 - else: - direction = -1 - action -= int(realdim / 2) - action += 1 - return [action, direction] - - -def action_to_action_idx(action, act_dim): - """Convert action to number according to act_dim. 
- Args: - action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py - act_dim: action dimension - Returns: - action_idx: the result index - """ - assert isinstance(action, ElevatorAction) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action.TargetFloor == 0): - return realdim - elif (action.TargetFloor < 0): - return realdim + 1 - action_idx = 0 - if (action.DirectionIndicator < 0): - action_idx += int(realdim / 2) - action_idx += action.TargetFloor - 1 - return action_idx diff --git a/examples/MADDPG/train.py b/examples/MADDPG/train.py index d0e20dcdb4fd35638432fb1666b76b30c2a388d8..8454a73ee209707c65340897ce9b090d482c6751 100644 --- a/examples/MADDPG/train.py +++ b/examples/MADDPG/train.py @@ -20,7 +20,7 @@ from simple_model import MAModel from simple_agent import MAAgent import parl from parl.env.multiagent_simple_env import MAenv -from parl.utils import logger, tensorboard +from parl.utils import logger, summary def run_episode(env, agents): @@ -62,8 +62,8 @@ def run_episode(env, agents): # learn policy for i, agent in enumerate(agents): critic_loss = agent.learn(agents) - tensorboard.add_scalar('critic_loss_%d' % i, critic_loss, - agent.global_train_step) + summary.add_scalar('critic_loss_%d' % i, critic_loss, + agent.global_train_step) return total_reward, agents_reward, steps @@ -155,12 +155,12 @@ def train_agent(): format(total_steps, total_episodes, mean_episode_reward, use_time)) t_start = time.time() - tensorboard.add_scalar('mean_episode_reward/episode', - mean_episode_reward, total_episodes) - tensorboard.add_scalar('mean_episode_reward/steps', - mean_episode_reward, total_steps) - tensorboard.add_scalar('use_time/1000episode', use_time, - total_episodes) + summary.add_scalar('mean_episode_reward/episode', + mean_episode_reward, total_episodes) + summary.add_scalar('mean_episode_reward/steps', + mean_episode_reward, total_steps) + summary.add_scalar('use_time/1000episode', use_time, + total_episodes) # save model if not args.restore: diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py index e784dae00f9ffdc5528a4c7dafda2916e5d4c456..e3a8066d79128ed9e969bb7d4c1c8cce3bee3775 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py index b37fb369a15c8a28a3911dbb9a864cf28d1da8b7..cf14f1e0306c69c8f134cf6c81c279ac982b52d0 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info @@ -97,7 +97,7 @@ class 
Learner(object): # add lock between training and predicting self.model_lock = threading.Lock() - # add lock when appending data to rpm or writing scalars to tensorboard + # add lock when appending data to rpm or writing scalars to summary self.memory_lock = threading.Lock() self.ready_actor_queue = queue.Queue() @@ -246,24 +246,24 @@ class Learner(object): episode_env_reward) if self.env_reward_stat.count > 500: - tensorboard.add_scalar('recent_env_reward', - self.env_reward_stat.mean, - self.total_steps) - tensorboard.add_scalar('recent_shaping_reward', - self.shaping_reward_stat.mean, - self.total_steps) - if self.critic_loss_stat.count > 500: - tensorboard.add_scalar('recent_critic_loss', - self.critic_loss_stat.mean, - self.total_steps) - tensorboard.add_scalar('episode_length', n, self.total_steps) - tensorboard.add_scalar('max_env_reward', self.max_env_reward, + summary.add_scalar('recent_env_reward', + self.env_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('ready_actor_num', - self.ready_actor_queue.qsize(), + summary.add_scalar('recent_shaping_reward', + self.shaping_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('episode_time', episode_time, + if self.critic_loss_stat.count > 500: + summary.add_scalar('recent_critic_loss', + self.critic_loss_stat.mean, self.total_steps) + summary.add_scalar('episode_length', n, self.total_steps) + summary.add_scalar('max_env_reward', self.max_env_reward, + self.total_steps) + summary.add_scalar('ready_actor_num', + self.ready_actor_queue.qsize(), + self.total_steps) + summary.add_scalar('episode_time', episode_time, + self.total_steps) self.noiselevel = self.noiselevel * NOISE_DECAY diff --git a/examples/SAC/train.py b/examples/SAC/train.py index a88260245880a39738f931573dd0b183487722df..3e2b7140e9ab5694c38bd86ded04a5e977da9d3a 100644 --- a/examples/SAC/train.py +++ b/examples/SAC/train.py @@ -21,7 +21,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import ActorModel, CriticModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory ACTOR_LR = 1e-3 CRITIC_LR = 1e-3 @@ -111,8 +111,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -120,8 +119,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/TD3/train.py b/examples/TD3/train.py index 4cb74d9c01ab73dcb8cb20385b36262cb7c4aeba..8115a41ba1129e00dda1f2a7ca1b0ad3b9d64c71 100644 --- a/examples/TD3/train.py +++ b/examples/TD3/train.py @@ -19,7 +19,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory MAX_EPISODES = 5000 ACTOR_LR = 3e-4 @@ -117,8 +117,7 @@ def main(): train_reward, steps = 
run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -126,8 +125,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py index 281ea4504e030f5b7296349a5912a07b30fdec27..d7da430e83de46be82a935bc01ce35ca6bd83c6e 100644 --- a/examples/offline-Q-learning/parallel_run.py +++ b/examples/offline-Q-learning/parallel_run.py @@ -22,7 +22,7 @@ from tqdm import tqdm import parl import paddle.fluid as fluid from parl.utils import get_gpu_count -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from dqn import DQN # slight changes from parl.algorithms.DQN from atari_agent import AtariAgent diff --git a/parl/algorithms/fluid/impala/impala.py b/parl/algorithms/fluid/impala/impala.py index 0007a9c0536cc321989b04807dcee6c7428a30ff..a7adf56ee28f3ec14f304a9a8b163aae31805fda 100644 --- a/parl/algorithms/fluid/impala/impala.py +++ b/parl/algorithms/fluid/impala/impala.py @@ -90,7 +90,7 @@ class IMPALA(Algorithm): vf_loss_coeff=None, clip_rho_threshold=None, clip_pg_rho_threshold=None): - """ IMPALA algorithm + r""" IMPALA algorithm Args: model (parl.Model): forward network of policy and value diff --git a/parl/utils/summary.py b/parl/utils/summary.py new file mode 100644 index 0000000000000000000000000000000000000000..575fc6b9976906e43dddeb7da2ea1ef32d4644c1 --- /dev/null +++ b/parl/utils/summary.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from tensorboardX import SummaryWriter
+from parl.utils import logger
+
+__all__ = []
+
+_writer = None
+_WRITER_METHOD = ['add_scalar', 'add_histogram', 'close', 'flush']
+
+
+def create_file_after_first_call(func_name):
+    def call(*args, **kwargs):
+        global _writer
+        if _writer is None:
+            logdir = logger.get_dir()
+            if logdir is None:
+                logdir = logger.auto_set_dir(action='d')
+                logger.warning(
+                    "[tensorboard] logdir is None, will save tensorboard files to {}"
+                    .format(logdir))
+            _writer = SummaryWriter(logdir=logger.get_dir())
+        func = getattr(_writer, func_name)
+        func(*args, **kwargs)
+        _writer.flush()
+
+    return call
+
+
+# export writer functions
+for func_name in _WRITER_METHOD:
+    locals()[func_name] = create_file_after_first_call(func_name)
+    __all__.append(func_name)
diff --git a/parl/utils/tests/tensorboard_test.py b/parl/utils/tests/summary_test.py
similarity index 79%
rename from parl/utils/tests/tensorboard_test.py
rename to parl/utils/tests/summary_test.py
index 65fcb82404adfe461395e594dcf112ea41fd330e..670abcccc35e3039cb1e93ea3462855bb84a503a 100644
--- a/parl/utils/tests/tensorboard_test.py
+++ b/parl/utils/tests/summary_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from parl.utils import tensorboard
+from parl.utils import summary
 import numpy as np
 from parl.utils import logger
 import os
@@ -20,18 +20,18 @@ import os
 
 class TestUtils(unittest.TestCase):
     def tearDown(self):
-        tensorboard.flush()
+        summary.flush()
 
     def test_add_scalar(self):
         x = range(100)
         for i in x:
-            tensorboard.add_scalar('y=2x', i * 2, i)
-        self.assertTrue(os.path.exists('./train_log/tensorboard_test'))
+            summary.add_scalar('y=2x', i * 2, i)
+        self.assertTrue(os.path.exists('./train_log/summary_test'))
 
     def test_add_histogram(self):
         for i in range(10):
             x = np.random.random(1000)
-            tensorboard.add_histogram('distribution centers', x + i, i)
+            summary.add_histogram('distribution centers', x + i, i)
 
 
 if __name__ == '__main__':
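
Note for reviewers trying the patch locally: a minimal usage sketch of the new parl.utils.summary wrapper follows. It assumes tensorboardX is installed and that no logger directory has been set, in which case the first write falls back to an auto-created ./train_log/<script_name> directory (the behavior exercised by the renamed unit test). The tag and values are illustrative only.

# Rough usage sketch (assumption: the parl.utils.summary module added by this
# patch is importable and tensorboardX is installed).
from parl.utils import logger, summary

# No writer exists yet; the first call below lazily creates the SummaryWriter
# under logger.get_dir() (or an auto-selected ./train_log/<script_name> dir
# when no logger dir has been set) and flushes after every write.
for step in range(10):
    summary.add_scalar('demo/step_squared', step ** 2, step)

print('event files written to:', logger.get_dir())
summary.flush()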