diff --git a/benchmark/torch/a2c/train.py b/benchmark/torch/a2c/train.py
index f2985367f8304edb6bccc93f894a7d04f5f305c8..9a498023988bc72a0a0aa43d4850c25ced8d2856 100644
--- a/benchmark/torch/a2c/train.py
+++ b/benchmark/torch/a2c/train.py
@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
from parl.utils.window_stat import WindowStat
from parl.utils.time_stat import TimeStat
from parl.utils import machine_info
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
from parl.algorithms import A2C
from atari_model import ActorCritic
@@ -205,19 +205,19 @@ class Learner(object):
}
if metric['mean_episode_rewards'] is not None:
- tensorboard.add_scalar('train/mean_reward',
- metric['mean_episode_rewards'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/total_loss', metric['total_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/pi_loss', metric['pi_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/vf_loss', metric['vf_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/entropy', metric['entropy'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/learn_rate', metric['lr'],
- self.sample_total_steps)
+ summary.add_scalar('train/mean_reward',
+ metric['mean_episode_rewards'],
+ self.sample_total_steps)
+ summary.add_scalar('train/total_loss', metric['total_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/pi_loss', metric['pi_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/vf_loss', metric['vf_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/entropy', metric['entropy'],
+ self.sample_total_steps)
+ summary.add_scalar('train/learn_rate', metric['lr'],
+ self.sample_total_steps)
logger.info(metric)
diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py
index 26d24d8b1a3cc88bdb26aec954094f282e463003..ba64b95c93a9b4879621331ad30cce3cbcbcac16 100644
--- a/benchmark/torch/dqn/train.py
+++ b/benchmark/torch/dqn/train.py
@@ -22,7 +22,7 @@ import parl
import numpy as np
from tqdm import tqdm
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
from parl.algorithms import DQN, DDQN
from agent import AtariAgent
@@ -152,18 +152,17 @@ def main():
for _ in range(3):
eval_rewards.append(run_evaluate_episode(test_env, agent))
- tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards),
- total_steps)
- tensorboard.add_scalar('dqn/score', total_reward, total_steps)
- tensorboard.add_scalar('dqn/loss', loss, total_steps)
- tensorboard.add_scalar('dqn/exploration', agent.exploration,
- total_steps)
- tensorboard.add_scalar('dqn/Q value',
- evaluate_fixed_Q(agent, fixed_obs),
- total_steps)
- tensorboard.add_scalar('dqn/grad_norm',
- get_grad_norm(agent.alg.model),
- total_steps)
+ summary.add_scalar('dqn/eval', np.mean(eval_rewards),
+ total_steps)
+ summary.add_scalar('dqn/score', total_reward, total_steps)
+ summary.add_scalar('dqn/loss', loss, total_steps)
+ summary.add_scalar('dqn/exploration', agent.exploration,
+ total_steps)
+ summary.add_scalar('dqn/Q value',
+ evaluate_fixed_Q(agent, fixed_obs),
+ total_steps)
+ summary.add_scalar('dqn/grad_norm',
+ get_grad_norm(agent.alg.model), total_steps)
if __name__ == '__main__':
diff --git a/benchmark/torch/td3/train.py b/benchmark/torch/td3/train.py
index c844d8c079a4b10e1e0ade957202cd7d2dcd27fb..48bd1f77103f1e50bd28f55cc12bee09315496e7 100644
--- a/benchmark/torch/td3/train.py
+++ b/benchmark/torch/td3/train.py
@@ -15,7 +15,7 @@
import gym
import argparse
import numpy as np
-from parl.utils import logger, tensorboard, ReplayMemory
+from parl.utils import logger, summary, ReplayMemory
from mujoco_model import MujocoModel
from mujoco_agent import MujocoAgent
@@ -103,8 +103,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
- tensorboard.add_scalar('train/episode_reward', train_reward,
- total_steps)
+ summary.add_scalar('train/episode_reward', train_reward, total_steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
@@ -112,8 +111,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
- tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
- total_steps)
+ summary.add_scalar('eval/episode_reward', evaluate_reward,
+ total_steps)
if __name__ == '__main__':
diff --git a/docs/tutorial/tensorboard.rst b/docs/tutorial/tensorboard.rst
index 35869af6506dfebcdb64aa4bf1d7bc876f101a0b..8952a5e00b624e1c02b74c451da0d168ee6a4817 100644
--- a/docs/tutorial/tensorboard.rst
+++ b/docs/tutorial/tensorboard.rst
@@ -1,14 +1,14 @@
-Tensorboard
+summary
===============
-Visualize the results with tensorboard.
+Visualize the results with tensorboard.
add_scalar
-------------
Common used arguments:
-* tensorboard.add_scalar(tag, scalar_value, global_step=None)
+* summary.add_scalar(tag, scalar_value, global_step=None)
* tag *(string)* – Data identifier
* scalar_value *(float or string/blobname)* – Value to save
* global_step *(int)* – Global step value to record
@@ -17,11 +17,11 @@ Example:
.. code-block:: python
- from parl.utils import tensorboard
+ from parl.utils import summary
x = range(100)
for i in x:
- tensorboard.add_scalar('y=2x', i * 2, i)
+ summary.add_scalar('y=2x', i * 2, i)
Expected result:
@@ -33,7 +33,7 @@ add_histogram
Common used arguments:
-* tensorboard.add_scalar(tag, scalar_value, global_step=None)
+* summary.add_histogram(tag, values, global_step=None)
* tag *(string)* – Data identifier
* values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
* global_step *(int)* – Global step value to record
@@ -42,12 +42,12 @@ Example:
.. code-block:: python
- from parl.utils import tensorboard
+ from parl.utils import summary
import numpy as np
for i in range(10):
x = np.random.random(1000)
- tensorboard.add_histogram('distribution centers', x + i, i)
+ summary.add_histogram('distribution centers', x + i, i)
Expected result:
diff --git a/examples/A2C/train.py b/examples/A2C/train.py
index 1daba4736f8e19e9f50d425c12868a43ee0933db..4050a413f0262ec62f76bbc07062578d6a398d5c 100755
--- a/examples/A2C/train.py
+++ b/examples/A2C/train.py
@@ -25,7 +25,7 @@ from atari_agent import AtariAgent
from collections import defaultdict
from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat
@@ -186,7 +186,7 @@ class Learner(object):
min_episode_steps = np.min(np.array(episode_steps).flatten())
metric = {
- 'Sample steps': self.sample_total_steps,
+ 'sample_steps': self.sample_total_steps,
'max_episode_rewards': max_episode_rewards,
'mean_episode_rewards': mean_episode_rewards,
'min_episode_rewards': min_episode_rewards,
@@ -205,7 +205,7 @@ class Learner(object):
for key, value in metric.items():
if value is not None:
- tensorboard.add_scalar(key, value, self.sample_total_steps)
+ summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric)
diff --git a/examples/DQN_variant/train.py b/examples/DQN_variant/train.py
index 0bd22797892783758b1059a4df2a06e4096736f6..5ca16df135c346a08f87efc6a694e1e289b8192c 100644
--- a/examples/DQN_variant/train.py
+++ b/examples/DQN_variant/train.py
@@ -22,7 +22,7 @@ from atari_agent import AtariAgent
from atari_model import AtariModel
from datetime import datetime
from replay_memory import ReplayMemory, Experience
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
from tqdm import tqdm
from utils import get_player
@@ -120,11 +120,9 @@ def main():
total_reward, steps, loss = run_train_episode(env, agent, rpm)
total_steps += steps
pbar.set_description('[train]exploration:{}'.format(agent.exploration))
- tensorboard.add_scalar('dqn/score', total_reward, total_steps)
- tensorboard.add_scalar('dqn/loss', loss,
- total_steps) # mean of total loss
- tensorboard.add_scalar('dqn/exploration', agent.exploration,
- total_steps)
+ summary.add_scalar('dqn/score', total_reward, total_steps)
+ summary.add_scalar('dqn/loss', loss, total_steps) # mean of total loss
+ summary.add_scalar('dqn/exploration', agent.exploration, total_steps)
pbar.update(steps)
if total_steps // args.test_every_steps >= test_flag:
@@ -139,7 +137,7 @@ def main():
"eval_agent done, (steps, eval_reward): ({}, {})".format(
total_steps, np.mean(eval_rewards)))
eval_test = np.mean(eval_rewards)
- tensorboard.add_scalar('dqn/eval', eval_test, total_steps)
+ summary.add_scalar('dqn/eval', eval_test, total_steps)
pbar.close()
diff --git a/examples/ES/README.md b/examples/ES/README.md
index 207ae2dafa68c5f7d2eb30f956355b07c1bd5d61..d868202753fa34c0799c8c58975c958aa1ffe001 100644
--- a/examples/ES/README.md
+++ b/examples/ES/README.md
@@ -34,7 +34,7 @@ Then we can start the distributed training by running:
python train.py
```
-Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data.
+Training results will be saved in `train_log` along with the training curve.
### Reference
+ [Ray](https://github.com/ray-project/ray)
diff --git a/examples/ES/train.py b/examples/ES/train.py
index be2c7d703eeba39931312491274f554ee9a76562..eadf26ea6e7d736abe45e7c08d25a5c7ae8dda2e 100644
--- a/examples/ES/train.py
+++ b/examples/ES/train.py
@@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
from noise import SharedNoiseTable
-from parl.utils import logger, tensorboard
+from parl.utils import logger, summary
from parl.utils.window_stat import WindowStat
from six.moves import queue
from actor import Actor
@@ -202,7 +202,7 @@ class Learner(object):
logger.info(metrics)
for k, v in metrics.items():
if v is not None:
- tensorboard.add_scalar(k, v, self.sample_total_steps)
+ summary.add_scalar(k, v, self.sample_total_steps)
if __name__ == '__main__':
diff --git a/examples/GA3C/train.py b/examples/GA3C/train.py
index edc7f33344bc484fff640700dfd80bfc35987843..30f3a415b77cfa83d8868606498379b528ad1c31 100755
--- a/examples/GA3C/train.py
+++ b/examples/GA3C/train.py
@@ -24,7 +24,7 @@ from atari_model import AtariModel
from atari_agent import AtariAgent
from collections import defaultdict
from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat
@@ -313,7 +313,7 @@ class Learner(object):
for key, value in metric.items():
if value is not None:
- tensorboard.add_scalar(key, value, self.sample_total_steps)
+ summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric)
diff --git a/examples/IMPALA/train.py b/examples/IMPALA/train.py
index cf9e55c54d1df8a14fc1751ac75303c6adab42ad..9f2a3e65a7962d0aed103318c4a1979520004f8f 100755
--- a/examples/IMPALA/train.py
+++ b/examples/IMPALA/train.py
@@ -22,7 +22,7 @@ import parl
from atari_model import AtariModel
from atari_agent import AtariAgent
from parl.env.atari_wrappers import wrap_deepmind
-from parl.utils import logger, tensorboard, get_gpu_count
+from parl.utils import logger, summary, get_gpu_count
from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat
@@ -221,7 +221,7 @@ class Learner(object):
min_episode_steps = np.min(np.array(episode_steps).flatten())
metric = {
- 'Sample steps': self.sample_total_steps,
+ 'sample_steps': self.sample_total_steps,
'max_episode_rewards': max_episode_rewards,
'mean_episode_rewards': mean_episode_rewards,
'min_episode_rewards': min_episode_rewards,
@@ -244,7 +244,7 @@ class Learner(object):
for key, value in metric.items():
if value is not None:
- tensorboard.add_scalar(key, value, self.sample_total_steps)
+ summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric)
diff --git a/examples/LiftSim_baseline/A2C/README.md b/examples/LiftSim_baseline/A2C/README.md
deleted file mode 100644
index 235e1aff2c956fd1ec60d999e0c34328f949d80c..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/README.md
+++ /dev/null
@@ -1,47 +0,0 @@
-# LiftSim基线
-
-## 简介
-
-基于PARL库实现A2C算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。
-
-## 依赖库
-
-+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle)
-+ [parl>=1.2.3](https://github.com/PaddlePaddle/PARL)
-+ [rlschool>=0.1.1][rlschool]
-
-
-## 分布式训练
-
-首先,启动一个具有5个CPU资源的本地集群:
-
-```bash
-xparl start --port 8010 --cpu_num 5
-```
-
-> 注意,如果你已经启动了一个集群,则不需要重复运行上面命令。关于PARL集群更多信息,可以参考[文档](https://parl.readthedocs.io/en/latest/parallel_training/setup.html)。
-
-然后我们就可以通过运行下面命令进行分布式训练:
-
-```bash
-python train.py
-```
-
-
-## 评估
-可以通过下面命令来评估保存的模型
-```bash
-python evaluate.py --model_path saved_models/[FILENAME]
-```
-
-tensorboard和log文件会保存在`./train_log/train/`;可以通过运行命令`tensorboard --logdir .`查看tensorboard可视化界面。
-
-## 收敛指标
-训练30h左右,评估指标能达到-120分左右(LiftSim环境运行1天reward)
-
-
-## 可视化效果
-
-
-[rlschool]: https://github.com/PaddlePaddle/RLSchool
-[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
diff --git a/examples/LiftSim_baseline/A2C/a2c_config.py b/examples/LiftSim_baseline/A2C/a2c_config.py
deleted file mode 100644
index e3e776b2f1033bc6bd911ed929437f91c44bf938..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/a2c_config.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-config = {
- #========== remote config ==========
- 'master_address': 'localhost:8010',
-
- #========== actor config ==========
- 'actor_num': 5,
- 'env_num': 5,
- 'sample_batch_steps': 5,
-
- #========== learner config ==========
- 'max_sample_steps': int(1e10),
- 'gamma': 0.998,
- 'lambda': 1.0, # GAE
-
- # start learning rate
- 'start_lr': 1.0e-4,
-
- # coefficient of policy entropy adjustment schedule: (train_step, coefficient)
- 'entropy_coeff_scheduler': [(0, -2.0e-4)],
- 'vf_loss_coeff': 0.5,
- 'get_remote_metrics_interval': 100,
- 'log_metrics_interval_s': 60,
-}
diff --git a/examples/LiftSim_baseline/A2C/actor.py b/examples/LiftSim_baseline/A2C/actor.py
deleted file mode 100644
index 4286e7f9c4dd64b444c2ca34d5ebffe870b2769f..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/actor.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import parl
-from collections import defaultdict
-from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper, MetricsWrapper
-from parl.utils.rl_utils import calc_gae
-from parl.env.vector_env import VectorEnv
-from rlschool import LiftSim
-from copy import deepcopy
-from lift_model import LiftModel
-from lift_agent import LiftAgent
-
-
-@parl.remote_class
-class Actor(object):
- def __init__(self, config):
- self.config = config
- self.env_num = config['env_num']
-
- self.envs = []
- for _ in range(self.env_num):
- env = LiftSim()
- env = RewardWrapper(env)
- env = ActionProcessWrapper(env)
- env = ObsProcessWrapper(env)
- env = MetricsWrapper(env)
- self.envs.append(env)
- self.vector_env = VectorEnv(self.envs)
-
- # number of elevators
- self.ele_num = self.envs[0].mansion_attr.ElevatorNumber
-
- act_dim = self.envs[0].act_dim
- self.obs_dim = self.envs[0].obs_dim
- self.config['obs_dim'] = self.obs_dim
-
- # nested list of shape (env_num, ele_num, obs_dim)
- self.obs_batch = self.vector_env.reset()
- # (env_num * ele_num, obs_dim)
- self.obs_batch = np.array(self.obs_batch).reshape(
- [self.env_num * self.ele_num, self.obs_dim])
-
- model = LiftModel(act_dim)
- algorithm = parl.algorithms.A3C(
- model, vf_loss_coeff=config['vf_loss_coeff'])
- self.agent = LiftAgent(algorithm, config)
-
- def sample(self):
- sample_data = defaultdict(list)
-
- env_sample_data = {}
- # treat each elevator in Liftsim as an independent env
- for env_id in range(self.env_num * self.ele_num):
- env_sample_data[env_id] = defaultdict(list)
-
- for i in range(self.config['sample_batch_steps']):
- actions_batch, values_batch = self.agent.sample(self.obs_batch)
-
- vector_actions = np.array_split(actions_batch, self.env_num)
- assert len(vector_actions[-1]) == self.ele_num
- next_obs_batch, reward_batch, done_batch, info_batch = \
- self.vector_env.step(vector_actions)
-
- # (env_num, ele_num, obs_dim) -> (env_num * ele_num, obs_dim)
- next_obs_batch = np.array(next_obs_batch).reshape(
- [self.env_num * self.ele_num, self.obs_dim])
- # repeat reward and done to ele_num times
- # (env_num) -> (env_num, ele_num) -> (env_num * ele_num)
- reward_batch = np.repeat(reward_batch, self.ele_num)
- done_batch = np.repeat(done_batch, self.ele_num)
-
- for env_id in range(self.env_num * self.ele_num):
- env_sample_data[env_id]['obs'].append(self.obs_batch[env_id])
- env_sample_data[env_id]['actions'].append(
- actions_batch[env_id])
- env_sample_data[env_id]['rewards'].append(reward_batch[env_id])
- env_sample_data[env_id]['dones'].append(done_batch[env_id])
- env_sample_data[env_id]['values'].append(values_batch[env_id])
-
- # Calculate advantages when the episode is done or reaches max sample steps.
- if done_batch[env_id] or i + 1 == self.config[
- 'sample_batch_steps']: # reach max sample steps
- next_value = 0
- if not done_batch[env_id]:
- next_obs = np.expand_dims(next_obs_batch[env_id], 0)
- next_value = self.agent.value(next_obs)
-
- values = env_sample_data[env_id]['values']
- rewards = env_sample_data[env_id]['rewards']
- advantages = calc_gae(rewards, values, next_value,
- self.config['gamma'],
- self.config['lambda'])
- target_values = advantages + values
-
- sample_data['obs'].extend(env_sample_data[env_id]['obs'])
- sample_data['actions'].extend(
- env_sample_data[env_id]['actions'])
- sample_data['advantages'].extend(advantages)
- sample_data['target_values'].extend(target_values)
-
- env_sample_data[env_id] = defaultdict(list)
-
- self.obs_batch = deepcopy(next_obs_batch)
-
- # size of sample_data[key]: env_num * ele_num * sample_batch_steps
- for key in sample_data:
- sample_data[key] = np.stack(sample_data[key])
-
- return sample_data
-
- def get_metrics(self):
- metrics = defaultdict(list)
- for metrics_env in self.envs:
- assert isinstance(
- metrics_env,
- MetricsWrapper), "Put the MetricsWrapper in the last wrapper"
- for env_reward_1h, env_reward_24h in metrics_env.next_episode_results(
- ):
- metrics['env_reward_1h'].append(env_reward_1h)
- metrics['env_reward_24h'].append(env_reward_24h)
- return metrics
-
- def set_weights(self, params):
- self.agent.set_weights(params)
diff --git a/examples/LiftSim_baseline/A2C/effect.gif b/examples/LiftSim_baseline/A2C/effect.gif
deleted file mode 100644
index b796acbf3c872f0e960e731fb99848c774446165..0000000000000000000000000000000000000000
Binary files a/examples/LiftSim_baseline/A2C/effect.gif and /dev/null differ
diff --git a/examples/LiftSim_baseline/A2C/env_wrapper.py b/examples/LiftSim_baseline/A2C/env_wrapper.py
deleted file mode 100644
index c8d08fac40d1cc6fff6f46e97e2bac649a79bc1b..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/env_wrapper.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from copy import deepcopy
-import numpy as np
-from utils import discretize, linear_discretize
-from rlschool import LiftSim
-
-
-class BaseWrapper(object):
- def __init__(self, env):
- self.env = env
- self._mansion = env._mansion
- self.mansion_attr = self._mansion.attribute
-
- @property
- def obs_dim(self):
- if hasattr(self.env, 'obs_dim'):
- return self.env.obs_dim
- else:
- return None
-
- @property
- def act_dim(self):
- if hasattr(self.env, 'act_dim'):
- return self.env.act_dim
- else:
- return None
-
- def seed(self, seed=None):
- return self.env.seed(seed)
-
- def step(self, action):
- return self.env.step(action)
-
- def reset(self):
- return self.env.reset()
-
- def render(self):
- return self.env.render()
-
- def close(self):
- return self.env.close()
-
-
-class ObsProcessWrapper(BaseWrapper):
- """Extract features of each elevator in LiftSim env
- """
-
- def __init__(self, env, hour_distize_num=6):
- super(ObsProcessWrapper, self).__init__(env)
- self.hour_distize_num = hour_distize_num
- self.total_steps = 0
-
- @property
- def obs_dim(self):
- """
- NOTE:
- Keep obs_dim to the return size of function `_mansion_state_process`
- """
- ele_dim = self.mansion_attr.NumberOfFloor * 3 + 34
- obs_dim = (ele_dim + 1) * self.mansion_attr.ElevatorNumber + \
- self.mansion_attr.NumberOfFloor * 2
- obs_dim += self.hour_distize_num
- return obs_dim
-
- def reset(self):
- """
-
- Returns:
- obs(list): [[self.obs_dim]] * mansion_attr.ElevatorNumber, features array of all elevators
- """
- obs = self.env.reset()
- self.total_steps = 0
- obs = self._mansion_state_process(obs)
- return obs
-
- def step(self, action):
- """
- Returns:
- obs(list): nested list, shape of [mansion_attr.ElevatorNumber, self.obs_dim],
- features array of all elevators
- reward(int): returned by self.env
- done(bool): returned by self.env
- info(dict): returned by self.env
- """
- obs, reward, done, info = self.env.step(action)
- self.total_steps += 1
- obs = self._mansion_state_process(obs)
- return obs, reward, done, info
-
- def _mansion_state_process(self, mansion_state):
- """Extract features of env
- """
- ele_features = list()
- for ele_state in mansion_state.ElevatorStates:
- ele_features.append(self._ele_state_process(ele_state))
- max_floor = ele_state.MaximumFloor
-
- target_floor_binaries_up = [0.0 for i in range(max_floor)]
- target_floor_binaries_down = [0.0 for i in range(max_floor)]
- for floor in mansion_state.RequiringUpwardFloors:
- target_floor_binaries_up[floor - 1] = 1.0
- for floor in mansion_state.RequiringDownwardFloors:
- target_floor_binaries_down[floor - 1] = 1.0
- target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down
-
- raw_time = self.total_steps * 0.5 # timestep seconds
- time_id = int(raw_time % 86400)
- time_id = time_id // (24 / self.hour_distize_num * 3600)
- time_id_vec = discretize(time_id + 1, self.hour_distize_num, 1,
- self.hour_distize_num)
-
- man_features = list()
- for idx in range(len(mansion_state.ElevatorStates)):
- elevator_id_vec = discretize(idx + 1,
- len(mansion_state.ElevatorStates), 1,
- len(mansion_state.ElevatorStates))
- idx_array = list(range(len(mansion_state.ElevatorStates)))
- idx_array.remove(idx)
- man_features.append(ele_features[idx])
- for left_idx in idx_array:
- man_features[idx] = man_features[idx] + ele_features[left_idx]
- man_features[idx] = man_features[idx] + \
- elevator_id_vec + target_floor_binaries
- man_features[idx] = man_features[idx] + time_id_vec
- return np.asarray(man_features, dtype='float32')
-
- def _ele_state_process(self, ele_state):
- """Extract features of elevator
- """
- ele_feature = []
-
- # add floor information
- ele_feature.extend(
- linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0,
- ele_state.MaximumFloor))
-
- # add velocity information
- ele_feature.extend(
- linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed,
- ele_state.MaximumSpeed))
-
- # add door information
- ele_feature.append(ele_state.DoorState)
- ele_feature.append(float(ele_state.DoorIsOpening))
- ele_feature.append(float(ele_state.DoorIsClosing))
-
- # add direction information
- ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1))
-
- # add load weight information
- ele_feature.extend(
- linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5,
- 0.0, 1.0))
-
- # add other information
- target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)]
- for target_floor in ele_state.ReservedTargetFloors:
- target_floor_binaries[target_floor - 1] = 1.0
- ele_feature.extend(target_floor_binaries)
-
- dispatch_floor_binaries = [
- 0.0 for i in range(ele_state.MaximumFloor + 1)
- ]
- dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0
- ele_feature.extend(dispatch_floor_binaries)
- ele_feature.append(ele_state.DispatchTargetDirection)
-
- return ele_feature
-
-
-class ActionProcessWrapper(BaseWrapper):
- def __init__(self, env):
- """Map action id predicted by model to action of LiftSim
-
- """
- super(ActionProcessWrapper, self).__init__(env)
-
- @property
- def act_dim(self):
- """
- NOTE:
- keep act_dim in line with function `_action_idx_to_action`
-
- Returns:
- int: NumberOfFloor * (2 directions) + (-1 DispatchTarget) + (0 DispatchTarget)
- """
- return self.mansion_attr.NumberOfFloor * 2 + 2
-
- def step(self, action):
- """
- Args:
- action(list): action_id of all elevators (length = mansion_attr.ElevatorNumber)
- """
- ele_actions = []
- for action_id in action:
- ele_actions.extend(self._action_idx_to_action(action_id))
-
- # ele_action: list, formatted action for LiftSim env (length = 2 * mansion_attr.ElevatorNumber)
- return self.env.step(ele_actions)
-
- def _action_idx_to_action(self, action_idx):
- action_idx = int(action_idx)
- realdim = self.act_dim - 2
- if (action_idx == realdim):
- return (0, 1) # mapped to DispatchTarget=0
- elif (action_idx == realdim + 1):
- return (-1, 1) # mapped to DispatchTarget=-1
- action = action_idx
- if (action_idx < realdim / 2):
- direction = 1 # up direction
- action += 1
- else:
- direction = -1 # down direction
- action -= int(realdim / 2)
- action += 1
- return (action, direction)
-
-
-class RewardWrapper(BaseWrapper):
- def __init__(self, env):
- """Design reward of LiftSim env.
- """
- super(RewardWrapper, self).__init__(env)
- self.ele_num = self.mansion_attr.ElevatorNumber
-
- def step(self, action):
- """Here we return same reward for each elevator,
- you alos can design different rewards of each elevator.
-
- Returns:
- obs: returned by self.env
- reward: shaping reward
- done: returned by self.env
- info: returned by self.env
- """
- obs, origin_reward, done, info = self.env.step(action)
-
- reward = -(30 * info['time_consume'] + 0.01 * info['energy_consume'] +
- 100 * info['given_up_persons']) * 1.0e-3 / self.ele_num
-
- info['origin_reward'] = origin_reward
-
- return obs, reward, done, info
-
-
-class MetricsWrapper(BaseWrapper):
- def __init__(self, env):
- super(MetricsWrapper, self).__init__(env)
-
- self._total_steps = 0
- self._env_reward_1h = 0
- self._env_reward_24h = 0
-
- self._num_returned = 0
- self._episode_result = []
-
- def reset(self):
- self._total_steps = 0
- self._env_reward_1h = 0
- self._env_reward_24h = 0
- return self.env.reset()
-
- def step(self, action):
- obs, reward, done, info = self.env.step(action)
- self._total_steps += 1
-
- self._env_reward_1h += info['origin_reward']
- self._env_reward_24h += info['origin_reward']
-
- # Treat 1h in LiftSim env as an episode (1step = 0.5s)
- if self._total_steps % (3600 * 2) == 0: # 1h
- episode_env_reward_1h = self._env_reward_1h
- self._env_reward_1h = 0
-
- episode_env_reward_24h = None
- if self._total_steps % (24 * 3600 * 2) == 0: # 24h
- episode_env_reward_24h = self._env_reward_24h
- self._env_reward_24h = 0
-
- self._episode_result.append(
- [episode_env_reward_1h, episode_env_reward_24h])
-
- return obs, reward, done, info
-
- def next_episode_results(self):
- for i in range(self._num_returned, len(self._episode_result)):
- yield self._episode_result[i]
- self._num_returned = len(self._episode_result)
diff --git a/examples/LiftSim_baseline/A2C/evaluate.py b/examples/LiftSim_baseline/A2C/evaluate.py
deleted file mode 100644
index d004aa4255e7f54d123e2f11de80158e60319e1c..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/evaluate.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import parl
-from parl.utils import logger
-from env_wrapper import ObsProcessWrapper, ActionProcessWrapper, RewardWrapper
-from rlschool import LiftSim
-from lift_model import LiftModel
-from lift_agent import LiftAgent
-from a2c_config import config
-
-
-def evaluate_one_day(model_path):
- env = LiftSim()
- env = ActionProcessWrapper(env)
- env = ObsProcessWrapper(env)
- act_dim = env.act_dim
- obs_dim = env.obs_dim
- config['obs_dim'] = obs_dim
-
- model = LiftModel(act_dim)
- algorithm = parl.algorithms.A3C(
- model, vf_loss_coeff=config['vf_loss_coeff'])
- agent = LiftAgent(algorithm, config)
- agent.restore(model_path)
-
- reward_24h = 0
- obs = env.reset()
- for i in range(24 * 3600 * 2): # 24h, 1step = 0.5s
- action, _ = agent.sample(obs)
- #print(action)
- obs, reward, done, info = env.step(action)
- reward_24h += reward
- if (i + 1) % (3600 * 2) == 0:
- logger.info('hour {}, total_reward: {}'.format(
- (i + 1) // (3600 * 2), reward_24h))
-
- logger.info('model_path: {}, 24h reward: {}'.format(
- model_path, reward_24h))
-
-
-if __name__ == '__main__':
- import argparse
- parser = argparse.ArgumentParser()
- parser.add_argument(
- '--model_path', type=str, help='path of the model to evaluate.')
- args = parser.parse_args()
-
- evaluate_one_day(args.model_path)
diff --git a/examples/LiftSim_baseline/A2C/lift_agent.py b/examples/LiftSim_baseline/A2C/lift_agent.py
deleted file mode 100644
index 1dd35e59a3c840b6468f5d95bcc6fccc07445bc8..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/lift_agent.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import parl
-import paddle.fluid as fluid
-import numpy as np
-from parl import layers
-from parl.utils.scheduler import PiecewiseScheduler, LinearDecayScheduler
-
-
-class LiftAgent(parl.Agent):
- def __init__(self, algorithm, config):
- """
- Args:
- algorithm (`parl.Algorithm`): algorithm to be used in this agent.
- config (dict): config describing the training hyper-parameters(see a2c_config.py)
- """
- self.obs_dim = config['obs_dim']
- super(LiftAgent, self).__init__(algorithm)
-
- self.lr_scheduler = LinearDecayScheduler(config['start_lr'],
- config['max_sample_steps'])
- self.entropy_coeff_scheduler = PiecewiseScheduler(
- config['entropy_coeff_scheduler'])
-
- def build_program(self):
- self.sample_program = fluid.Program()
- self.predict_program = fluid.Program()
- self.value_program = fluid.Program()
- self.learn_program = fluid.Program()
-
- with fluid.program_guard(self.sample_program):
- obs = layers.data(
- name='obs', shape=[self.obs_dim], dtype='float32')
- sample_actions, values = self.alg.sample(obs)
- self.sample_outputs = [sample_actions, values]
-
- with fluid.program_guard(self.predict_program):
- obs = layers.data(
- name='obs', shape=[self.obs_dim], dtype='float32')
- self.predict_actions = self.alg.predict(obs)
-
- with fluid.program_guard(self.value_program):
- obs = layers.data(
- name='obs', shape=[self.obs_dim], dtype='float32')
- self.values = self.alg.value(obs)
-
- with fluid.program_guard(self.learn_program):
- obs = layers.data(
- name='obs', shape=[self.obs_dim], dtype='float32')
- actions = layers.data(name='actions', shape=[], dtype='int32')
- advantages = layers.data(
- name='advantages', shape=[], dtype='float32')
- target_values = layers.data(
- name='target_values', shape=[], dtype='float32')
- lr = layers.data(
- name='lr', shape=[1], dtype='float32', append_batch_size=False)
- entropy_coeff = layers.data(
- name='entropy_coeff',
- shape=[1],
- dtype='float32',
- append_batch_size=False)
-
- total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
- obs, actions, advantages, target_values, lr, entropy_coeff)
- self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
- self.learn_program = parl.compile(self.learn_program, total_loss)
-
- def sample(self, obs_np):
- """
- Args:
- obs_np: a numpy float32 array of shape (B, obs_dim).
-
- Returns:
- sample_ids: a numpy int64 array of shape [B]
- values: a numpy float32 array of shape [B]
- """
- obs_np = obs_np.astype('float32')
-
- sample_actions, values = self.fluid_executor.run(
- self.sample_program,
- feed={'obs': obs_np},
- fetch_list=self.sample_outputs)
- return sample_actions, values
-
- def predict(self, obs_np):
- """
- Args:
- obs_np: a numpy float32 array of shape (B, obs_dim).
-
- Returns:
- predict_actions: a numpy int64 array of shape [B]
- """
- obs_np = obs_np.astype('float32')
-
- predict_actions = self.fluid_executor.run(
- self.predict_program,
- feed={'obs': obs_np},
- fetch_list=[self.predict_actions])[0]
- return predict_actions
-
- def value(self, obs_np):
- """
- Args:
- obs_np: a numpy float32 array of shape (B, obs_dim).
-
- Returns:
- values: a numpy float32 array of shape [B]
- """
- obs_np = obs_np.astype('float32')
-
- values = self.fluid_executor.run(
- self.value_program, feed={'obs': obs_np},
- fetch_list=[self.values])[0]
- return values
-
- def learn(self, obs_np, actions_np, advantages_np, target_values_np):
- """
- Args:
- obs_np: a numpy float32 array of shape (B, obs_dim).
- actions_np: a numpy int64 array of shape [B]
- advantages_np: a numpy float32 array of shape [B]
- target_values_np: a numpy float32 array of shape [B]
- """
-
- obs_np = obs_np.astype('float32')
- actions_np = actions_np.astype('int64')
- advantages_np = advantages_np.astype('float32')
- target_values_np = target_values_np.astype('float32')
-
- lr = self.lr_scheduler.step(step_num=obs_np.shape[0])
- entropy_coeff = self.entropy_coeff_scheduler.step()
-
- total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run(
- self.learn_program,
- feed={
- 'obs': obs_np,
- 'actions': actions_np,
- 'advantages': advantages_np,
- 'target_values': target_values_np,
- 'lr': np.array([lr], dtype='float32'),
- 'entropy_coeff': np.array([entropy_coeff], dtype='float32')
- },
- fetch_list=self.learn_outputs)
- return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff
diff --git a/examples/LiftSim_baseline/A2C/lift_model.py b/examples/LiftSim_baseline/A2C/lift_model.py
deleted file mode 100644
index 6da63723b5ab058ef7333c590725f90268ce52b6..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/lift_model.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import parl
-import paddle.fluid as fluid
-from parl import layers
-
-
-class LiftModel(parl.Model):
- def __init__(self, act_dim):
- self.act_dim = act_dim
- self.fc_1 = layers.fc(size=512, act='relu')
- self.fc_2 = layers.fc(size=256, act='relu')
- self.fc_3 = layers.fc(size=128, act='tanh')
-
- self.value_fc = layers.fc(size=1)
- self.policy_fc = layers.fc(size=act_dim)
-
- def policy(self, obs):
- """
- Args:
- obs(float32 tensor): shape of (B * obs_dim)
-
- Returns:
- policy_logits(float32 tensor): shape of (B * act_dim)
- """
- h_1 = self.fc_1(obs)
- h_2 = self.fc_2(h_1)
- h_3 = self.fc_3(h_2)
- policy_logits = self.policy_fc(h_3)
- return policy_logits
-
- def value(self, obs):
- """
- Args:
- obs(float32 tensor): shape of (B * obs_dim)
-
- Returns:
- values(float32 tensor): shape of (B,)
- """
- h_1 = self.fc_1(obs)
- h_2 = self.fc_2(h_1)
- h_3 = self.fc_3(h_2)
- values = self.value_fc(h_3)
- values = layers.squeeze(values, axes=[1])
- return values
-
- def policy_and_value(self, obs):
- """
- Args:
- obs(float32 tensor): shape (B * obs_dim)
-
- Returns:
- policy_logits(float32 tensor): shape of (B * act_dim)
- values(float32 tensor): shape of (B,)
- """
- h_1 = self.fc_1(obs)
- h_2 = self.fc_2(h_1)
- h_3 = self.fc_3(h_2)
- policy_logits = self.policy_fc(h_3)
- values = self.value_fc(h_3)
- values = layers.squeeze(values, axes=[1])
-
- return policy_logits, values
diff --git a/examples/LiftSim_baseline/A2C/performance.png b/examples/LiftSim_baseline/A2C/performance.png
deleted file mode 100644
index 153da4eb12bd3219ed5030516bcc001188c980b2..0000000000000000000000000000000000000000
Binary files a/examples/LiftSim_baseline/A2C/performance.png and /dev/null differ
diff --git a/examples/LiftSim_baseline/A2C/train.py b/examples/LiftSim_baseline/A2C/train.py
deleted file mode 100644
index 3663cb841f676efcc157fccd15fa816cdaae662c..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/train.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import os
-import parl
-import queue
-import six
-import time
-import threading
-
-from actor import Actor
-from collections import defaultdict
-from env_wrapper import ObsProcessWrapper, ActionProcessWrapper
-from parl.utils import logger, get_gpu_count, tensorboard, machine_info
-from parl.utils.scheduler import PiecewiseScheduler
-from parl.utils.time_stat import TimeStat
-from parl.utils.window_stat import WindowStat
-from rlschool import LiftSim
-from lift_model import LiftModel
-from lift_agent import LiftAgent
-
-
-class Learner(object):
- def __init__(self, config):
- self.config = config
-
- #=========== Create Agent ==========
- env = LiftSim()
- env = ActionProcessWrapper(env)
- env = ObsProcessWrapper(env)
-
- obs_dim = env.obs_dim
- act_dim = env.act_dim
- self.config['obs_dim'] = obs_dim
-
- model = LiftModel(act_dim)
- algorithm = parl.algorithms.A3C(
- model, vf_loss_coeff=config['vf_loss_coeff'])
- self.agent = LiftAgent(algorithm, config)
-
- if machine_info.is_gpu_available():
- assert get_gpu_count() == 1, 'Only support training in single GPU,\
- Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .'
-
- #========== Learner ==========
-
- self.entropy_stat = WindowStat(100)
- self.target_values = None
-
- self.learn_time_stat = TimeStat(100)
- self.start_time = None
-
- #========== Remote Actor ===========
- self.remote_count = 0
- self.sample_data_queue = queue.Queue()
-
- self.remote_metrics_queue = queue.Queue()
- self.sample_total_steps = 0
-
- self.params_queues = []
- self.create_actors()
-
- self.log_steps = 0
-
- def create_actors(self):
- """ Connect to the cluster and start sampling of the remote actor.
- """
- parl.connect(self.config['master_address'])
-
- logger.info('Waiting for {} remote actors to connect.'.format(
- self.config['actor_num']))
-
- for i in six.moves.range(self.config['actor_num']):
- params_queue = queue.Queue()
- self.params_queues.append(params_queue)
-
- self.remote_count += 1
- logger.info('Remote actor count: {}'.format(self.remote_count))
-
- remote_thread = threading.Thread(
- target=self.run_remote_sample, args=(params_queue, ))
- remote_thread.setDaemon(True)
- remote_thread.start()
-
- self.start_time = time.time()
-
- def run_remote_sample(self, params_queue):
- """ Sample data from remote actor and update parameters of remote actor.
- """
- remote_actor = Actor(self.config)
-
- cnt = 0
- while True:
- latest_params = params_queue.get()
- remote_actor.set_weights(latest_params)
- batch = remote_actor.sample()
-
- self.sample_data_queue.put(batch)
-
- cnt += 1
- if cnt % self.config['get_remote_metrics_interval'] == 0:
- metrics = remote_actor.get_metrics()
- if metrics:
- self.remote_metrics_queue.put(metrics)
-
- def step(self):
- """
- 1. kick off all actors to synchronize parameters and sample data;
- 2. collect sample data of all actors;
- 3. update parameters.
- """
- latest_params = self.agent.get_weights()
- for params_queue in self.params_queues:
- params_queue.put(latest_params)
-
- train_batch = defaultdict(list)
- for i in range(self.config['actor_num']):
- sample_data = self.sample_data_queue.get()
- for key, value in sample_data.items():
- train_batch[key].append(value)
-
- self.sample_total_steps += sample_data['obs'].shape[0]
-
- for key, value in train_batch.items():
- train_batch[key] = np.concatenate(value)
-
- with self.learn_time_stat:
- total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff = self.agent.learn(
- obs_np=train_batch['obs'],
- actions_np=train_batch['actions'],
- advantages_np=train_batch['advantages'],
- target_values_np=train_batch['target_values'])
-
- self.entropy_stat.add(entropy)
- self.target_values = np.mean(train_batch['target_values'])
-
- tensorboard.add_scalar('model/entropy', entropy,
- self.sample_total_steps)
- tensorboard.add_scalar('model/q_value', self.target_values,
- self.sample_total_steps)
-
- def log_metrics(self):
- """ Log metrics of learner and actors
- """
- if self.start_time is None:
- return
-
- metrics = []
- while True:
- try:
- metric = self.remote_metrics_queue.get_nowait()
- metrics.append(metric)
- except queue.Empty:
- break
-
- env_reward_1h, env_reward_24h = [], []
- for x in metrics:
- env_reward_1h.extend(x['env_reward_1h'])
- env_reward_24h.extend(x['env_reward_24h'])
- env_reward_1h = [x for x in env_reward_1h if x is not None]
- env_reward_24h = [x for x in env_reward_24h if x is not None]
-
- mean_reward_1h, mean_reward_24h = None, None
- if env_reward_1h:
- mean_reward_1h = np.mean(np.array(env_reward_1h).flatten())
- tensorboard.add_scalar('performance/env_rewards_1h',
- mean_reward_1h, self.sample_total_steps)
- if env_reward_24h:
- mean_reward_24h = np.mean(np.array(env_reward_24h).flatten())
- tensorboard.add_scalar('performance/env_rewards_24h',
- mean_reward_24h, self.sample_total_steps)
-
- metric = {
- 'Sample steps': self.sample_total_steps,
- 'env_reward_1h': mean_reward_1h,
- 'env_reward_24h': mean_reward_24h,
- 'target_values': self.target_values,
- 'entropy': self.entropy_stat.mean,
- 'learn_time_s': self.learn_time_stat.mean,
- 'elapsed_time_s': int(time.time() - self.start_time),
- }
- logger.info(metric)
-
- self.log_steps += 1
- save_interval_step = 7200 // max(1,
- self.config['log_metrics_interval_s'])
- if self.log_steps % save_interval_step == 0:
- self.save_model() # save model every 2h
-
- def should_stop(self):
- return self.sample_total_steps >= self.config['max_sample_steps']
-
- def save_model(self):
- time_str = time.strftime(".%Y%m%d_%H%M%S", time.localtime())
- self.agent.save(os.path.join('saved_models', 'model.ckpt' + time_str))
-
-
-if __name__ == '__main__':
- from a2c_config import config
-
- learner = Learner(config)
- assert config['log_metrics_interval_s'] > 0
-
- while not learner.should_stop():
- start = time.time()
- while time.time() - start < config['log_metrics_interval_s']:
- learner.step()
- learner.log_metrics()
diff --git a/examples/LiftSim_baseline/A2C/utils.py b/examples/LiftSim_baseline/A2C/utils.py
deleted file mode 100644
index 05c081eaa39e4a2366ebf986092c4a9ec11a2c2f..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/A2C/utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def discretize(value, n_dim, min_val, max_val):
- '''
- discretize a value into a vector of n_dim dimension 1-hot representation
- with the value below min_val being [1, 0, 0, ..., 0]
- and the value above max_val being [0, 0, ..., 0, 1]
- '''
- assert n_dim > 0
- if (n_dim == 1):
- return [1]
- delta = (max_val - min_val) / float(n_dim - 1)
- active_pos = int((value - min_val) / delta + 0.5)
- active_pos = min(n_dim - 1, active_pos)
- active_pos = max(0, active_pos)
- ret_array = [0 for i in range(n_dim)]
- ret_array[active_pos] = 1.0
- return ret_array
-
-
-def linear_discretize(value, n_dim, min_val, max_val):
- '''
- discretize a value into a vector of n_dim dimensional representation
- with the value below min_val being [1, 0, 0, ..., 0]
- and the value above max_val being [0, 0, ..., 0, 1]
- e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0
- if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8]
- '''
- assert n_dim > 0
- if (n_dim == 1):
- return [1]
- delta = (max_val - min_val) / float(n_dim - 1)
- active_pos = int((value - min_val) / delta + 0.5)
- active_pos = min(n_dim - 2, active_pos)
- active_pos = max(0, active_pos)
- anchor_pt = active_pos * delta + min_val
- if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta):
- anchor_pt -= delta
- active_pos -= 1
- weight = (value - anchor_pt) / delta
- weight = min(1.0, max(0.0, weight))
- ret_array = [0 for i in range(n_dim)]
- ret_array[active_pos] = 1.0 - weight
- ret_array[active_pos + 1] = weight
- return ret_array
diff --git a/examples/LiftSim_baseline/DQN/README.md b/examples/LiftSim_baseline/DQN/README.md
deleted file mode 100644
index d75b549f8f3b1c433a915f025ab676115749ddd3..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# LiftSim基线
-
-## 简介
-
-基于PARL库实现Deep Q-network算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。
-
-## 依赖库
-
-+ [paddlepaddle==1.5.1](https://github.com/PaddlePaddle/Paddle)
-+ [parl==1.1.2](https://github.com/PaddlePaddle/PARL)
-+ [rlschool>=0.0.1](rlschool)
-
-## 运行
-
-```python
-python demo.py
-```
-
-## Benchmark
-
-
-
-Accumulated Reward:每3600 steps内reward的总和,可体现电梯调度在单位时间(模拟环境0.5小时)内的效率。
-
-[rlschool]: https://github.com/PaddlePaddle/RLSchool
-[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
diff --git a/examples/LiftSim_baseline/DQN/__init__.py b/examples/LiftSim_baseline/DQN/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/examples/LiftSim_baseline/DQN/demo.py b/examples/LiftSim_baseline/DQN/demo.py
deleted file mode 100644
index cecbf6c1a34d9060dac90abf6e4d648aa0f9a870..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/demo.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from rlschool import LiftSim
-from wrapper import Wrapper, ActionWrapper, ObservationWrapper
-from rl_benchmark.dispatcher import RL_dispatcher
-import sys
-import argparse
-
-
-# run main program with args
-def run_main(args):
-
- parser = argparse.ArgumentParser(description='demo configuration')
- parser.add_argument(
- '--iterations',
- type=int,
- default=100000000,
- help='total number of iterations')
- args = parser.parse_args(args)
- print('iterations:', args.iterations)
-
- mansion_env = LiftSim()
- # mansion_env.seed(1988)
-
- mansion_env = Wrapper(mansion_env)
- mansion_env = ActionWrapper(mansion_env)
- mansion_env = ObservationWrapper(mansion_env)
-
- dispatcher = RL_dispatcher(mansion_env, args.iterations)
- dispatcher.run_episode()
-
- return 0
-
-
-if __name__ == "__main__":
- run_main(sys.argv[1:])
diff --git a/examples/LiftSim_baseline/DQN/rl_10.png b/examples/LiftSim_baseline/DQN/rl_10.png
deleted file mode 100644
index b8f9eef1d10c0a617d8dd462f1d66e5d26484622..0000000000000000000000000000000000000000
Binary files a/examples/LiftSim_baseline/DQN/rl_10.png and /dev/null differ
diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py b/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/rl_benchmark/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py b/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py
deleted file mode 100644
index 846bcf318090916141a4216abb3a889d2548d2ff..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/rl_benchmark/agent.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import numpy as np
-import numpy.random as random
-import paddle.fluid as fluid
-from parl import layers
-from parl import Agent
-from parl.utils import get_gpu_count, machine_info
-
-
-class ElevatorAgent(Agent):
- def __init__(self, algorithm, obs_dim, action_dim):
- self._action_dim = action_dim
- self._obs_dim = obs_dim
- self._update_target_steps = 1000
-
- self._global_step = 0
- self.exploration_ratio = 0.9
- self.exploration_decre = 1e-7
- self.exploration_min = 0.1
- super(ElevatorAgent, self).__init__(algorithm)
-
- use_cuda = machine_info.is_gpu_available()
- if self.gpu_id >= 0:
- assert get_gpu_count() == 1, 'Only support training in single GPU,\
- Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .'
-
- else:
- os.environ['CPU_NUM'] = str(1)
-
- exec_strategy = fluid.ExecutionStrategy()
- exec_strategy.num_threads = 1
- exec_strategy.num_iteration_per_drop_scope = 10
- build_strategy = fluid.BuildStrategy()
- build_strategy.remove_unnecessary_lock = False
-
- self.learn_pe = fluid.ParallelExecutor(
- use_cuda=use_cuda,
- main_program=self.learn_program,
- build_strategy=build_strategy,
- exec_strategy=exec_strategy,
- )
-
- def build_program(self):
- self.pred_program = fluid.Program()
- self.learn_program = fluid.Program()
-
- with fluid.program_guard(self.pred_program):
- obs = layers.data(
- name='obs', shape=[self._obs_dim], dtype='float32')
- self._value = self.alg.define_predict(obs)
-
- with fluid.program_guard(self.learn_program):
- obs = layers.data(
- name='obs', shape=[self._obs_dim], dtype='float32')
- action = layers.data(name='act', shape=[1], dtype='int32')
- reward = layers.data(name='reward', shape=[], dtype='float32')
- next_obs = layers.data(
- name='next_obs', shape=[self._obs_dim], dtype='float32')
- terminal = layers.data(name='terminal', shape=[], dtype='bool')
- self._cost = self.alg.define_learn(obs, action, reward, next_obs,
- terminal)
-
- def sample(self, obs):
- if self.exploration_ratio > self.exploration_min:
- self.exploration_ratio -= self.exploration_decre
- q_values = self.predict(obs)
-
- ret_actions = list()
- for i in range(len(q_values)): # number of elevators
- if (random.random() < self.exploration_ratio):
- action = random.randint(0, self._action_dim)
- else:
- action = np.argmax(q_values[i])
- ret_actions.append(int(action))
- return ret_actions
-
- def predict(self, obs):
- pred_Q = self.fluid_executor.run(
- self.pred_program,
- feed={'obs': obs.astype('float32')},
- fetch_list=[self._value])
- return pred_Q[0]
-
- def learn(self, obs, act, reward, next_obs, terminal):
- self._global_step += 1
- if self._global_step % self._update_target_steps == 0:
- self.alg.sync_target(self.gpu_id)
-
- feed = {
- 'obs': obs.astype('float32'),
- 'act': act.astype('int32'),
- 'reward': reward,
- 'next_obs': next_obs.astype('float32'),
- 'terminal': terminal
- }
- cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0]
- return cost
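The ElevatorAgent removed above selects one action per elevator with an epsilon-greedy rule whose exploration ratio decays linearly from 0.9 toward a floor of 0.1. A minimal, framework-free sketch of that selection rule (numpy only; q_values is assumed to be shaped [num_elevators, action_dim], matching the agent's predict output):

    import numpy as np

    def sample_actions(q_values, exploration_ratio, action_dim, rng=np.random):
        # one epsilon-greedy decision per elevator
        actions = []
        for q in q_values:
            if rng.random() < exploration_ratio:
                actions.append(int(rng.randint(0, action_dim)))  # explore
            else:
                actions.append(int(np.argmax(q)))                # exploit
        return actions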
diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py b/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py
deleted file mode 100644
index a2561ee6d3d9f2c6b8f39c886ef4a1f2f01fb1ea..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/rl_benchmark/dispatcher.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import parl
-import numpy as np
-import numpy.random as random
-
-from copy import deepcopy
-from collections import deque
-
-from rlschool import EPSILON, HUGE
-from rl_benchmark.model import RLDispatcherModel
-from rl_benchmark.agent import ElevatorAgent
-from parl.algorithms import DQN
-from parl.utils import ReplayMemory
-
-MEMORY_SIZE = 1000000
-BATCH_SIZE = 64
-
-
-class RL_dispatcher():
- """
- An RL benchmark for elevator system
- """
-
- def __init__(self, env, max_episode):
- self.env = env
-
- self._obs_dim = env.observation_space
- self._act_dim = env.action_space
- self._global_step = 0
- self.max_episode = max_episode
- self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
- self._model = RLDispatcherModel(self._act_dim)
- hyperparas = {
- 'action_dim': self._act_dim,
- 'lr': 5.0e-4,
- 'gamma': 0.998
- }
-
- self._algorithm = DQN(self._model, hyperparas)
- self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
- self._act_dim)
- self._warm_up_size = 2000
- self._statistic_freq = 1000
- self._loss_queue = deque()
-
- def run_episode(self):
- self.env.reset()
- acc_reward = 0.0
-
- while self._global_step < self.max_episode:
- # self.env.render()
- state = self.env.state
- action = self._agent.sample(state)
- state_, reward, done, info = self.env.step(action)
- output_info = self.learn_step(state, action, reward)
- acc_reward += reward
- if (isinstance(output_info, dict) and len(output_info) > 0):
- self.env.log_notice("%s", output_info)
- if (self._global_step % 3600 == 0):
- self.env.log_notice(
- "Accumulated Reward: %f, Mansion Status: %s", acc_reward,
- self.env.statistics)
- acc_reward = 0.0
-
- self._agent.save('./model.ckpt')
-
- def learn_step(self, state, action, r):
- self._global_step += 1
- if (self._global_step > self._warm_up_size):
- for i in range(self.env.elevator_num):
- self._rpm.append(self._last_observation_array[i],
- self._last_action[i], self._last_reward,
- deepcopy(state[i]), False)
- self._last_observation_array = deepcopy(state)
- self._last_action = deepcopy(action)
- self._last_reward = r
-
- ret_dict = {}
- if self._rpm.size() > self._warm_up_size:
- batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
- self._rpm.sample_batch(BATCH_SIZE)
- cost = self._agent.learn(batch_obs, batch_action, batch_reward,
- batch_next_obs, batch_terminal)
- self._loss_queue.appendleft(cost)
- if (len(self._loss_queue) > self._statistic_freq):
- self._loss_queue.pop()
- if (self._global_step % self._statistic_freq == 0):
- ret_dict["Temporal Difference Error(Average)"] = \
- float(sum(self._loss_queue)) / float(len(self._loss_queue))
-
- return ret_dict
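The removed RL_dispatcher follows a standard warm-up-then-learn loop: transitions are only appended after the warm-up period, learning starts once the replay memory holds more than _warm_up_size samples, and a bounded deque keeps a sliding average of the temporal-difference error for periodic logging. A condensed sketch of that learning step (the rpm and agent objects are assumed to expose the same interfaces as in the code above):

    from collections import deque

    WARM_UP_SIZE = 2000
    BATCH_SIZE = 64
    STAT_FREQ = 1000
    loss_queue = deque(maxlen=STAT_FREQ)  # sliding window of recent costs

    def learn_step(rpm, agent, global_step):
        stats = {}
        if rpm.size() > WARM_UP_SIZE:
            obs, act, rew, next_obs, done = rpm.sample_batch(BATCH_SIZE)
            loss_queue.appendleft(agent.learn(obs, act, rew, next_obs, done))
            if global_step % STAT_FREQ == 0:
                stats['Temporal Difference Error(Average)'] = \
                    float(sum(loss_queue)) / len(loss_queue)
        return stats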
diff --git a/examples/LiftSim_baseline/DQN/rl_benchmark/model.py b/examples/LiftSim_baseline/DQN/rl_benchmark/model.py
deleted file mode 100644
index 3b2364df90565565f5d4e3286b6662c134cb4c08..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/rl_benchmark/model.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import paddle.fluid as fluid
-from parl import layers
-import numpy as np
-import parl
-
-
-class RLDispatcherModel(parl.Model):
- def __init__(self, act_dim):
- self._act_dim = act_dim
- self._fc_1 = layers.fc(size=512, act='relu')
- self._fc_2 = layers.fc(size=256, act='relu')
- self._fc_3 = layers.fc(size=128, act='tanh')
- self._output = layers.fc(size=act_dim)
-
- def value(self, obs):
- _h_1 = self._fc_1(obs)
- _h_2 = self._fc_2(_h_1)
- _h_3 = self._fc_3(_h_2)
- self._pred = self._output(_h_3)
- return self._pred
diff --git a/examples/LiftSim_baseline/DQN/wrapper.py b/examples/LiftSim_baseline/DQN/wrapper.py
deleted file mode 100644
index 55d525deaeecb76df4f2ba9183ed5ea6c119e5d8..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/wrapper.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# wrapper part modified from
-# https://github.com/openai/gym/blob/master/gym/core.py
-
-from rlschool import LiftSim
-from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing
-from wrapper_utils import action_idx_to_action
-
-
-class Wrapper(LiftSim):
- def __init__(self, env):
- self.env = env
- self._mansion = env._mansion
- self.mansion_attr = self._mansion.attribute
- self.elevator_num = self.mansion_attr.ElevatorNumber
- self.observation_space = obs_dim(self.mansion_attr)
- self.action_space = act_dim(self.mansion_attr)
- self.viewer = env.viewer
-
- def __getattr__(self, name):
- if name.startswith('_'):
- raise AttributeError(
- "attempted to get missing private attribute '{}'".format(name))
- return getattr(self.env, name)
-
- def seed(self, seed=None):
- return self.env.seed(seed)
-
- def step(self, action):
- return self.env.step(action)
-
- def reset(self):
- return self.env.reset()
-
- def render(self):
- return self.env.render()
-
- def close(self):
- return self.env.close()
-
-
-class RewardWrapper(Wrapper):
- pass
-
-
-class ActionWrapper(Wrapper):
- def reset(self):
- return self.env.reset()
-
- def step(self, action):
- act = []
- for a in action:
- act.extend(self.action(a, self.action_space))
- return self.env.step(act)
-
- def action(self, action, action_space):
- return action_idx_to_action(action, action_space)
-
-
-class ObservationWrapper(Wrapper):
- def reset(self):
- self.env.reset()
- return self.observation(self._mansion.state)
-
- def step(self, action):
- observation, reward, done, info = self.env.step(action)
- return (self.observation(observation), reward, done, info)
-
- def observation(self, observation):
- return mansion_state_preprocessing(observation)
-
- @property
- def state(self):
- return self.observation(self._mansion.state)
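The wrappers removed above follow the usual gym-style delegation pattern: attributes that are not overridden are forwarded to the wrapped LiftSim environment through __getattr__ (private names are refused), while the Action and Observation wrappers only intercept step and reset. A generic sketch of that pattern, not tied to LiftSim (here reset is assumed to return an observation directly, whereas the removed code reads it back from the mansion state):

    class Wrapper(object):
        def __init__(self, env):
            self.env = env

        def __getattr__(self, name):
            # forward unknown attributes to the wrapped env, never private ones
            if name.startswith('_'):
                raise AttributeError(name)
            return getattr(self.env, name)

    class ObservationWrapper(Wrapper):
        def observation(self, obs):
            raise NotImplementedError

        def reset(self):
            return self.observation(self.env.reset())

        def step(self, action):
            obs, reward, done, info = self.env.step(action)
            return self.observation(obs), reward, done, info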
diff --git a/examples/LiftSim_baseline/DQN/wrapper_utils.py b/examples/LiftSim_baseline/DQN/wrapper_utils.py
deleted file mode 100644
index 45afcefbf9ebdbacc2841bd54b1756a1213be5bf..0000000000000000000000000000000000000000
--- a/examples/LiftSim_baseline/DQN/wrapper_utils.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import random
-import numpy as np
-from rlschool import ElevatorState, ElevatorAction
-from rlschool import MansionAttribute, MansionState
-from rlschool import EPSILON, HUGE
-from rlschool import MansionConfig
-from rlschool import MansionManager
-
-
-def discretize(value, n_dim, min_val, max_val):
- """
- discretize a value into a vector of n_dim dimension 1-hot representation
- with the value below min_val being [1, 0, 0, ..., 0]
- and the value above max_val being [0, 0, ..., 0, 1]
- Args:
- value: the value that needs to be discretized into 1-hot format
- n_dim: number of dimensions
- min_val: minimal value in the result
- man_val: maximum value in the result
- Returns:
- the discretized vector
- """
- assert n_dim > 0
- if (n_dim == 1):
- return [1]
- delta = (max_val - min_val) / float(n_dim - 1)
- active_pos = int((value - min_val) / delta + 0.5)
- active_pos = min(n_dim - 1, active_pos)
- active_pos = max(0, active_pos)
- ret_array = [0 for i in range(n_dim)]
- ret_array[active_pos] = 1.0
- return ret_array
-
-
-def linear_discretize(value, n_dim, min_val, max_val):
- """
- discretize a value into a vector of n_dim dimensional representation
- with the value below min_val being [1, 0, 0, ..., 0]
- and the value above max_val being [0, 0, ..., 0, 1]
- e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0
- if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8]
- Args:
- value: the value that needs to be discretized
- n_dim: number of dimensions
- min_val: minimal value in the result
- man_val: maximum value in the result
- Returns:
- the discretized vector
- """
- assert n_dim > 0
- if (n_dim == 1):
- return [1]
- delta = (max_val - min_val) / float(n_dim - 1)
- active_pos = int((value - min_val) / delta + 0.5)
- active_pos = min(n_dim - 2, active_pos)
- active_pos = max(0, active_pos)
- anchor_pt = active_pos * delta + min_val
- if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta):
- anchor_pt -= delta
- active_pos -= 1
- weight = (value - anchor_pt) / delta
- weight = min(1.0, max(0.0, weight))
- ret_array = [0 for i in range(n_dim)]
- ret_array[active_pos] = 1.0 - weight
- ret_array[active_pos + 1] = weight
- return ret_array
-
-
-def ele_state_preprocessing(ele_state):
- """Process elevator state, make it usable for network
- Args:
- ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py
- Returns:
- ele_feature: list of elevator state
- """
- ele_feature = []
-
- # add floor information
- ele_feature.extend(
- linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0,
- ele_state.MaximumFloor))
-
- # add velocity information
- ele_feature.extend(
- linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed,
- ele_state.MaximumSpeed))
-
- # add door information
- ele_feature.append(ele_state.DoorState)
- ele_feature.append(float(ele_state.DoorIsOpening))
- ele_feature.append(float(ele_state.DoorIsClosing))
-
- # add direction information
- ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1))
-
- # add load weight information
- ele_feature.extend(
- linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0,
- 1.0))
-
- # add other information
- target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)]
- for target_floor in ele_state.ReservedTargetFloors:
- target_floor_binaries[target_floor - 1] = 1.0
- ele_feature.extend(target_floor_binaries)
-
- dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)]
- dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0
- ele_feature.extend(dispatch_floor_binaries)
- ele_feature.append(ele_state.DispatchTargetDirection)
-
- return ele_feature
-
-
-def obs_dim(mansion_attr):
- """Calculate the observation dimension
- Args:
- mansion_attr: MansionAttribute, attribute of mansion_manager
- Returns:
- observation dimension
- """
- assert isinstance(mansion_attr, MansionAttribute)
- ele_dim = mansion_attr.NumberOfFloor * 3 + 34
- obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \
- mansion_attr.NumberOfFloor * 2
- return obs_dim
-
-
-def act_dim(mansion_attr):
- """Calculate the action dimension, which is number of floor times 2 plus 2.
- The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0,
- the original dispatch_target does not change if dispatch_target is -1. See implementation in
- method action_idx_to_action below.
- Args:
- mansion_attr: MansionAttribute, attribute of mansion_manager
- Returns:
- action dimension
- """
- assert isinstance(mansion_attr, MansionAttribute)
- return mansion_attr.NumberOfFloor * 2 + 2
-
-
-def mansion_state_preprocessing(mansion_state):
- """Process mansion_state to make it usable for networks, convert it into a numpy array
- Args:
- mansion_state: namedtuple of mansion state,
- defined in rlschool/liftsim/environment/mansion/utils.py
- Returns:
- the converted numpy array
- """
- ele_features = list()
- for ele_state in mansion_state.ElevatorStates:
- ele_features.append(ele_state_preprocessing(ele_state))
- max_floor = ele_state.MaximumFloor
-
- target_floor_binaries_up = [0.0 for i in range(max_floor)]
- target_floor_binaries_down = [0.0 for i in range(max_floor)]
- for floor in mansion_state.RequiringUpwardFloors:
- target_floor_binaries_up[floor - 1] = 1.0
- for floor in mansion_state.RequiringDownwardFloors:
- target_floor_binaries_down[floor - 1] = 1.0
- target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down
-
- idx = 0
- man_features = list()
- for idx in range(len(mansion_state.ElevatorStates)):
- elevator_id_vec = discretize(idx + 1,
- len(mansion_state.ElevatorStates), 1,
- len(mansion_state.ElevatorStates))
- idx_array = list(range(len(mansion_state.ElevatorStates)))
- idx_array.remove(idx)
- # random.shuffle(idx_array)
- man_features.append(ele_features[idx])
- for left_idx in idx_array:
- man_features[idx] = man_features[idx] + ele_features[left_idx]
- man_features[idx] = man_features[idx] + \
- elevator_id_vec + target_floor_binaries
- return np.asarray(man_features, dtype='float32')
-
-
-def action_idx_to_action(action_idx, act_dim):
- """Convert action_inx to action
- Args:
- action_idx: the index needed to be converted
- act_dim: action dimension
- Returns:
- the converted namedtuple
- """
- assert isinstance(action_idx, int)
- assert isinstance(act_dim, int)
- realdim = act_dim - 2
- if (action_idx == realdim):
- return ElevatorAction(0, 1)
- elif (action_idx == realdim + 1):
- return ElevatorAction(-1, 1)
- action = action_idx
- if (action_idx < realdim / 2):
- direction = 1
- action += 1
- else:
- direction = -1
- action -= int(realdim / 2)
- action += 1
- return [action, direction]
-
-
-def action_to_action_idx(action, act_dim):
- """Convert action to number according to act_dim.
- Args:
- action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py
- act_dim: action dimension
- Returns:
- action_idx: the result index
- """
- assert isinstance(action, ElevatorAction)
- assert isinstance(act_dim, int)
- realdim = act_dim - 2
- if (action.TargetFloor == 0):
- return realdim
- elif (action.TargetFloor < 0):
- return realdim + 1
- action_idx = 0
- if (action.DirectionIndicator < 0):
- action_idx += int(realdim / 2)
- action_idx += action.TargetFloor - 1
- return action_idx
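The removed wrapper_utils.py encodes each scalar component of the observation either as a one-hot vector (discretize) or as a two-point linear interpolation (linear_discretize). The docstring example above (n_dim=2, min_val=1.0, max_val=2.0, value=1.8 gives [0.2, 0.8]) can be reproduced with a standalone sketch of that interpolation:

    def linear_discretize(value, n_dim, min_val, max_val):
        # spread `value` over the two nearest of n_dim evenly spaced anchors
        if n_dim == 1:
            return [1.0]
        delta = (max_val - min_val) / float(n_dim - 1)
        pos = max(0, min(n_dim - 2, int((value - min_val) / delta + 0.5)))
        anchor = pos * delta + min_val
        if anchor > value and anchor > min_val + 0.5 * delta:
            anchor -= delta
            pos -= 1
        weight = min(1.0, max(0.0, (value - anchor) / delta))
        vec = [0.0] * n_dim
        vec[pos] = 1.0 - weight
        vec[pos + 1] = weight
        return vec

    print([round(x, 3) for x in linear_discretize(1.8, 2, 1.0, 2.0)])  # [0.2, 0.8]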
diff --git a/examples/MADDPG/train.py b/examples/MADDPG/train.py
index d0e20dcdb4fd35638432fb1666b76b30c2a388d8..8454a73ee209707c65340897ce9b090d482c6751 100644
--- a/examples/MADDPG/train.py
+++ b/examples/MADDPG/train.py
@@ -20,7 +20,7 @@ from simple_model import MAModel
from simple_agent import MAAgent
import parl
from parl.env.multiagent_simple_env import MAenv
-from parl.utils import logger, tensorboard
+from parl.utils import logger, summary
def run_episode(env, agents):
@@ -62,8 +62,8 @@ def run_episode(env, agents):
# learn policy
for i, agent in enumerate(agents):
critic_loss = agent.learn(agents)
- tensorboard.add_scalar('critic_loss_%d' % i, critic_loss,
- agent.global_train_step)
+ summary.add_scalar('critic_loss_%d' % i, critic_loss,
+ agent.global_train_step)
return total_reward, agents_reward, steps
@@ -155,12 +155,12 @@ def train_agent():
format(total_steps, total_episodes, mean_episode_reward,
use_time))
t_start = time.time()
- tensorboard.add_scalar('mean_episode_reward/episode',
- mean_episode_reward, total_episodes)
- tensorboard.add_scalar('mean_episode_reward/steps',
- mean_episode_reward, total_steps)
- tensorboard.add_scalar('use_time/1000episode', use_time,
- total_episodes)
+ summary.add_scalar('mean_episode_reward/episode',
+ mean_episode_reward, total_episodes)
+ summary.add_scalar('mean_episode_reward/steps',
+ mean_episode_reward, total_steps)
+ summary.add_scalar('use_time/1000episode', use_time,
+ total_episodes)
# save model
if not args.restore:
diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py
index e784dae00f9ffdc5528a4c7dafda2916e5d4c456..e3a8066d79128ed9e969bb7d4c1c8cce3bee3775 100755
--- a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py
+++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py
@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor
from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent
-from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count
+from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client
from parl.utils import machine_info
diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py
index b37fb369a15c8a28a3911dbb9a864cf28d1da8b7..cf14f1e0306c69c8f134cf6c81c279ac982b52d0 100755
--- a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py
+++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py
@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor
from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent
-from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count
+from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client
from parl.utils import machine_info
@@ -97,7 +97,7 @@ class Learner(object):
# add lock between training and predicting
self.model_lock = threading.Lock()
- # add lock when appending data to rpm or writing scalars to tensorboard
+ # add lock when appending data to rpm or writing scalars to summary
self.memory_lock = threading.Lock()
self.ready_actor_queue = queue.Queue()
@@ -246,24 +246,24 @@ class Learner(object):
episode_env_reward)
if self.env_reward_stat.count > 500:
- tensorboard.add_scalar('recent_env_reward',
- self.env_reward_stat.mean,
- self.total_steps)
- tensorboard.add_scalar('recent_shaping_reward',
- self.shaping_reward_stat.mean,
- self.total_steps)
- if self.critic_loss_stat.count > 500:
- tensorboard.add_scalar('recent_critic_loss',
- self.critic_loss_stat.mean,
- self.total_steps)
- tensorboard.add_scalar('episode_length', n, self.total_steps)
- tensorboard.add_scalar('max_env_reward', self.max_env_reward,
+ summary.add_scalar('recent_env_reward',
+ self.env_reward_stat.mean,
self.total_steps)
- tensorboard.add_scalar('ready_actor_num',
- self.ready_actor_queue.qsize(),
+ summary.add_scalar('recent_shaping_reward',
+ self.shaping_reward_stat.mean,
self.total_steps)
- tensorboard.add_scalar('episode_time', episode_time,
+ if self.critic_loss_stat.count > 500:
+ summary.add_scalar('recent_critic_loss',
+ self.critic_loss_stat.mean,
self.total_steps)
+ summary.add_scalar('episode_length', n, self.total_steps)
+ summary.add_scalar('max_env_reward', self.max_env_reward,
+ self.total_steps)
+ summary.add_scalar('ready_actor_num',
+ self.ready_actor_queue.qsize(),
+ self.total_steps)
+ summary.add_scalar('episode_time', episode_time,
+ self.total_steps)
self.noiselevel = self.noiselevel * NOISE_DECAY
diff --git a/examples/SAC/train.py b/examples/SAC/train.py
index a88260245880a39738f931573dd0b183487722df..3e2b7140e9ab5694c38bd86ded04a5e977da9d3a 100644
--- a/examples/SAC/train.py
+++ b/examples/SAC/train.py
@@ -21,7 +21,7 @@ import time
import parl
from mujoco_agent import MujocoAgent
from mujoco_model import ActorModel, CriticModel
-from parl.utils import logger, tensorboard, action_mapping, ReplayMemory
+from parl.utils import logger, summary, action_mapping, ReplayMemory
ACTOR_LR = 1e-3
CRITIC_LR = 1e-3
@@ -111,8 +111,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
- tensorboard.add_scalar('train/episode_reward', train_reward,
- total_steps)
+ summary.add_scalar('train/episode_reward', train_reward, total_steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
@@ -120,8 +119,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
- tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
- total_steps)
+ summary.add_scalar('eval/episode_reward', evaluate_reward,
+ total_steps)
if __name__ == '__main__':
diff --git a/examples/TD3/train.py b/examples/TD3/train.py
index 4cb74d9c01ab73dcb8cb20385b36262cb7c4aeba..8115a41ba1129e00dda1f2a7ca1b0ad3b9d64c71 100644
--- a/examples/TD3/train.py
+++ b/examples/TD3/train.py
@@ -19,7 +19,7 @@ import time
import parl
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
-from parl.utils import logger, tensorboard, action_mapping, ReplayMemory
+from parl.utils import logger, summary, action_mapping, ReplayMemory
MAX_EPISODES = 5000
ACTOR_LR = 3e-4
@@ -117,8 +117,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
- tensorboard.add_scalar('train/episode_reward', train_reward,
- total_steps)
+ summary.add_scalar('train/episode_reward', train_reward, total_steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
@@ -126,8 +125,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
- tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
- total_steps)
+ summary.add_scalar('eval/episode_reward', evaluate_reward,
+ total_steps)
if __name__ == '__main__':
diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py
index 281ea4504e030f5b7296349a5912a07b30fdec27..d7da430e83de46be82a935bc01ce35ca6bd83c6e 100644
--- a/examples/offline-Q-learning/parallel_run.py
+++ b/examples/offline-Q-learning/parallel_run.py
@@ -22,7 +22,7 @@ from tqdm import tqdm
import parl
import paddle.fluid as fluid
from parl.utils import get_gpu_count
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
from dqn import DQN # slight changes from parl.algorithms.DQN
from atari_agent import AtariAgent
diff --git a/parl/algorithms/fluid/impala/impala.py b/parl/algorithms/fluid/impala/impala.py
index 0007a9c0536cc321989b04807dcee6c7428a30ff..a7adf56ee28f3ec14f304a9a8b163aae31805fda 100644
--- a/parl/algorithms/fluid/impala/impala.py
+++ b/parl/algorithms/fluid/impala/impala.py
@@ -90,7 +90,7 @@ class IMPALA(Algorithm):
vf_loss_coeff=None,
clip_rho_threshold=None,
clip_pg_rho_threshold=None):
- """ IMPALA algorithm
+ r""" IMPALA algorithm
Args:
model (parl.Model): forward network of policy and value
diff --git a/parl/utils/summary.py b/parl/utils/summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..575fc6b9976906e43dddeb7da2ea1ef32d4644c1
--- /dev/null
+++ b/parl/utils/summary.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tensorboardX import SummaryWriter
+from parl.utils import logger
+
+__all__ = []
+
+_writer = None
+_WRITTER_METHOD = ['add_scalar', 'add_histogram', 'close', 'flush']
+
+
+def create_file_after_first_call(func_name):
+ def call(*args, **kwargs):
+ global _writer
+ if _writer is None:
+ logdir = logger.get_dir()
+ if logdir is None:
+ logdir = logger.auto_set_dir(action='d')
+ logger.warning(
+ "[tensorboard] logdir is None, will save tensorboard files to {}"
+ .format(logdir))
+ _writer = SummaryWriter(logdir=logger.get_dir())
+ func = getattr(_writer, func_name)
+ func(*args, **kwargs)
+ _writer.flush()
+
+ return call
+
+
+# export writer functions
+for func_name in _WRITTER_METHOD:
+ locals()[func_name] = create_file_after_first_call(func_name)
+ __all__.append(func_name)
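The new parl.utils.summary module keeps the add_scalar/add_histogram/close/flush interface of the old tensorboard utility but builds the tensorboardX SummaryWriter lazily on the first call, writing into the logger directory (auto-created under ./train_log/<script_name> when none is set, which is what the renamed summary_test.py below asserts). A minimal usage sketch; the tag and values are illustrative only:

    from parl.utils import summary

    for step in range(100):
        # the first call creates the writer and log directory; later calls reuse them
        summary.add_scalar('train/episode_reward', float(step), step)
    summary.flush()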
diff --git a/parl/utils/tests/tensorboard_test.py b/parl/utils/tests/summary_test.py
similarity index 79%
rename from parl/utils/tests/tensorboard_test.py
rename to parl/utils/tests/summary_test.py
index 65fcb82404adfe461395e594dcf112ea41fd330e..670abcccc35e3039cb1e93ea3462855bb84a503a 100644
--- a/parl/utils/tests/tensorboard_test.py
+++ b/parl/utils/tests/summary_test.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
-from parl.utils import tensorboard
+from parl.utils import summary
import numpy as np
from parl.utils import logger
import os
@@ -20,18 +20,18 @@ import os
class TestUtils(unittest.TestCase):
def tearDown(self):
- tensorboard.flush()
+ summary.flush()
def test_add_scalar(self):
x = range(100)
for i in x:
- tensorboard.add_scalar('y=2x', i * 2, i)
- self.assertTrue(os.path.exists('./train_log/tensorboard_test'))
+ summary.add_scalar('y=2x', i * 2, i)
+ self.assertTrue(os.path.exists('./train_log/summary_test'))
def test_add_histogram(self):
for i in range(10):
x = np.random.random(1000)
- tensorboard.add_histogram('distribution centers', x + i, i)
+ summary.add_histogram('distribution centers', x + i, i)
if __name__ == '__main__':