Commit c1646351 authored by LI Yunxiang, committed by Bo Zhou

add liftsim baseline (#120)

* add liftsim baseline

* yapf

* yapf...

* modify acc. comments

* yapf

* yapf..........

* yapf!

why is yapf on paddle different from that on my mac!!!!!
Parent 5f71cd4e
# LiftSim Baseline
## Introduction
A Deep Q-Network (DQN) baseline implemented with the PARL library, applied to [LiftSim][liftsim], the elevator-dispatching simulation environment in the [RLSchool][rlschool] library.
## Dependencies
- paddlepaddle >= 1.5.1
- parl >= 1.1.2
- rlschool >= 0.0.1

On Windows, only Python 3.5 and above is supported.
## How to Run
```shell
python demo.py
```
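`demo.py` also accepts an `--iterations` flag that sets the total number of training steps (default 100000000); for example, for a shorter run:
```shell
python demo.py --iterations 1000000
```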
## Benchmark
<img src="rl_10.png" width="400"/>

Accumulated Reward: the sum of rewards within every 3600 steps, which reflects the efficiency of elevator dispatching per unit of time (0.5 hours in the simulated environment).
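As a minimal sketch of how this metric is produced (it mirrors `run_episode` in `rl_benchmark/dispatcher.py`; an `env` wrapped as in `demo.py`, an `agent` built as in `rl_benchmark/dispatcher.py`, and a `max_steps` value are assumed):

```python
acc_reward = 0.0
step = 0
while step < max_steps:
    action = agent.sample(env.state)   # one action index per elevator
    _, reward, _, _ = env.step(action)
    acc_reward += reward
    step += 1
    if step % 3600 == 0:               # 3600 steps = 0.5 simulated hours
        print('Accumulated Reward:', acc_reward)
        acc_reward = 0.0
```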
[rlschool]: https://github.com/PaddlePaddle/RLSchool
[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from rlschool import LiftSim
from wrapper import Wrapper, ActionWrapper, ObservationWrapper
from rl_benchmark.dispatcher import RL_dispatcher
import sys
import argparse
# run main program with args
def run_main(args):
parser = argparse.ArgumentParser(description='demo configuration')
parser.add_argument(
'--iterations',
type=int,
default=100000000,
help='total number of iterations')
args = parser.parse_args(args)
print('iterations:', args.iterations)
mansion_env = LiftSim()
# mansion_env.seed(1988)
mansion_env = Wrapper(mansion_env)
mansion_env = ActionWrapper(mansion_env)
mansion_env = ObservationWrapper(mansion_env)
dispatcher = RL_dispatcher(mansion_env, args.iterations)
dispatcher.run_episode()
return 0
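# Note on the wrapper stack in run_main: Wrapper exposes the mansion attributes and
# observation/action dimensions, ActionWrapper converts the per-elevator action indices
# sampled by the agent into elevator commands (see action_idx_to_action in
# wrapper_utils.py), and ObservationWrapper converts the raw MansionState into one
# feature vector per elevator (see mansion_state_preprocessing in wrapper_utils.py).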
if __name__ == "__main__":
run_main(sys.argv[1:])
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import numpy.random as random
import paddle.fluid as fluid
from parl import layers
from parl import Agent
from parl.utils import get_gpu_count, machine_info
class ElevatorAgent(Agent):
def __init__(self, algorithm, obs_dim, action_dim):
self._action_dim = action_dim
self._obs_dim = obs_dim
self._update_target_steps = 1000
self._global_step = 0
self.exploration_ratio = 0.9
self.exploration_decre = 1e-7
self.exploration_min = 0.1
super(ElevatorAgent, self).__init__(algorithm)
use_cuda = machine_info.is_gpu_available()
if self.gpu_id >= 0:
assert get_gpu_count() == 1, 'Only support training in single GPU,\
Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .'
else:
os.environ['CPU_NUM'] = str(1)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10
build_strategy = fluid.BuildStrategy()
build_strategy.remove_unnecessary_lock = False
self.learn_pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.learn_program,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
)
def build_program(self):
self.pred_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.pred_program):
obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32')
self._value = self.alg.define_predict(obs)
with fluid.program_guard(self.learn_program):
obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32')
action = layers.data(name='act', shape=[1], dtype='int32')
reward = layers.data(name='reward', shape=[], dtype='float32')
next_obs = layers.data(
name='next_obs', shape=[self._obs_dim], dtype='float32')
terminal = layers.data(name='terminal', shape=[], dtype='bool')
self._cost = self.alg.define_learn(obs, action, reward, next_obs,
terminal)
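# The two programs built above serve different purposes: pred_program only runs the
# forward pass (define_predict) to obtain Q-values for action selection, while
# learn_program feeds (obs, act, reward, next_obs, terminal) into define_learn to
# compute the DQN loss and update the network.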
def sample(self, obs):
if self.exploration_ratio > self.exploration_min:
self.exploration_ratio -= self.exploration_decre
q_values = self.predict(obs)
ret_actions = list()
for i in range(len(q_values)): # number of elevators
if (random.random() < self.exploration_ratio):
action = random.randint(0, self._action_dim)
else:
action = np.argmax(q_values[i])
ret_actions.append(int(action))
return ret_actions
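# Exploration schedule used above: epsilon starts at exploration_ratio = 0.9 and is
# decreased by exploration_decre = 1e-7 on every call to sample() until it reaches
# exploration_min = 0.1, i.e. roughly (0.9 - 0.1) / 1e-7 = 8e6 calls to anneal fully.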
def predict(self, obs):
pred_Q = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self._value])
return pred_Q[0]
def learn(self, obs, act, reward, next_obs, terminal):
self._global_step += 1
if self._global_step % self._update_target_steps == 0:
self.alg.sync_target(self.gpu_id)
feed = {
'obs': obs.astype('float32'),
'act': act.astype('int32'),
'reward': reward,
'next_obs': next_obs.astype('float32'),
'terminal': terminal
}
cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0]
return cost
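# Target-network synchronisation: sync_target copies the current Q-network parameters
# into the DQN target network every _update_target_steps (1000) calls to learn(),
# the standard stabilisation trick for DQN.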
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import parl
import numpy as np
import numpy.random as random
from copy import deepcopy
from collections import deque
from rlschool import EPSILON, HUGE
from rl_benchmark.model import RLDispatcherModel
from rl_benchmark.agent import ElevatorAgent
from parl.algorithms import DQN
from parl.utils import ReplayMemory
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
class RL_dispatcher():
"""
An RL benchmark dispatcher for the elevator system
"""
def __init__(self, env, max_episode):
self.env = env
self._obs_dim = env.observation_space
self._act_dim = env.action_space
self._global_step = 0
self.max_episode = max_episode
self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
self._model = RLDispatcherModel(self._act_dim)
hyperparas = {
'action_dim': self._act_dim,
'lr': 5.0e-4,
'gamma': 0.998
}
self._algorithm = DQN(self._model, hyperparas)
self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
self._act_dim)
self._warm_up_size = 2000
self._statistic_freq = 1000
self._loss_queue = deque()
def run_episode(self):
self.env.reset()
acc_reward = 0.0
while self._global_step < self.max_episode:
# self.env.render()
state = self.env.state
action = self._agent.sample(state)
state_, reward, done, info = self.env.step(action)
output_info = self.learn_step(state, action, reward)
acc_reward += reward
if (isinstance(output_info, dict) and len(output_info) > 0):
self.env.log_notice("%s", output_info)
if (self._global_step % 3600 == 0):
self.env.log_notice(
"Accumulated Reward: %f, Mansion Status: %s", acc_reward,
self.env.statistics)
acc_reward = 0.0
self._agent.save('./model.ckpt')
def learn_step(self, state, action, r):
self._global_step += 1
if (self._global_step > self._warm_up_size):
for i in range(self.env.elevator_num):
self._rpm.append(self._last_observation_array[i],
self._last_action[i], self._last_reward,
deepcopy(state[i]), False)
self._last_observation_array = deepcopy(state)
self._last_action = deepcopy(action)
self._last_reward = r
ret_dict = {}
if self._rpm.size() > self._warm_up_size:
batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
self._rpm.sample_batch(BATCH_SIZE)
cost = self._agent.learn(batch_obs, batch_action, batch_reward,
batch_next_obs, batch_terminal)
self._loss_queue.appendleft(cost)
if (len(self._loss_queue) > self._statistic_freq):
self._loss_queue.pop()
if (self._global_step % self._statistic_freq == 0):
ret_dict["Temporal Difference Error(Average)"] = \
float(sum(self._loss_queue)) / float(len(self._loss_queue))
return ret_dict
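# Replay flow in learn_step: once the warm-up of 2000 global steps has passed, each call
# stores one transition per elevator (last_obs, last_action, last_reward, current_obs)
# in the replay memory; once the memory itself holds more than 2000 entries, a batch of
# 64 transitions is sampled per step for a DQN update, and the average temporal-difference
# error over the last 1000 updates is reported every 1000 steps.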
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid as fluid
from parl import layers
import numpy as np
import parl
class RLDispatcherModel(parl.Model):
def __init__(self, act_dim):
self._act_dim = act_dim
self._fc_1 = layers.fc(size=512, act='relu')
self._fc_2 = layers.fc(size=256, act='relu')
self._fc_3 = layers.fc(size=128, act='tanh')
self._output = layers.fc(size=act_dim)
def value(self, obs):
_h_1 = self._fc_1(obs)
_h_2 = self._fc_2(_h_1)
_h_3 = self._fc_3(_h_2)
self._pred = self._output(_h_3)
return self._pred
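# Network summary: the Q-network above maps an observation vector to act_dim Q-values
# through three fully connected hidden layers (512 relu -> 256 relu -> 128 tanh)
# followed by a linear output layer; value(obs) returns one Q-value per action.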
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# wrapper part modified from
# https://github.com/openai/gym/blob/master/gym/core.py
from rlschool import LiftSim
from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing
from wrapper_utils import action_idx_to_action
class Wrapper(LiftSim):
def __init__(self, env):
self.env = env
self._mansion = env._mansion
self.mansion_attr = self._mansion.attribute
self.elevator_num = self.mansion_attr.ElevatorNumber
self.observation_space = obs_dim(self.mansion_attr)
self.action_space = act_dim(self.mansion_attr)
self.viewer = env.viewer
def __getattr__(self, name):
if name.startswith('_'):
raise AttributeError(
"attempted to get missing private attribute '{}'".format(name))
return getattr(self.env, name)
def seed(self, seed=None):
return self.env.seed(seed)
def step(self, action):
return self.env.step(action)
def reset(self):
return self.env.reset()
def render(self):
return self.env.render()
def close(self):
return self.env.close()
class RewardWrapper(Wrapper):
pass
class ActionWrapper(Wrapper):
def reset(self):
return self.env.reset()
def step(self, action):
act = []
for a in action:
act.extend(self.action(a, self.action_space))
return self.env.step(act)
def action(self, action, action_space):
return action_idx_to_action(action, action_space)
class ObservationWrapper(Wrapper):
def reset(self):
self.env.reset()
return self.observation(self._mansion.state)
def step(self, action):
observation, reward, done, info = self.env.step(action)
return (self.observation(observation), reward, done, info)
def observation(self, observation):
return mansion_state_preprocessing(observation)
@property
def state(self):
return self.observation(self._mansion.state)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import random
import numpy as np
from rlschool import ElevatorState, ElevatorAction
from rlschool import MansionAttribute, MansionState
from rlschool import EPSILON, HUGE
from rlschool import MansionConfig
from rlschool import MansionManager
def discretize(value, n_dim, min_val, max_val):
"""
Discretize a value into an n_dim-dimensional one-hot vector,
with values below min_val mapped to [1, 0, 0, ..., 0]
and values above max_val mapped to [0, 0, ..., 0, 1].
Args:
value: the value to be discretized into one-hot format
n_dim: number of dimensions
min_val: lower bound of the discretization range
max_val: upper bound of the discretization range
Returns:
the discretized one-hot vector
"""
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 1, active_pos)
active_pos = max(0, active_pos)
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0
return ret_array
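# Worked example: discretize(1.5, 3, 1.0, 2.0) uses delta = 0.5, so
# active_pos = int((1.5 - 1.0) / 0.5 + 0.5) = 1 and the result is [0, 1.0, 0].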
def linear_discretize(value, n_dim, min_val, max_val):
"""
Discretize a value into an n_dim-dimensional vector, linearly interpolating
between the two nearest anchor points,
with values below min_val mapped to [1, 0, 0, ..., 0]
and values above max_val mapped to [0, 0, ..., 0, 1].
e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0,
then value = 1.5 returns [0.5, 0.5] and value = 1.8 returns [0.2, 0.8]
Args:
value: the value to be discretized
n_dim: number of dimensions
min_val: lower bound of the discretization range
max_val: upper bound of the discretization range
Returns:
the discretized vector
"""
assert n_dim > 0
if (n_dim == 1):
return [1]
delta = (max_val - min_val) / float(n_dim - 1)
active_pos = int((value - min_val) / delta + 0.5)
active_pos = min(n_dim - 2, active_pos)
active_pos = max(0, active_pos)
anchor_pt = active_pos * delta + min_val
if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta):
anchor_pt -= delta
active_pos -= 1
weight = (value - anchor_pt) / delta
weight = min(1.0, max(0.0, weight))
ret_array = [0 for i in range(n_dim)]
ret_array[active_pos] = 1.0 - weight
ret_array[active_pos + 1] = weight
return ret_array
def ele_state_preprocessing(ele_state):
"""Process elevator state, make it usable for network
Args:
ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py
Returns:
ele_feature: list of elevator state
"""
ele_feature = []
# add floor information
ele_feature.extend(
linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0,
ele_state.MaximumFloor))
# add velocity information
ele_feature.extend(
linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed,
ele_state.MaximumSpeed))
# add door information
ele_feature.append(ele_state.DoorState)
ele_feature.append(float(ele_state.DoorIsOpening))
ele_feature.append(float(ele_state.DoorIsClosing))
# add direction information
ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1))
# add load weight information
ele_feature.extend(
linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0,
1.0))
# add other information
target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)]
for target_floor in ele_state.ReservedTargetFloors:
target_floor_binaries[target_floor - 1] = 1.0
ele_feature.extend(target_floor_binaries)
dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)]
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0
ele_feature.extend(dispatch_floor_binaries)
ele_feature.append(ele_state.DispatchTargetDirection)
return ele_feature
def obs_dim(mansion_attr):
"""Calculate the observation dimension
Args:
mansion_attr: MansionAttribute, attribute of mansion_manager
Returns:
observation dimension
"""
assert isinstance(mansion_attr, MansionAttribute)
ele_dim = mansion_attr.NumberOfFloor * 3 + 34
obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \
mansion_attr.NumberOfFloor * 2
return obs_dim
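# Where the constants come from: each elevator's own feature vector (built in
# ele_state_preprocessing) has length F (floor one-hot) + 21 (velocity) + 3 (door)
# + 3 (direction) + 5 (load) + F (reserved targets) + F + 1 (dispatch target)
# + 1 (dispatch direction) = 3 * F + 34, with F = NumberOfFloor. The per-elevator
# observation assembled in mansion_state_preprocessing concatenates the features of
# all ElevatorNumber elevators, an elevator-id one-hot of length ElevatorNumber and
# the up/down request binaries of length 2 * F, giving
# (ele_dim + 1) * ElevatorNumber + 2 * NumberOfFloor as computed above.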
def act_dim(mansion_attr):
"""Calculate the action dimension, which is number of floor times 2 plus 2.
The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0,
the original dispatch_target does not change if dispatch_target is -1. See implementation in
method action_idx_to_action below.
Args:
mansion_attr: MansionAttribute, attribute of mansion_manager
Returns:
action dimension
"""
assert isinstance(mansion_attr, MansionAttribute)
return mansion_attr.NumberOfFloor * 2 + 2
def mansion_state_preprocessing(mansion_state):
"""Process mansion_state to make it usable for networks, convert it into a numpy array
Args:
mansion_state: namedtuple of mansion state,
defined in rlschool/liftsim/environment/mansion/utils.py
Returns:
the converted numpy array
"""
ele_features = list()
for ele_state in mansion_state.ElevatorStates:
ele_features.append(ele_state_preprocessing(ele_state))
max_floor = ele_state.MaximumFloor
target_floor_binaries_up = [0.0 for i in range(max_floor)]
target_floor_binaries_down = [0.0 for i in range(max_floor)]
for floor in mansion_state.RequiringUpwardFloors:
target_floor_binaries_up[floor - 1] = 1.0
for floor in mansion_state.RequiringDownwardFloors:
target_floor_binaries_down[floor - 1] = 1.0
target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down
idx = 0
man_features = list()
for idx in range(len(mansion_state.ElevatorStates)):
elevator_id_vec = discretize(idx + 1,
len(mansion_state.ElevatorStates), 1,
len(mansion_state.ElevatorStates))
idx_array = list(range(len(mansion_state.ElevatorStates)))
idx_array.remove(idx)
# random.shuffle(idx_array)
man_features.append(ele_features[idx])
for left_idx in idx_array:
man_features[idx] = man_features[idx] + ele_features[left_idx]
man_features[idx] = man_features[idx] + \
elevator_id_vec + target_floor_binaries
return np.asarray(man_features, dtype='float32')
def action_idx_to_action(action_idx, act_dim):
"""Convert action_inx to action
Args:
action_idx: the index needed to be converted
act_dim: action dimension
Returns:
the converted namedtuple
"""
assert isinstance(action_idx, int)
assert isinstance(act_dim, int)
realdim = act_dim - 2
if (action_idx == realdim):
return ElevatorAction(0, 1)
elif (action_idx == realdim + 1):
return ElevatorAction(-1, 1)
action = action_idx
if (action_idx < realdim / 2):
direction = 1
action += 1
else:
direction = -1
action -= int(realdim / 2)
action += 1
return [action, direction]
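# Worked example with act_dim = 22 (a 10-floor building): indices 0..9 map to
# [TargetFloor, DirectionIndicator] = [1..10, 1] (dispatch upward), indices 10..19 map
# to [1..10, -1] (dispatch downward), index 20 returns ElevatorAction(0, 1) (stop at
# once), and index 21 returns ElevatorAction(-1, 1) (keep the current dispatch target).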
def action_to_action_idx(action, act_dim):
"""Convert action to number according to act_dim.
Args:
action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py
act_dim: action dimension
Returns:
action_idx: the result index
"""
assert isinstance(action, ElevatorAction)
assert isinstance(act_dim, int)
realdim = act_dim - 2
if (action.TargetFloor == 0):
return realdim
elif (action.TargetFloor < 0):
return realdim + 1
action_idx = 0
if (action.DirectionIndicator < 0):
action_idx += int(realdim / 2)
action_idx += action.TargetFloor - 1
return action_idx